fix victim info: display ppid

This commit is contained in:
Alexey Avramov 2019-02-11 17:59:25 +09:00
parent f3d2e4e099
commit a1e7b25ad0
2 changed files with 31 additions and 93 deletions

View File

@ -193,7 +193,7 @@ Please create [issues](https://github.com/hakavlad/nohang/issues). Use cases, fe
- Fix: replace `re.fullmatch()` by `re.search()` - Fix: replace `re.fullmatch()` by `re.search()`
- Validation RE patterns at startup - Validation RE patterns at startup
- Improve output: - Improve output:
- Display `oom_score`, `oom_score_adj`, `euid`, `state`, `VmSize`, `RssAnon`, `RssFile`, `RssShmem`, `realpath` and `cmdline` of the victim in corrective action reports - Display `oom_score`, `oom_score_adj`, `PPID`, `EUID`, `State`, `VmSize`, `RssAnon`, `RssFile`, `RssShmem`, `realpath` and `cmdline` of the victim in corrective action reports
- Print in terminal with colors - Print in terminal with colors
- Print statistics on corrective actions after each corrective action - Print statistics on corrective actions after each corrective action
- Improve poll rate algorithm - Improve poll rate algorithm

116
nohang
View File

@ -1,11 +1,11 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
"""A daemon that prevents OOM in Linux systems.""" """A daemon that prevents OOM in Linux systems."""
import os import os
import ctypes from ctypes import CDLL
from time import sleep, time from time import sleep, time
from operator import itemgetter from operator import itemgetter
from sys import stdout from sys import stdout
from signal import SIGKILL, SIGTERM, SIGSTOP, SIGCONT from signal import SIGKILL, SIGTERM
start_time = time() start_time = time()
@ -40,11 +40,6 @@ HR = '~' * 79
# todo: make config option # todo: make config option
print_total_stat = True print_total_stat = True
stop_cont = False
stop_cont_warn = False
########################################################################## ##########################################################################
# define functions # define functions
@ -56,11 +51,15 @@ def mlockall():
MCL_FUTURE = 2 MCL_FUTURE = 2
MCL_ONFAULT = 4 MCL_ONFAULT = 4
libc = ctypes.CDLL('libc.so.6', use_errno=True) libc = CDLL('libc.so.6', use_errno=True)
result = libc.mlockall(MCL_CURRENT|MCL_FUTURE|MCL_ONFAULT) result = libc.mlockall(
MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT
)
if result != 0: if result != 0:
result = libc.mlockall(MCL_CURRENT|MCL_FUTURE) result = libc.mlockall(
MCL_CURRENT | MCL_FUTURE
)
if result != 0: if result != 0:
print('Cannot lock all memory') print('Cannot lock all memory')
else: else:
@ -72,53 +71,6 @@ def mlockall():
def pid_to_state(pid): def pid_to_state(pid):
return rline1('/proc/' + pid + '/stat').rpartition(')')[2][1] return rline1('/proc/' + pid + '/stat').rpartition(')')[2][1]
def stop():
    """Send SIGSTOP to candidate processes and report which ones were stopped.

    Walks the entries of /proc in reverse listing order. An entry is
    skipped when its name does not start with a decimal digit, when it is
    PID 1, or when it equals ``self_pid`` (this daemon's own PID string).
    A remaining process is stopped only if its ``oom_score`` is greater
    than 9 and ``pid_to_uid`` does not report ``'0'`` for it.

    Processes that disappear while being inspected or signalled are
    silently skipped.

    Returns:
        list of str: PIDs to which SIGSTOP was successfully delivered,
        intended to be passed to ``cont()`` later.
    """
    t1 = time()
    # Initialised here so that 'Stop time' is ~0 when nothing was stopped.
    t2 = time()
    stopped_list = []

    for pid in os.listdir('/proc')[::-1]:

        # only directories whose names consist only of numbers, except /proc/1/
        # (strings are compared with ==, never with `is`)
        if not pid[0].isdecimal() or pid == '1' or pid == self_pid:
            continue

        try:
            oom_score_r = int(rline1('/proc/' + pid + '/oom_score'))
            if oom_score_r <= 9:
                continue

            if pid_to_uid(pid) == '0':
                continue

            print('Send SIGSTOP to {}, {}, {}...'.format(
                pid, pid_to_name(pid), pid_to_cmdline(pid)[:40]))
            os.kill(int(pid), SIGSTOP)

            # Record the PID only after the signal was really delivered,
            # so the returned list never names a process that was not
            # stopped.
            stopped_list.append(pid)
            t2 = time()

        except (FileNotFoundError, ProcessLookupError):
            # The process exited between listing /proc and this point.
            continue

    print('Stop time:', t2 - t1)
    stdout.flush()

    return stopped_list
def cont(stopped_list):
    """Send SIGCONT to every PID in ``stopped_list`` and print the time spent.

    Args:
        stopped_list: iterable of PID strings, e.g. the value returned
            by ``stop()``.

    A PID whose process no longer exists is skipped.
    """
    print()
    print('Continue stopped processes...')
    started = time()

    # Iterating an empty list is a no-op, so no explicit length check.
    for victim in stopped_list:
        print('Send SIGCONT to', [victim], pid_to_name(victim))
        try:
            os.kill(int(victim), SIGCONT)
        except (FileNotFoundError, ProcessLookupError):
            # The process is already gone; nothing to resume.
            pass

    finished = time()
    print('All cont time: ', finished - started)
def update_stat_dict_and_print(key): def update_stat_dict_and_print(key):
@ -404,9 +356,6 @@ def send_notify_warn():
(implement Low memory warnings) (implement Low memory warnings)
""" """
if stop_cont_warn:
stopped_list = stop()
# find process with max badness # find process with max badness
fat_tuple = fattest() fat_tuple = fattest()
pid = fat_tuple[0] pid = fat_tuple[0]
@ -428,7 +377,7 @@ def send_notify_warn():
# title = 'Low memory: {}'.format(low_mem_percent) # title = 'Low memory: {}'.format(low_mem_percent)
title = 'Low memory' title = 'Low memory'
body = 'Hog: <b>{}</b> [{}]'.format( body = 'Hog: <b>{}</b>, PID: {}'.format(
name.replace( name.replace(
# symbol '&' can break notifications in some themes, # symbol '&' can break notifications in some themes,
# therefore it is replaced by '*' # therefore it is replaced by '*'
@ -443,9 +392,6 @@ def send_notify_warn():
# send notification to user that runs this nohang # send notification to user that runs this nohang
notify_send_wait(title, body) notify_send_wait(title, body)
if stop_cont_warn:
cont(stopped_list)
def send_notify(signal, name, pid): def send_notify(signal, name, pid):
""" """
@ -456,7 +402,7 @@ def send_notify(signal, name, pid):
pid: str process pid pid: str process pid
""" """
title = 'Hang prevention' title = 'Hang prevention'
body = '<b>{} {}</b> [{}]'.format( body = '<b>{} {}</b>, PID: {}'.format(
notify_sig_dict[signal], notify_sig_dict[signal],
name.replace( name.replace(
# symbol '&' can break notifications in some themes, # symbol '&' can break notifications in some themes,
@ -614,11 +560,6 @@ def find_victim_and_send_signal(signal):
-> implement_corrective_action() -> implement_corrective_action()
""" """
if stop_cont:
stopped_list = stop()
pid, victim_badness = fattest() pid, victim_badness = fattest()
name = pid_to_name(pid) name = pid_to_name(pid)
@ -633,8 +574,14 @@ def find_victim_and_send_signal(signal):
with open('/proc/' + pid + '/status') as f: with open('/proc/' + pid + '/status') as f:
for n, line in enumerate(f): for n, line in enumerate(f):
if n is ppid_index:
ppid = line.split('\t')[1]
if n is uid_index: if n is uid_index:
uid = line.split('\t')[1] uid = line.split('\t')[2]
continue continue
if n is vm_size_index: if n is vm_size_index:
@ -691,16 +638,13 @@ def find_victim_and_send_signal(signal):
for i in range(len(f_list)): for i in range(len(f_list)):
if i is ppid_index: if i is ppid_index:
ppid = f_list[i].split('\t')[2] ppid = f_list[i].split('\t')[1]
for i in range(len(f_list)): for i in range(len(f_list)):
if i is uid_index: if i is uid_index:
uid = f_list[i].split('\t')[2] uid = f_list[i].split('\t')[2]
if i is vm_size_index: if i is vm_size_index:
vm_size = kib_to_mib( vm_size = kib_to_mib(
int(f_list[i].split('\t')[1][:-3])) int(f_list[i].split('\t')[1][:-3]))
@ -748,10 +692,10 @@ def find_victim_and_send_signal(signal):
len_vm = len(str(vm_size)) len_vm = len(str(vm_size))
realpath = os.path.realpath('/proc/' + pid + '/exe') realpath = os.path.realpath('/proc/' + pid + '/exe')
state = pid_to_state(pid) state = pid_to_state(pid)
pname = pid_to_name(ppid.strip('\n '))
# print([ppid], [pname])
if detailed_rss: if detailed_rss:
@ -760,7 +704,8 @@ def find_victim_and_send_signal(signal):
'\n Name: \033[33m{}\033[0m' \ '\n Name: \033[33m{}\033[0m' \
'\n State: \033[33m{}\033[0m' \ '\n State: \033[33m{}\033[0m' \
'\n PID: \033[33m{}\033[0m' \ '\n PID: \033[33m{}\033[0m' \
'\n UID: \033[33m{}\033[0m' \ '\n PPID: \033[33m{}\033[0m (\033[33m{}\033[0m)' \
'\n EUID: \033[33m{}\033[0m' \
'\n badness: \033[33m{}\033[0m, ' \ '\n badness: \033[33m{}\033[0m, ' \
'oom_score: \033[33m{}\033[0m, ' \ 'oom_score: \033[33m{}\033[0m, ' \
'oom_score_adj: \033[33m{}\033[0m' \ 'oom_score_adj: \033[33m{}\033[0m' \
@ -775,6 +720,8 @@ def find_victim_and_send_signal(signal):
name, name,
state, state,
pid, pid,
ppid.strip('\n '),
pname,
uid, uid,
victim_badness, victim_badness,
oom_score, oom_score,
@ -853,13 +800,8 @@ def find_victim_and_send_signal(signal):
m = check_mem_and_swap() m = check_mem_and_swap()
ma = round(int(m[0]) / 1024.0) ma = round(int(m[0]) / 1024.0)
sf = round(int(m[2]) / 1024.0) sf = round(int(m[2]) / 1024.0)
print('\nMemory status before sending a signal:\nMemA' print('\nMemory status before sending a signal:\n MemAvailable'
'v: {} MiB, SwFree: {} MiB'.format(ma, sf)) ': {} MiB, SwapFree: {} MiB'.format(ma, sf))
if stop_cont:
os.kill(int(pid), SIGCONT)
os.kill(int(pid), signal) os.kill(int(pid), signal)
response_time = time() - time0 response_time = time() - time0
@ -909,10 +851,6 @@ def find_victim_and_send_signal(signal):
key = 'victim badness < min_badness' key = 'victim badness < min_badness'
update_stat_dict_and_print(key) update_stat_dict_and_print(key)
if stop_cont:
cont(stopped_list)
sleep_after_send_signal(signal) sleep_after_send_signal(signal)