diff --git a/README.md b/README.md index 9f9ac24..5a7d6f9 100644 --- a/README.md +++ b/README.md @@ -193,7 +193,7 @@ Please create [issues](https://github.com/hakavlad/nohang/issues). Use cases, fe - Fix: replace `re.fullmatch()` by `re.search()` - Validation RE patterns at startup - Improve output: - - Display `oom_score`, `oom_score_adj`, `euid`, `state`, `VmSize`, `RssAnon`, `RssFile`, `RssShmem`, `realpath` and `cmdline` of the victim in corrective action reports + - Display `oom_score`, `oom_score_adj`, `PPID`, `EUID`, `State`, `VmSize`, `RssAnon`, `RssFile`, `RssShmem`, `realpath` and `cmdline` of the victim in corrective action reports - Print in terminal with colors - Print statistics on corrective actions after each corrective action - Improve poll rate algorithm diff --git a/nohang b/nohang index aef85b5..dde93ea 100755 --- a/nohang +++ b/nohang @@ -1,11 +1,11 @@ #!/usr/bin/env python3 """A daemon that prevents OOM in Linux systems.""" import os -import ctypes +from ctypes import CDLL from time import sleep, time from operator import itemgetter from sys import stdout -from signal import SIGKILL, SIGTERM, SIGSTOP, SIGCONT +from signal import SIGKILL, SIGTERM start_time = time() @@ -40,11 +40,6 @@ HR = '~' * 79 # todo: make config option print_total_stat = True - -stop_cont = False -stop_cont_warn = False - - ########################################################################## # define functions @@ -56,69 +51,26 @@ def mlockall(): MCL_FUTURE = 2 MCL_ONFAULT = 4 - libc = ctypes.CDLL('libc.so.6', use_errno=True) + libc = CDLL('libc.so.6', use_errno=True) - result = libc.mlockall(MCL_CURRENT|MCL_FUTURE|MCL_ONFAULT) + result = libc.mlockall( + MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT + ) if result != 0: - result = libc.mlockall(MCL_CURRENT|MCL_FUTURE) + result = libc.mlockall( + MCL_CURRENT | MCL_FUTURE + ) if result != 0: - print('Can not lock all memory') + print('Cannot lock all memory') else: - print('All memory locked with MCL_CURRENT|MCL_FUTURE') + print('All memory locked with MCL_CURRENT | MCL_FUTURE') else: - print('All memory locked with MCL_CURRENT|MCL_FUTURE|MCL_ONFAULT') + print('All memory locked with MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT') def pid_to_state(pid): return rline1('/proc/' + pid + '/stat').rpartition(')')[2][1] -def stop(): - #print() - #print('Stop running processes...') - t1 = time() - t2 = time() - stopped_list = [] - for pid in os.listdir('/proc')[::-1]: - # only directories whose names consist only of numbers, except /proc/1/ - if pid[0].isdecimal() is False or pid is '1' or pid == self_pid: - continue - try: - oom_score_r = int(rline1('/proc/' + pid + '/oom_score')) - if oom_score_r > 9: - uid_r = pid_to_uid(pid) - #print('PID: {}, State: {}, oom_score {}'.format(pid, pid_to_state(pid), oom_score_r)) - if uid_r != '0': - stopped_list.append(pid) - print('Send SIGSTOP to {}, {}, {}...'.format( - pid, pid_to_name(pid), pid_to_cmdline(pid)[:40])) - os.kill(int(pid), SIGSTOP) - t2 = time() - except FileNotFoundError: - continue - except ProcessLookupError: - continue - print('Stop time:', t2 - t1) - stdout.flush() - - return stopped_list - -def cont(stopped_list): - print() - print('Continue stopped processes...') - t1 = time() - if len(stopped_list) > 0: - for pid in stopped_list: - print('Send SIGCONT to', [pid], pid_to_name(pid)) - try: - os.kill(int(pid), SIGCONT) - except FileNotFoundError: - continue - except ProcessLookupError: - continue - t2 = time() - print('All cont time: ', t2 - t1) - - def update_stat_dict_and_print(key): @@ -404,9 +356,6 @@ def send_notify_warn(): (implement Low memory warnings) """ - if stop_cont_warn: - stopped_list = stop() - # find process with max badness fat_tuple = fattest() pid = fat_tuple[0] @@ -428,7 +377,7 @@ def send_notify_warn(): # title = 'Low memory: {}'.format(low_mem_percent) title = 'Low memory' - body = 'Hog: {} [{}]'.format( + body = 'Hog: {}, PID: {}'.format( name.replace( # symbol '&' can break notifications in some themes, # therefore it is replaced by '*' @@ -443,9 +392,6 @@ def send_notify_warn(): # send notification to user that runs this nohang notify_send_wait(title, body) - if stop_cont_warn: - cont(stopped_list) - def send_notify(signal, name, pid): """ @@ -456,7 +402,7 @@ def send_notify(signal, name, pid): pid: str process pid """ title = 'Hang prevention' - body = '{} {} [{}]'.format( + body = '{} {}, PID: {}'.format( notify_sig_dict[signal], name.replace( # symbol '&' can break notifications in some themes, @@ -614,11 +560,6 @@ def find_victim_and_send_signal(signal): -> implement_corrective_action() """ - - if stop_cont: - stopped_list = stop() - - pid, victim_badness = fattest() name = pid_to_name(pid) @@ -633,8 +574,14 @@ def find_victim_and_send_signal(signal): with open('/proc/' + pid + '/status') as f: for n, line in enumerate(f): + + + if n is ppid_index: + ppid = line.split('\t')[1] + + if n is uid_index: - uid = line.split('\t')[1] + uid = line.split('\t')[2] continue if n is vm_size_index: @@ -691,16 +638,13 @@ def find_victim_and_send_signal(signal): for i in range(len(f_list)): if i is ppid_index: - ppid = f_list[i].split('\t')[2] - + ppid = f_list[i].split('\t')[1] for i in range(len(f_list)): if i is uid_index: uid = f_list[i].split('\t')[2] - - if i is vm_size_index: vm_size = kib_to_mib( int(f_list[i].split('\t')[1][:-3])) @@ -748,10 +692,10 @@ def find_victim_and_send_signal(signal): len_vm = len(str(vm_size)) - realpath = os.path.realpath('/proc/' + pid + '/exe') state = pid_to_state(pid) - + pname = pid_to_name(ppid.strip('\n ')) + # print([ppid], [pname]) if detailed_rss: @@ -760,7 +704,8 @@ def find_victim_and_send_signal(signal): '\n Name: \033[33m{}\033[0m' \ '\n State: \033[33m{}\033[0m' \ '\n PID: \033[33m{}\033[0m' \ - '\n UID: \033[33m{}\033[0m' \ + '\n PPID: \033[33m{}\033[0m (\033[33m{}\033[0m)' \ + '\n EUID: \033[33m{}\033[0m' \ '\n badness: \033[33m{}\033[0m, ' \ 'oom_score: \033[33m{}\033[0m, ' \ 'oom_score_adj: \033[33m{}\033[0m' \ @@ -775,6 +720,8 @@ def find_victim_and_send_signal(signal): name, state, pid, + ppid.strip('\n '), + pname, uid, victim_badness, oom_score, @@ -853,13 +800,8 @@ def find_victim_and_send_signal(signal): m = check_mem_and_swap() ma = round(int(m[0]) / 1024.0) sf = round(int(m[2]) / 1024.0) - print('\nMemory status before sending a signal:\nMemA' - 'v: {} MiB, SwFree: {} MiB'.format(ma, sf)) - - - if stop_cont: - os.kill(int(pid), SIGCONT) - + print('\nMemory status before sending a signal:\n MemAvailable' + ': {} MiB, SwapFree: {} MiB'.format(ma, sf)) os.kill(int(pid), signal) response_time = time() - time0 @@ -909,10 +851,6 @@ def find_victim_and_send_signal(signal): key = 'victim badness < min_badness' update_stat_dict_and_print(key) - - if stop_cont: - cont(stopped_list) - sleep_after_send_signal(signal)