diff --git a/README.md b/README.md index 894d352..d4bba3c 100644 --- a/README.md +++ b/README.md @@ -215,12 +215,13 @@ Please create [issues](https://github.com/hakavlad/nohang/issues). Use cases, fe - [x] Improve output: - [x] Display `oom_score`, `oom_score_adj`, `Ancestry`, `EUID`, `State`, `VmSize`, `RssAnon`, `RssFile`, `RssShmem`, `Realpath`, `Cmdline` and `Lifetime` of the victim in corrective action reports - [x] Print statistics on corrective actions after each corrective action - - [ ] Add memory report interval - - [ ] Add delta memory info (the rate of change of available memory) + - [x] Add memory report interval + - [x] Add delta memory info (the rate of change of available memory) - [x] Improve poll rate algorithm - [x] Add `max_post_sigterm_victim_lifetime` option: send SIGKILL to the victim if it doesn't respond to SIGTERM for a certain time - [x] Improve victim search algorithm (do it ~30% faster) ([rfjakob/earlyoom#114](https://github.com/rfjakob/earlyoom/issues/114)) - [x] Improve limiting `oom_score_adj`: now it can works with UID != 0 + - [x] Fix process crash before performing corrective actions when Python 3.3 or Python 3.4 is used to interpret nohang - [x] Improve GUI warnings: - [x] Find env without run `ps` - [x] Handle all timeouts when notify-send starts diff --git a/nohang b/nohang index 656076a..a94226b 100755 --- a/nohang +++ b/nohang @@ -23,10 +23,8 @@ optional arguments: SC_CLK_TCK = os.sysconf(os.sysconf_names['SC_CLK_TCK']) - conf_err_mess = 'Invalid config. Exit.' 
- sig_dict = {SIGKILL: 'SIGKILL', SIGTERM: 'SIGTERM'} @@ -41,12 +39,6 @@ else: wait_time = 10 -# todo: make config option -max_sleep_time = 3 - -# todo: make config option -min_sleep_time = 0.1 - notify_helper_path = '/usr/sbin/nohang_notify_helper' psi_path = '/proc/pressure/memory' @@ -55,21 +47,8 @@ psi_support = os.path.exists(psi_path) HR = '' -# todo: make config option -print_total_stat = True - -print_proc_table = False - -min_mem_report_interval = 5 - -post_kill_exe = '' - victim_dict = dict() -max_ancestry_depth = 1 - -max_post_sigterm_victim_lifetime = 9 - ########################################################################## @@ -96,7 +75,6 @@ def pid_to_starttime(pid): def get_victim_id(pid): - # todo: handle UnicodeDecodeError return pid + '-' + rline1( '/proc/' + pid + '/stat').rpartition(')')[2].split(' ')[20] @@ -577,7 +555,9 @@ def sleep_after_send_signal(signal): sleep(min_delay_after_sigkill) else: if print_sleep_periods: - print(' sleep', min_delay_after_sigterm) + print( + 'Sleep {} sec after implementing a corrective action'.format( + min_delay_after_sigterm)) sleep(min_delay_after_sigterm) @@ -665,8 +645,9 @@ def find_victim(): pid_badness_list = [] if print_proc_table: + print('===============================================================================') print(' PID badness Name eUID cmdline') - print('------- ------- --------------- ---------- -------') + print('------- ------- --------------- ---------- ---------------------------------') for pid in pid_list: @@ -683,7 +664,6 @@ def find_victim(): pid_to_cmdline(pid)) ) - pid_badness_list.append((pid, badness)) # Make list of (pid, badness) tuples, sorted by 'badness' values @@ -700,12 +680,15 @@ def find_victim(): victim_badness = pid_tuple_list[1] victim_name = pid_to_name(pid) + if print_proc_table: + print('===============================================================================') + print( - '\nWorst process (PID: {}, Name: {}, badness: {}) found in {} ms'.format( + 
'\nWorst process found in {} ms: PID: {}, Name: {}, badness: {}'.format( + round((time() - ft1) * 1000), pid, victim_name, - victim_badness, - round((time() - ft1) * 1000) + victim_badness ) ) @@ -1081,7 +1064,7 @@ def sleep_after_check_mem(): if print_sleep_periods: print( - 'Sleep time: {} sec; (t_mem={}, t_swap={}, t_zram={})'.format( + 'Sleep {} sec (t_mem={}, t_swap={}, t_zram={})'.format( round(t, 2), round(t_mem, 2), round(t_swap, 2), @@ -1538,6 +1521,94 @@ else: exit(1) +if 'max_ancestry_depth' in config_dict: + max_ancestry_depth = string_to_int_convert_test( + config_dict['max_ancestry_depth']) + if max_ancestry_depth is None: + errprint('Invalid max_ancestry_depth value, not integer\nExit') + exit(1) + if max_ancestry_depth < 1: + errprint('Invalid max_ancestry_depth value\nExit') + exit(1) +else: + errprint('max_ancestry_depth is not in config\nExit') + exit(1) + + +if 'max_post_sigterm_victim_lifetime' in config_dict: + max_post_sigterm_victim_lifetime = string_to_float_convert_test( + config_dict['max_post_sigterm_victim_lifetime']) + if max_post_sigterm_victim_lifetime is None: + errprint('Invalid max_post_sigterm_victim_lifetime value, not float\nExit') + exit(1) + if max_post_sigterm_victim_lifetime < 0: + errprint('max_post_sigterm_victim_lifetime must be non-negative number\nExit') + exit(1) +else: + errprint('max_post_sigterm_victim_lifetime is not in config\nExit') + exit(1) + + +if 'post_kill_exe' in config_dict: + post_kill_exe = config_dict['post_kill_exe'] +else: + errprint('post_kill_exe is not in config\nExit') + exit(1) + + +print_total_stat = conf_parse_bool('print_total_stat') +print_proc_table = conf_parse_bool('print_proc_table') + + +if 'min_mem_report_interval' in config_dict: + min_mem_report_interval = string_to_float_convert_test( + config_dict['min_mem_report_interval']) + if min_mem_report_interval is None: + errprint('Invalid min_mem_report_interval value, not float\nExit') + exit(1) + if min_mem_report_interval < 0: + 
errprint('min_mem_report_interval must be non-negative number\nExit') + exit(1) +else: + errprint('min_mem_report_interval is not in config\nExit') + exit(1) + + +if 'max_sleep_time' in config_dict: + max_sleep_time = string_to_float_convert_test( + config_dict['max_sleep_time']) + if max_sleep_time is None: + errprint('Invalid max_sleep_time value, not float\nExit') + exit(1) + if max_sleep_time <= 0: + errprint('max_sleep_time must be positive number\nExit') + exit(1) +else: + errprint('max_sleep_time is not in config\nExit') + exit(1) + + +if 'min_sleep_time' in config_dict: + min_sleep_time = string_to_float_convert_test( + config_dict['min_sleep_time']) + if min_sleep_time is None: + errprint('Invalid min_sleep_time value, not float\nExit') + exit(1) + if min_sleep_time <= 0: + errprint('min_sleep_time must be positive number\nExit') + exit(1) +else: + errprint('min_sleep_time is not in config\nExit') + exit(1) + + +if max_sleep_time < min_sleep_time: + errprint( + 'min_sleep_time value must not exceed max_sleep_time value.\nExit' + ) + exit(1) + + ########################################################################## @@ -1890,8 +1961,6 @@ while True: kib_to_mib(swap_min_sigkill_kb), swap_sigkill_pc) - print(mem_info) - implement_corrective_action(SIGKILL) psi_t0 = time() continue @@ -1909,8 +1978,6 @@ while True: kib_to_mib(zram_max_sigkill_kb), percent(zram_max_sigkill_kb / mem_total)) - print(mem_info) - implement_corrective_action(SIGKILL) psi_t0 = time() continue @@ -1956,8 +2023,6 @@ while True: kib_to_mib(zram_max_sigterm_kb), percent(zram_max_sigterm_kb / mem_total)) - print(mem_info) - implement_corrective_action(SIGTERM) psi_t0 = time() diff --git a/nohang.conf b/nohang.conf index bcbd6a7..c1e86e2 100644 --- a/nohang.conf +++ b/nohang.conf @@ -99,6 +99,18 @@ rate_zram = 0.5 See also https://github.com/rfjakob/earlyoom/issues/61 + + Maximum sleep time between memory checks. + Positive number. 
+ +max_sleep_time = 3 + + Minimum sleep time between memory checks. + Positive number not exceeding max_sleep_time. + +min_sleep_time = 0.1 + + ##################################################################### 3. The prevention of killing innocent victims @@ -312,8 +324,48 @@ print_config = False print_mem_check_results = False + Minimum interval between memory status reports. + 0 - print every memory check. + Non-negative number. + +min_mem_report_interval = 10 + Print sleep periods between memory checks. Valid values are True and False. print_sleep_periods = False + Print total statistics on corrective actions since nohang + startup after each corrective action. + +print_total_stat = True + + Print the process table before each corrective action. + +print_proc_table = False + + + Maximum depth of the displayed ancestry. By default (1) + only the parent (PPID) is shown. + Positive integer. + +max_ancestry_depth = 3 + + +##################################################################### + + 8. Misc + + The victim may not respond to SIGTERM. + max_post_sigterm_victim_lifetime is the time after which + the victim will be sent SIGKILL. + Non-negative number. + +max_post_sigterm_victim_lifetime = 9 + + Execute an arbitrary command after SIGKILL. + Empty string - execute nothing. + Arbitrary string. + +post_kill_exe = +