From 944c13be7e12f7c6f08c550ad4c42fd0ce851c6a Mon Sep 17 00:00:00 2001 From: Alexey Avramov Date: Tue, 21 May 2019 12:25:31 +0900 Subject: [PATCH] do not check zram by default --- nohang | 645 +++++++++++++++++++++-------------------------- nohang.conf | 76 +++--- trash/thanatolog | 2 +- 3 files changed, 332 insertions(+), 391 deletions(-) diff --git a/nohang b/nohang index 225f156..2853228 100755 --- a/nohang +++ b/nohang @@ -11,92 +11,6 @@ from sre_constants import error as invalid_re from signal import signal, SIGKILL, SIGTERM, SIGINT, SIGQUIT, SIGHUP -start_time = time() - - -help_mess = """usage: nohang [-h] [-v] [-t] [-p] [-c CONFIG] - -optional arguments: - -h, --help show this help message and exit - -v, --version print version - -t, --test print some tests - -p, --print-proc-table - print table of processes with their badness values - -c CONFIG, --config CONFIG - path to the config file, default values: - ./nohang.conf, /etc/nohang/nohang.conf""" - - -SC_CLK_TCK = os.sysconf(os.sysconf_names['SC_CLK_TCK']) - -SC_PAGESIZE = os.sysconf(os.sysconf_names['SC_PAGESIZE']) - -conf_err_mess = 'Invalid config. Exit.' - -sig_list = [SIGTERM, SIGINT, SIGQUIT, SIGHUP] - -sig_dict = { - SIGKILL: 'SIGKILL', - SIGINT: 'SIGINT', - SIGQUIT: 'SIGQUIT', - SIGHUP: 'SIGHUP', - SIGTERM: 'SIGTERM' -} - -self_pid = str(os.getpid()) - -self_uid = os.geteuid() - -if self_uid == 0: - root = True -else: - root = False - - -if os.path.exists('./nohang_notify_helper'): - notify_helper_path = './nohang_notify_helper' -else: - notify_helper_path = '/usr/sbin/nohang_notify_helper' - - -victim_dict = dict() - - - -victim_id = None -actions_time_dict = dict() -actions_time_dict['action_handled'] = [time(), victim_id] -# print(actions_time_dict) - - - -# will store corrective actions stat -stat_dict = dict() - - -separate_log = False # will be overwritten after parse config - - -def find_cgroup_indexes(): - """ Find cgroup-line positions in /proc/*/cgroup file. 
- """ - - cgroup_v1_index = None - cgroup_v2_index = None - - with open('/proc/self/cgroup') as f: - for index, line in enumerate(f): - if ':name=' in line: - cgroup_v1_index = index - if line.startswith('0::'): - cgroup_v2_index = index - - return cgroup_v1_index, cgroup_v2_index - - -cgroup_v1_index, cgroup_v2_index = find_cgroup_indexes() - - ########################################################################## # define functions @@ -115,7 +29,62 @@ def print_self_rss(): ''' +def get_swap_threshold_tuple(string): + # re (Num %, True) or (Num KiB, False) + """Returns KiB value if abs val was set in config, or tuple with %""" + # return tuple with abs and bool: (abs %, True) or (abs MiB, False) + + if string.endswith('%'): + valid = string_to_float_convert_test(string[:-1]) + if valid is None: + errprint('somewhere swap unit is not float_%') + exit(1) + + value = float(string[:-1].strip()) + if value < 0 or value > 100: + errprint('invalid value, must be from the range[0; 100] %') + exit(1) + + return value, True + + elif string.endswith('M'): + valid = string_to_float_convert_test(string[:-1]) + if valid is None: + errprint('somewhere swap unit is not float_M') + exit(1) + + value = float(string[:-1].strip()) * 1024 + if value < 0: + errprint('invalid unit in config (negative value)') + exit(1) + + return value, False + + else: + errprint( + 'Invalid config file. There are invalid units somewhere\nExit') + exit(1) + + +def find_cgroup_indexes(): + """ Find cgroup-line positions in /proc/*/cgroup file. 
+ """ + + cgroup_v1_index = cgroup_v2_index = None + + with open('/proc/self/cgroup') as f: + for index, line in enumerate(f): + if ':name=' in line: + cgroup_v1_index = index + if line.startswith('0::'): + cgroup_v2_index = index + + return cgroup_v1_index, cgroup_v2_index + + def pid_to_rss(pid): + """ + """ try: rss = int(rline1( '/proc/{}/statm'.format(pid)).split(' ')[1]) * SC_PAGESIZE @@ -129,6 +98,8 @@ def pid_to_rss(pid): def pid_to_vm_size(pid): + """ + """ try: vm_size = int(rline1( '/proc/{}/statm'.format(pid)).partition(' ')[0]) * SC_PAGESIZE @@ -141,12 +112,6 @@ def pid_to_vm_size(pid): return vm_size - - - - - - def signal_handler(signum, frame): """ """ @@ -193,13 +158,6 @@ def write_self_oom_score_adj(new_value): write('/proc/self/oom_score_adj', new_value) -self_oom_score_adj_min = '-600' -self_oom_score_adj_max = '-6' - - -write_self_oom_score_adj(self_oom_score_adj_min) - - def valid_re(reg_exp): """Validate regular expression. """ @@ -431,6 +389,8 @@ def pid_to_environ(pid): def pid_to_realpath(pid): + """ + """ try: return os.path.realpath('/proc/' + pid + '/exe') except FileNotFoundError: @@ -615,9 +575,6 @@ def pid_to_status_unicode(pid): return None -########################################################################## - - def uptime(): """ """ @@ -993,9 +950,6 @@ def get_pid_list(): return pid_list -pid_list = get_pid_list() - - def get_non_decimal_pids(): """ """ @@ -1362,23 +1316,14 @@ def find_victim_info(pid, victim_badness, name): return victim_info - - - - - - - - - - def implement_corrective_action(signal): """ Find victim with highest badness and send SIGTERM/SIGKILL """ + time0 = time() - - # выходим из фции, если для SIGTERM порога не превышено время min_delay_after_sigterm и спим в течение over_sleep + # выходим из фции, если для SIGTERM порога не превышено время + # min_delay_after_sigterm и спим в течение over_sleep if signal is SIGTERM: dt = time() - actions_time_dict['action_handled'][0] @@ -1397,13 +1342,12 @@ def 
implement_corrective_action(signal): else: print('min_delay_after_sigterm IS EXCEEDED, it is time to action') - - - """ - При заходе в фцию проверяем права на сигтерм. Права на сигкилл у нас всегда есть - (потому что идем дальше только после полн освободж памяти после смерти жертвы) + При заходе в фцию проверяем права на сигтерм. Права на сигкилл у нас + всегда есть + (потому что идем дальше только после полн освободж памяти после + смерти жертвы) actions_time_dict[action_handled] = time() actions_time_dict[veto] = True @@ -1414,7 +1358,6 @@ def implement_corrective_action(signal): """ - log(mem_info) pid, victim_badness, name = find_victim(print_proc_table) @@ -1425,10 +1368,8 @@ def implement_corrective_action(signal): victim_info = find_victim_info(pid, victim_badness, name) log(victim_info) - - - - # пороги могли превысиься за время поиска жертвы (поиск может занимать сотни миллисекунд) + # пороги могли превысиься за время поиска жертвы (поиск может занимать + # сотни миллисекунд) mem_available, swap_total, swap_free = check_mem_and_swap() ma_mib = int(mem_available) / 1024.0 @@ -1445,15 +1386,8 @@ def implement_corrective_action(signal): log('Hard threshold exceeded') signal = SIGKILL - - victim_id = get_victim_id(pid) - - - - - # kill the victim if it doesn't respond to SIGTERM В ТЕЧЕНИЕ # ЗАДАННОГО ВРЕМЕНИ @@ -1465,19 +1399,10 @@ def implement_corrective_action(signal): dt = time() - victim_dict[victim_id] if dt > max_post_sigterm_victim_lifetime: - print('max_post_sigterm_victim_lifetime exceeded: the victim will get SIGKILL') + print('max_post_sigterm_victim_lifetime exceeded: the ' + 'victim will get SIGKILL') signal = SIGKILL - - - - - - - - - - # matching with re to customize corrective actions soft_match = False @@ -1529,8 +1454,10 @@ def implement_corrective_action(signal): response_time = time() - time0 - # тут надо, как и при дефолтном действии, проверять существование жертвы, ее реакцию на действие, - # и время ее смерти в случае успеха, о 
обновление таймстемпов действия + # тут надо, как и при дефолтном действии, проверять существование + # жертвы, ее реакцию на действие, + # и время ее смерти в случае успеха, о обновление таймстемпов + # действия etc_info = 'Implement a corrective act' \ 'ion:\n Run the command: {}' \ @@ -1552,71 +1479,49 @@ def implement_corrective_action(signal): command.replace('$PID', pid).replace( '$NAME', pid_to_name(pid))) - - - - - - - else: # обычное действие через сигнал try: - os.kill(int(pid), signal) kill_timestamp = time() response_time = kill_timestamp - time0 - - - - - - while True: exe_exists = os.path.exists('/proc/{}/exe'.format(pid)) rss = pid_to_rss(pid) dt = time() - kill_timestamp log('Victim VmRSS: {} KiB'.format(rss)) if not exe_exists or rss == 0 or dt > 0.01: - #print(dt) + # print(dt) break sleep(0.001) if dt > 0.01: - log('Timer (value = 0.01 sec) expired; seems' \ + log('Timer (value = 0.01 sec) expired; seems' ' like the victim handles signal') - actions_time_dict['action_handled'] = [time(), get_victim_id(pid)] - + actions_time_dict['action_handled'] = [ + time(), get_victim_id(pid)] if victim_id not in victim_dict: # хз как надо. victim_dict.update({victim_id: time()}) - # log('actions_time_dict', actions_time_dict) # log('victim_dict', victim_dict) - - - else: log('Process exited (VmRSS = 0) in {} sec'.format( round(dt, 5))) - - - - - - if signal is SIGKILL or not exe_exists or rss == 0: while True: sleep(0.001) - rss = pid_to_rss(pid) # рсс не важен когда путь не существует. Проверяй просто существование пид. + # рсс не важен когда путь не существует. Проверяй + # просто существование пид. 
+ rss = pid_to_rss(pid) if rss is None: break t1 = time() @@ -1624,7 +1529,6 @@ def implement_corrective_action(signal): log('The victim died in {} sec'.format( round(kill_duration, 3))) - mem_available, swap_total, swap_free = check_mem_and_swap() ma_mib = int(mem_available) / 1024.0 @@ -1636,12 +1540,6 @@ def implement_corrective_action(signal): ) ) - - - - - - send_result = 'total response time: {} ms'.format( round(response_time * 1000)) @@ -1700,11 +1598,12 @@ def implement_corrective_action(signal): update_stat_dict_and_print(key) # тут надо поспать хорошенько. а может и счетчики поправить. - # херню несу. во-первых, внезапно может кто-то появиться c блльшим бэднес.. Далее надо минимизировать аутпут спам. + # херню несу. во-первых, внезапно может кто-то появиться c блльшим + # бэднес.. Далее надо минимизировать аутпут спам. sleep(over_sleep) - - # обновлять время не на каждый кил, а только на килл той жертвы, которая не отвечала на софт экшн. + # обновлять время не на каждый кил, а только на килл той жертвы, + # которая не отвечала на софт экшн. # Вывод: ко времени действия прилагать также виктим айди. 
print('##################################################################') @@ -1739,17 +1638,23 @@ def sleep_after_check_mem(): t_mem = mem_point / rate_mem t_swap = swap_point / rate_swap - t_zram = (mem_total * 0.8 - mem_used_zram) / rate_zram - if t_zram < 0: - t_zram = 0 + + if CHECK_ZRAM: + t_zram = (mem_total * 0.8 - mem_used_zram) / rate_zram + if t_zram < 0: + t_zram = 0 + t_mem_zram = t_mem + t_zram t_mem_swap = t_mem + t_swap - t_mem_zram = t_mem + t_zram - if t_mem_swap <= t_mem_zram: - t = t_mem_swap + if CHECK_ZRAM: + + if t_mem_swap <= t_mem_zram: + t = t_mem_swap + else: + t = t_mem_zram else: - t = t_mem_zram + t = t_mem_swap if t > max_sleep: t = max_sleep @@ -1841,6 +1746,83 @@ def calculate_percent(arg_key): ########################################################################## +start_time = time() + + +help_mess = """usage: nohang [-h] [-v] [-t] [-p] [-c CONFIG] + +optional arguments: + -h, --help show this help message and exit + -v, --version print version + -t, --test print some tests + -p, --print-proc-table + print table of processes with their badness values + -c CONFIG, --config CONFIG + path to the config file, default values: + ./nohang.conf, /etc/nohang/nohang.conf""" + + +SC_CLK_TCK = os.sysconf(os.sysconf_names['SC_CLK_TCK']) + +SC_PAGESIZE = os.sysconf(os.sysconf_names['SC_PAGESIZE']) + +conf_err_mess = 'Invalid config. Exit.' 
+ +sig_list = [SIGTERM, SIGINT, SIGQUIT, SIGHUP] + +sig_dict = { + SIGKILL: 'SIGKILL', + SIGINT: 'SIGINT', + SIGQUIT: 'SIGQUIT', + SIGHUP: 'SIGHUP', + SIGTERM: 'SIGTERM' +} + +self_pid = str(os.getpid()) + +self_uid = os.geteuid() + +if self_uid == 0: + root = True +else: + root = False + + +if os.path.exists('./nohang_notify_helper'): + notify_helper_path = './nohang_notify_helper' +else: + notify_helper_path = '/usr/sbin/nohang_notify_helper' + + +victim_dict = dict() + + +victim_id = None +actions_time_dict = dict() +actions_time_dict['action_handled'] = [time(), victim_id] +# print(actions_time_dict) + + +# will store corrective actions stat +stat_dict = dict() + + +separate_log = False # will be overwritten after parse config + + +cgroup_v1_index, cgroup_v2_index = find_cgroup_indexes() + + +self_oom_score_adj_min = '-600' +self_oom_score_adj_max = '-6' + + +write_self_oom_score_adj(self_oom_score_adj_min) + + +pid_list = get_pid_list() + + print_proc_table_flag = False if len(argv) == 1: @@ -1879,9 +1861,6 @@ else: exit(1) -########################################################################## - - # find mem_total # find positions of SwapFree and SwapTotal in /proc/meminfo @@ -1928,8 +1907,6 @@ except ValueError: detailed_rss = False # print('It is not Linux 4.5+') -########################################################################## - log('Config: ' + config) @@ -2167,6 +2144,8 @@ gui_low_memory_warnings = conf_parse_bool('gui_low_memory_warnings') gui_notifications = conf_parse_bool('gui_notifications') decrease_oom_score_adj = conf_parse_bool('decrease_oom_score_adj') ignore_psi = conf_parse_bool('ignore_psi') +ignore_zram = conf_parse_bool('ignore_zram') + (mem_min_sigterm_kb, mem_min_sigterm_mb, mem_min_sigterm_percent ) = calculate_percent('mem_min_sigterm') @@ -2559,43 +2538,6 @@ psi_support = os.path.exists(psi_path) # Get KiB levels if it's possible. 
-def get_swap_threshold_tuple(string): - # re (Num %, True) or (Num KiB, False) - """Returns KiB value if abs val was set in config, or tuple with %""" - # return tuple with abs and bool: (abs %, True) or (abs MiB, False) - - if string.endswith('%'): - valid = string_to_float_convert_test(string[:-1]) - if valid is None: - errprint('somewhere swap unit is not float_%') - exit(1) - - value = float(string[:-1].strip()) - if value < 0 or value > 100: - errprint('invalid value, must be from the range[0; 100] %') - exit(1) - - return value, True - - elif string.endswith('M'): - valid = string_to_float_convert_test(string[:-1]) - if valid is None: - errprint('somewhere swap unit is not float_M') - exit(1) - - value = float(string[:-1].strip()) * 1024 - if value < 0: - errprint('invalid unit in config (negative value)') - exit(1) - - return value, False - - else: - errprint( - 'Invalid config file. There are invalid units somewhere\nExit') - exit(1) - - swap_min_sigterm_tuple = get_swap_threshold_tuple(swap_min_sigterm) swap_min_sigkill_tuple = get_swap_threshold_tuple(swap_min_sigkill) swap_min_warnings_tuple = get_swap_threshold_tuple(swap_min_warnings) @@ -2732,14 +2674,9 @@ mlockall() # print_self_rss() -log('Monitoring has started!') - -stdout.flush() - -########################################################################## - psi_avg_string = '' # will be overwritten if PSI monitoring enabled +mem_used_zram = 0 if psi_support and not ignore_psi: psi_t0 = time() @@ -2760,58 +2697,26 @@ for i in sig_list: signal(i, signal_handler) +CHECK_PSI = False +if psi_support and not ignore_psi: + CHECK_PSI = True + + +CHECK_ZRAM = not ignore_zram + +log('Monitoring has started!') + +stdout.flush() + + +########################################################################## + + while True: - if psi_support and not ignore_psi: + # Q = time() - psi_avg_value = find_psi_metrics_value(psi_path, psi_metrics) - - if print_mem_check_results: - psi_avg_string = 'PSI avg value: {} 
| '.format( - str(psi_avg_value).rjust(6)) - - if psi_avg_value >= sigkill_psi_threshold: - sigkill_psi_exceeded = True - else: - sigkill_psi_exceeded = False - - if psi_avg_value >= sigterm_psi_threshold: - sigterm_psi_exceeded = True - else: - sigterm_psi_exceeded = False - - if time() - psi_t0 >= psi_post_action_delay: - psi_post_action_delay_exceeded = True - else: - psi_post_action_delay_exceeded = False - - if psi_debug: - log('sigterm_psi_exceeded: {}, sigkill_psi_exceeded: {}, ps' - 'i_post_action_delay_exceeded: {}'.format( - sigterm_psi_exceeded, - sigkill_psi_exceeded, - psi_post_action_delay_exceeded)) - - if sigkill_psi_exceeded and psi_post_action_delay_exceeded: - time0 = time() - mem_info = 'PSI avg value ({}) > sigkill_psi_thresh' \ - 'old ({})'.format( - psi_avg_value, sigkill_psi_threshold) - - implement_corrective_action(SIGKILL) - - psi_t0 = time() - continue - - if sigterm_psi_exceeded and psi_post_action_delay_exceeded: - time0 = time() - mem_info = 'PSI avg value ({}) > sigterm_psi_thre' \ - 'shold ({})'.format(psi_avg_value, sigterm_psi_threshold) - - implement_corrective_action(SIGTERM) - - psi_t0 = time() - continue + # FIND VALUES: mem, swap, zram, psi mem_available, swap_total, swap_free = check_mem_and_swap() @@ -2825,7 +2730,29 @@ while True: if swap_warn_is_percent: swap_min_warnings_kb = swap_total * swap_min_warnings_percent / 100.0 - mem_used_zram = check_zram() + if swap_total > swap_min_sigkill_kb: + swap_sigkill_pc = percent(swap_min_sigkill_kb / (swap_total + 0.1)) + else: + swap_sigkill_pc = '-' + + if swap_total > swap_min_sigterm_kb: + swap_sigterm_pc = percent(swap_min_sigterm_kb / (swap_total + 0.1)) + else: + swap_sigterm_pc = '-' + + if CHECK_ZRAM: + mem_used_zram = check_zram() + + if CHECK_PSI: + psi_avg_value = find_psi_metrics_value(psi_path, psi_metrics) + if time() - psi_t0 >= psi_post_action_delay: + psi_post_action_delay_exceeded = True + else: + psi_post_action_delay_exceeded = False + + if 
print_mem_check_results: + psi_avg_string = 'PSI avg value: {} | '.format( + str(psi_avg_value).rjust(6)) if print_mem_check_results: @@ -2894,20 +2821,12 @@ while True: ) ) - if swap_total > swap_min_sigkill_kb: - swap_sigkill_pc = percent(swap_min_sigkill_kb / (swap_total + 0.1)) - else: - swap_sigkill_pc = '-' + ########################################################################### - if swap_total > swap_min_sigterm_kb: - swap_sigterm_pc = percent(swap_min_sigterm_kb / (swap_total + 0.1)) - else: - swap_sigterm_pc = '-' + # CHECK HARD THRESHOLDS (SIGKILL LEVEL) - # MEM SWAP KILL if (mem_available <= mem_min_sigkill_kb and swap_free <= swap_min_sigkill_kb): - time0 = time() mem_info = 'Hard threshold exceeded\nMemory status that requ' \ 'ires corrective actions:' \ @@ -2924,33 +2843,47 @@ while True: swap_sigkill_pc) implement_corrective_action(SIGKILL) - psi_t0 = time() continue - # ZRAM KILL - if mem_used_zram >= zram_max_sigkill_kb: - time0 = time() + if CHECK_ZRAM: + if mem_used_zram >= zram_max_sigkill_kb: - mem_info = 'Hard threshold exceeded\nMemory status that requir' \ - 'es corrective actions:' \ - '\n MemUsedZram [{} MiB, {} %] >= zram_max_sig' \ - 'kill [{} MiB, {} %]'.format( - kib_to_mib(mem_used_zram), - percent(mem_used_zram / mem_total), - kib_to_mib(zram_max_sigkill_kb), - percent(zram_max_sigkill_kb / mem_total)) + mem_info = 'Hard threshold exceeded\nMemory status that requir' \ + 'es corrective actions:' \ + '\n MemUsedZram [{} MiB, {} %] >= zram_max_sig' \ + 'kill [{} MiB, {} %]'.format( + kib_to_mib(mem_used_zram), + percent(mem_used_zram / mem_total), + kib_to_mib(zram_max_sigkill_kb), + percent(zram_max_sigkill_kb / mem_total)) - implement_corrective_action(SIGKILL) + implement_corrective_action(SIGKILL) + psi_t0 = time() + continue - psi_t0 = time() - continue + if CHECK_PSI: + if psi_avg_value >= sigkill_psi_threshold: + sigkill_psi_exceeded = True + else: + sigkill_psi_exceeded = False - # MEM SWAP TERM - if mem_available <= 
mem_min_sigterm_kb and \ - swap_free <= swap_min_sigterm_kb: + if sigkill_psi_exceeded and psi_post_action_delay_exceeded: - time0 = time() + mem_info = 'PSI avg value ({}) > sigkill_psi_thresh' \ + 'old ({})'.format( + psi_avg_value, sigkill_psi_threshold) + + implement_corrective_action(SIGKILL) + psi_t0 = time() + continue + + ########################################################################### + + # CHECK SOFT THRESHOLDS (SIGTERM LEVEL) + + if (mem_available <= mem_min_sigterm_kb and + swap_free <= swap_min_sigterm_kb): mem_info = 'Soft threshold exceeded\nMemory status that requi' \ 'res corrective actions:' \ @@ -2967,34 +2900,54 @@ while True: swap_sigterm_pc) implement_corrective_action(SIGTERM) - psi_t0 = time() continue - # ZRAM TERM - if mem_used_zram >= zram_max_sigterm_kb: - time0 = time() + if CHECK_ZRAM: + if mem_used_zram >= zram_max_sigterm_kb: - mem_info = 'Soft threshold exceeded\nMemory status that requ' \ - 'ires corrective actions:' \ - '\n MemUsedZram [{} MiB, {} %] >= ' \ - 'zram_max_sigterm [{} M, {} %]'.format( - kib_to_mib(mem_used_zram), - percent(mem_used_zram / mem_total), - kib_to_mib(zram_max_sigterm_kb), - percent(zram_max_sigterm_kb / mem_total)) + mem_info = 'Soft threshold exceeded\nMemory status that require' \ + 's corrective actions:\n MemUsedZram [{} MiB, {} %] >= zra' \ + 'm_max_sigterm [{} M, {} %]'.format( + kib_to_mib(mem_used_zram), + percent(mem_used_zram / mem_total), + kib_to_mib(zram_max_sigterm_kb), + percent(zram_max_sigterm_kb / mem_total)) - implement_corrective_action(SIGTERM) + implement_corrective_action(SIGTERM) + psi_t0 = time() + continue - psi_t0 = time() - continue + if CHECK_PSI: + if psi_avg_value >= sigterm_psi_threshold: + sigterm_psi_exceeded = True + else: + sigterm_psi_exceeded = False + + if psi_debug: + log('sigterm_psi_exceeded: {}, sigkill_psi_exceeded: {}, ps' + 'i_post_action_delay_exceeded: {}'.format( + sigterm_psi_exceeded, + sigkill_psi_exceeded, + psi_post_action_delay_exceeded)) 
+ + if sigterm_psi_exceeded and psi_post_action_delay_exceeded: + + mem_info = 'PSI avg value ({}) > sigterm_psi_thre' \ + 'shold ({})'.format(psi_avg_value, sigterm_psi_threshold) + + implement_corrective_action(SIGTERM) + psi_t0 = time() + continue + + ########################################################################### - # LOW MEMORY WARNINGS if gui_low_memory_warnings: - if mem_available <= mem_min_warnings_kb and \ - swap_free <= swap_min_warnings_kb + 0.1 or \ - mem_used_zram >= zram_max_warnings_kb: + if (mem_available <= mem_min_warnings_kb and + swap_free <= swap_min_warnings_kb + 0.1 or + mem_used_zram >= zram_max_warnings_kb): + warn_time_delta = time() - warn_time_now warn_time_now = time() warn_timer += warn_time_delta @@ -3003,17 +2956,7 @@ while True: warn_timer = 0 + # x = time() - Q + # print(x * 1000) - - # SLEEP BETWEEN MEM CHECKS sleep_after_check_mem() - - - - - - - - - - diff --git a/nohang.conf b/nohang.conf index b0b5e23..b436dd4 100644 --- a/nohang.conf +++ b/nohang.conf @@ -1,34 +1,38 @@ This is nohang config file. Lines starting with #, tabs and spaces are comments. Lines starting with @ contain optional parameters. + All values are case sensitive. + Be careful: nohang doesn't forbid you to shoot yourself in the foot. The configuration includes the following sections: + 0. Common zram settings 1. Memory levels to respond to as an OOM threat 2. Response on PSI memory metrics 3. The frequency of checking the level of available memory (and CPU usage) 4. The prevention of killing innocent victims - 5. Impact on the badness of processes via matching their - - names, - - cgroups, - - cmdlines and - - UIDs - with regular expressions + 5. Impact on the badness of processes via matching their names, cgroups and + cmdlines with specified regular expressions 6. Customize corrective actions: the execution of a specific command instead of sending the SIGTERM signal 7. 
GUI notifications: - - OOM prevention results and - low memory warnings + - OOM prevention results 8. Output verbosity 9. Misc Just read the description of the parameters and edit the values. Please restart the program after editing the config. - Bool values are case sensitive. +############################################################################### -##################################################################### + 0. Common zram settings + + See https://www.kernel.org/doc/Documentation/blockdev/zram.txt + You may need to set `ignore_zram = False` if you have a big zram disksize. + +ignore_zram = False 1. Thresholds below which a signal should be sent to the victim @@ -57,9 +61,9 @@ swap_min_sigkill = 5 % numbers from the range [0; 90] %. zram_max_sigterm = 50 % -zram_max_sigkill = 55 % +zram_max_sigkill = 60 % -##################################################################### +############################################################################### 2. Response on PSI memory metrics (it needs Linux 4.20 and up) @@ -102,7 +106,7 @@ sigkill_psi_threshold = 90 psi_post_action_delay = 60 -##################################################################### +############################################################################### 3. The frequency of checking the amount of available memory (and CPU usage) @@ -124,7 +128,7 @@ psi_post_action_delay = 60 rate_mem = 4000 rate_swap = 1500 -rate_zram = 500 +rate_zram = 6000 See also https://github.com/rfjakob/earlyoom/issues/61 @@ -135,7 +139,7 @@ min_sleep = 0.1 over_sleep = 0.05 -##################################################################### +############################################################################### 4. The prevention of killing innocent victims @@ -144,7 +148,7 @@ over_sleep = 0.05 min_badness = 20 Valid values are non-negative floating-point numbers. - Min delay if a victim does not respond to SIGTERM in 10 ms. 
+ Min delay if a victim doesn't respond to SIGTERM in 10 ms. min_delay_after_sigterm = 3 @@ -157,7 +161,7 @@ decrease_oom_score_adj = False oom_score_adj_max = 0 -##################################################################### +############################################################################### 5. Impact on the badness of processes via matching their names, cmdlines or UIDs with regular expressions using re.search(). @@ -194,21 +198,15 @@ oom_score_adj_max = 0 A good option that allows fine adjustment. - Prefer electron-based apps and chromium tabs + Prefer chromium tabs and electron-based apps @CMDLINE_RE 200 /// --type=renderer - Prefer firefox tabs - @CMDLINE_RE 100 /// -greomni|-childID - - - @CMDLINE_RE -500 /// python - - - + Prefer firefox tabs (Web Content and WebExtensions) + @CMDLINE_RE 100 /// -appomni @CMDLINE_RE -200 /// ^/usr/lib/virtualbox - 5.3 Matching UIDs with RE patterns + 5.3 Matching eUIDs with RE patterns The most slow option @@ -232,10 +230,11 @@ oom_score_adj_max = 0 @ENVIRON_RE 100 /// USER=user - Note that you can control badness also via systemd units via OOMScoreAdjust, see - https://www.freedesktop.org/software/systemd/man/systemd.exec.html#OOMScoreAdjust= + Note that you can control badness also via systemd units via + OOMScoreAdjust, see + www.freedesktop.org/software/systemd/man/systemd.exec.html#OOMScoreAdjust= -##################################################################### +############################################################################### 6. Customize corrective actions. 
@@ -247,14 +246,15 @@ oom_score_adj_max = 0 @SOFT_ACTION_RE_NAME ^foo$ /// kill -SEGV $PID @SOFT_ACTION_RE_NAME ^bash$ /// kill -9 $PID - @SOFT_ACTION_RE_CGROUP_V1 ^/system.slice/ /// systemctl restart $SERVICE - @SOFT_ACTION_RE_CGROUP_V1 foo.service$ /// systemctl restart $SERVICE + @SOFT_ACTION_RE_CGROUP_V1 ^/system.slice/ /// systemctl restart $SERVICE + @SOFT_ACTION_RE_CGROUP_V1 foo.service$ /// systemctl restart $SERVICE $PID will be replaced by process PID. $NAME will be replaced by process name. - $SERVICE will be replaced by .service if it exists (overwise it will be relpaced by empty line). + $SERVICE will be replaced by .service if it exists (otherwise it will be + replaced by empty line) -##################################################################### +############################################################################### 7. GUI notifications: - OOM prevention results and @@ -289,7 +289,7 @@ min_time_between_warnings = 15 Ampersands (&) will be replaced with asterisks (*) in process names and in commands. -##################################################################### +############################################################################### 8. Verbosity @@ -303,7 +303,7 @@ print_config = False print_mem_check_results = False -min_mem_report_interval = 60 +min_mem_report_interval = 300 Print sleep periods between memory checks. Valid values are True and False. @@ -327,15 +327,13 @@ extra_table_info = cgroup_v1 print_victim_info = False - # print_victim_cmdline - -max_ancestry_depth = 1 +max_ancestry_depth = 10 separate_log = False psi_debug = False -##################################################################### +############################################################################### 9. 
Misc diff --git a/trash/thanatolog b/trash/thanatolog index 76c2599..6c3e46b 100755 --- a/trash/thanatolog +++ b/trash/thanatolog @@ -122,7 +122,7 @@ send_signal = SIGTERM # os.kill(int(pid), SIGCONT) -os.kill(int(pid), send_signal) +# os.kill(int(pid), send_signal) t0 = time()