diff --git a/nohang b/nohang index 4f1aa9f..1e2b5ab 100755 --- a/nohang +++ b/nohang @@ -815,11 +815,11 @@ def send_notify_warn(): send_notification(title, body) -def send_notify(signal, name, pid): +def send_notify(threshold, name, pid): """ Notificate about OOM Preventing. - signal: key for notify_sig_dict + threshold: key for notify_sig_dict name: str process name pid: str process pid """ @@ -831,7 +831,7 @@ def send_notify(signal, name, pid): title = 'Freeze prevention' body = '{} [{}] {}'.format( - notify_sig_dict[signal], + notify_sig_dict[threshold], pid, name.replace( # symbol '&' can break notifications in some themes, @@ -1041,6 +1041,7 @@ def find_victim(_print_proc_table): )[0] pid = pid_tuple_list[0] + victim_id = get_victim_id(pid) # Get maximum 'badness' value victim_badness = pid_tuple_list[1] @@ -1062,7 +1063,7 @@ def find_victim(_print_proc_table): ) ) - return pid, victim_badness, victim_name + return pid, victim_badness, victim_name, victim_id def find_victim_info(pid, victim_badness, name): @@ -1271,18 +1272,6 @@ def find_victim_info(pid, victim_badness, name): return victim_info - - - - - - - - - - - - def check_mem_swap_ex(): """ Check: is mem and swap threshold exceeded? @@ -1314,8 +1303,8 @@ def check_mem_swap_ex(): if (mem_available <= mem_min_sigkill_kb and swap_free <= swap_min_sigkill_kb): - mem_info = 'Hard threshold exceeded\nMemory status that requ' \ - 'ires corrective actions:' \ + mem_info = 'Memory status that requ' \ + 'ires corrective actions (hard threshold exceeded):' \ '\n MemAvailable [{} MiB, {} %] <= mem_min_sig' \ 'kill [{} MiB, {} %]\n SwapFree [{} MiB, {} %] <= swa' \ 'p_min_sigkill [{} MiB, {} %]'.format( @@ -1328,13 +1317,13 @@ def check_mem_swap_ex(): kib_to_mib(swap_min_sigkill_kb), swap_sigkill_pc) - return SIGKILL, mem_info + return SIGKILL, mem_info, mem_available, swap_min_sigkill_kb, swap_min_sigterm_kb, swap_free, swap_total if (mem_available <= mem_min_sigterm_kb and swap_free <= swap_min_sigterm_kb): - mem_info = 'Soft threshold exceeded\nMemory status that requi' \ - 'res corrective actions:' \ + mem_info = 'Memory status that requi' \ + 'res corrective actions (soft threshold exceeded):' \ '\n MemAvailable [{} MiB, {} %] <= mem_min_sig' \ 'term [{} MiB, {} %]\n SwapFree [{} MiB, {} %] <= swa' \ 'p_min_sigterm [{} MiB, {} %]'.format( @@ -1347,24 +1336,146 @@ def check_mem_swap_ex(): kib_to_mib(swap_min_sigterm_kb), swap_sigterm_pc) - return SIGTERM, mem_info + return SIGTERM, mem_info, mem_available, swap_min_sigkill_kb, swap_min_sigterm_kb, swap_free, swap_total - return None, None + if gui_low_memory_warnings: + + if (mem_available <= mem_min_warnings_kb and swap_free <= + swap_min_warnings_kb + 0.1): + return 'WARN', None, mem_available, swap_min_sigkill_kb, swap_min_sigterm_kb, swap_free, swap_total + + return None, None, mem_available, swap_min_sigkill_kb, swap_min_sigterm_kb, swap_free, swap_total +def check_zram_ex(): + """ + """ + mem_used_zram = check_zram() + + if mem_used_zram >= zram_max_sigkill_kb: + + mem_info = 'Memory status that requir' \ + 'es corrective actions (hard threshold exceeded):' \ + '\n MemUsedZram [{} MiB, {} %] >= zram_max_sig' \ + 'kill [{} MiB, {} %]'.format( + kib_to_mib(mem_used_zram), + percent(mem_used_zram / mem_total), + kib_to_mib(zram_max_sigkill_kb), + percent(zram_max_sigkill_kb / mem_total)) + + return SIGKILL, mem_info, mem_used_zram + + if mem_used_zram >= zram_max_sigterm_kb: + + mem_info = 'Memory status that require' \ + 's corrective actions (soft threshold exceeded):\n MemUsedZram [{} MiB, {} %] >= zra' \ + 'm_max_sigterm [{} M, {} %]'.format( + kib_to_mib(mem_used_zram), + percent(mem_used_zram / mem_total), + kib_to_mib(zram_max_sigterm_kb), + percent(zram_max_sigterm_kb / mem_total)) + + return SIGTERM, mem_info, mem_used_zram + + if gui_low_memory_warnings: + if mem_used_zram >= zram_max_warnings_kb: + return 'WARN', None, mem_used_zram + + return None, None, mem_used_zram +def check_psi_ex(psi_t0, psi_kill_exceeded_timer, psi_term_exceeded_timer, x0): + """ + """ + delta0 = time() - x0 + x0 = time() + psi_avg_value = find_psi_metrics_value(psi_path, psi_metrics) + # print(psi_avg_value) + psi_post_action_delay_timer = time() - psi_t0 + if psi_post_action_delay_timer >= psi_post_action_delay: + psi_post_action_delay_exceeded = True + else: + psi_post_action_delay_exceeded = False + if psi_avg_value >= sigkill_psi_threshold: + sigkill_psi_exceeded = True + psi_kill_exceeded_timer += delta0 + else: + sigkill_psi_exceeded = False + psi_kill_exceeded_timer = 0 + if psi_debug: + log('psi_post_action_delay_timer: {}'.format( + round(psi_post_action_delay_timer, 3))) + log('psi_post_action_delay_exceeded: {}\nsigkill_psi_exceeded' + ': {}\npsi_kill_exceeded_timer: {}'.format( + psi_post_action_delay_exceeded, + sigkill_psi_exceeded, + round(psi_kill_exceeded_timer, 1) + ) + ) + if (psi_kill_exceeded_timer >= psi_excess_duration and + psi_post_action_delay_exceeded): + mem_info = 'PSI avg ({}) > sigkill_psi_threshold ({})\n' \ + 'PSI avg exceeded psi_excess_duration (value' \ + ' = {} sec) for {} seconds'.format( + psi_avg_value, + sigkill_psi_threshold, + psi_excess_duration, + round(psi_kill_exceeded_timer, 1) + ) + # psi_t0 = time() # ВОТ ГДЕ ПРОБЛЕМА. Таймер надо сбрасывать именно после применения корректирующего действия. Именно ПОСЛЕ. А не здесь. + # Или после любого применения, или после успешного. + # Если жертва умерла в процессе поиска - сбрасываем. Если отправлен + # сигнал - сбрасываем. + + return SIGKILL, mem_info, psi_t0, psi_kill_exceeded_timer, psi_term_exceeded_timer, x0 + + if psi_avg_value >= sigterm_psi_threshold: + sigterm_psi_exceeded = True + psi_term_exceeded_timer += delta0 + else: + sigterm_psi_exceeded = False + psi_term_exceeded_timer = 0 + + if psi_debug: + + log('sigterm_psi_exceeded: {}\n' + 'psi_term_exceeded_timer: {}\n'.format( + sigterm_psi_exceeded, + round(psi_term_exceeded_timer, 1) + ) + ) + + if (psi_term_exceeded_timer >= psi_excess_duration and + psi_post_action_delay_exceeded): + + mem_info = 'PSI avg ({}) > sigterm_psi_threshold ({})\n' \ + 'PSI avg exceeded psi_excess_duration (value' \ + ' = {} sec) for {} seconds'.format( + psi_avg_value, + sigterm_psi_threshold, + psi_excess_duration, + round(psi_term_exceeded_timer, 1) + ) + + return SIGTERM, mem_info, psi_t0, psi_kill_exceeded_timer, psi_term_exceeded_timer, x0 + + if gui_low_memory_warnings: + + if psi_avg_value >= psi_avg_warnings: + return 'WARN', None, psi_t0, psi_kill_exceeded_timer, psi_term_exceeded_timer, x0 + + return None, None, psi_t0, psi_kill_exceeded_timer, psi_term_exceeded_timer, x0 def is_victim_alive(pid): @@ -1384,31 +1495,26 @@ def is_victim_alive(pid): return 0 - - - - - - - -def implement_corrective_action(signal, mem_info): +def implement_corrective_action( + threshold, + mem_info_list, + psi_t0, + psi_kill_exceeded_timer, + psi_term_exceeded_timer, + x0, psi_s, zram_s, zram_m, psi_m): """ Find victim with highest badness and send SIGTERM/SIGKILL """ - - # Ёбаная запутанная фция. Распутать всё нахуй. Выделить части в отдельн фции. # Разбить саму фцию на части. Сделать ее структуру простой и понятной. - - time0 = time() # начало корр действия. Для вычисл времени действия. - + time0 = time() # начало корр действия. Для вычисл времени действия. # выходим из фции, если для SIGTERM порога не превышено время # min_delay_after_sigterm и спим в течение over_sleep # если хард порог превышен - идем дальше. - if signal is SIGTERM: + if threshold is SIGTERM: dt = time() - actions_time_dict['action_handled'][0] @@ -1422,7 +1528,7 @@ def implement_corrective_action(signal, mem_info): sleep(over_sleep) - return None # время задержки между действиями не истекло + return psi_t0 # время задержки между действиями не истекло else: log('min_delay_after_sigterm IS EXCEEDED, it is time to action') @@ -1442,60 +1548,104 @@ def implement_corrective_action(signal, mem_info): """ - log(mem_info) + for i in mem_info_list: + log(i) # ищем жертву с ее бэднес. - pid, victim_badness, name = find_victim(print_proc_table) + pid, victim_badness, name, victim_id = find_victim(print_proc_table) # sleep(0.1) - new_signal, mem_info = check_mem_swap_ex() - #log(new_signal) - #log(mem_info) - if new_signal is None: + + + + log('Recheck memory levels...') + + + + # перепроверяем пороги: они могли измениться за время поиска жертвы + (masf_s, masf_m, mem_available, swap_min_sigkill_kb, swap_min_sigterm_kb, + swap_free, swap_total) = check_mem_swap_ex() + + + if CHECK_ZRAM: + zram_s, zram_m, mem_used_zram = check_zram_ex() + + if CHECK_PSI: + (psi_s, psi_m, psi_t0, psi_kill_exceeded_timer, + psi_term_exceeded_timer, x0) = check_psi_ex( + psi_t0,psi_kill_exceeded_timer,psi_term_exceeded_timer,x0) + + + + + + + + + + if masf_s is SIGKILL or zram_s is SIGKILL or psi_s is SIGKILL: + + new_threshold = SIGKILL + mem_info_list = [] + + if masf_s is SIGKILL or masf_s is SIGTERM: + mem_info_list.append(masf_m) + + if zram_s is SIGKILL or zram_s is SIGTERM: + mem_info_list.append(zram_m) + + if psi_s is SIGKILL or psi_s is SIGTERM: + mem_info_list.append(psi_m) + + elif masf_s is SIGTERM or zram_s is SIGTERM or psi_s is SIGTERM: + + new_threshold = SIGTERM + mem_info_list = [] + + if masf_s is SIGKILL or masf_s is SIGTERM: + mem_info_list.append(masf_m) + + if zram_s is SIGKILL or zram_s is SIGTERM: + mem_info_list.append(zram_m) + + if psi_s is SIGKILL or psi_s is SIGTERM: + mem_info_list.append(psi_m) + + else: log('Thresholds is not exceeded now') - return None + return psi_t0 - if new_signal is not signal: - log(mem_info) - signal = new_signal - #log(mem_info) + + + # печать порогов + for i in mem_info_list: + log(i) + + # может это излишне + if new_threshold is None or new_threshold == 'WARN': + log('Thresholds is not exceeded now') + return psi_t0 + + threshold = new_threshold if victim_badness >= min_badness: + psi_t0 = time() + if print_victim_info: victim_info = find_victim_info(pid, victim_badness, name) log(victim_info) - # пороги могли превысиься за время поиска жертвы (поиск может занимать - # сотни миллисекунд) - mem_available, swap_total, swap_free = check_mem_and_swap() - - ma_mib = int(mem_available) / 1024.0 - sf_mib = int(swap_free) / 1024.0 - log('Memory status before implementing a corrective act' - 'ion:\n MemAvailable' - ': {} MiB, SwapFree: {} MiB'.format( - round(ma_mib, 1), round(sf_mib, 1) - ) - ) - - if (mem_available <= mem_min_sigkill_kb and - swap_free <= swap_min_sigkill_kb): - log('Hard threshold exceeded') - signal = SIGKILL - - victim_id = get_victim_id(pid) - # kill the victim if it doesn't respond to SIGTERM В ТЕЧЕНИЕ # ЗАДАННОГО ВРЕМЕНИ # переопределяем сигнал для старых жертв - if signal is SIGTERM: + if threshold is SIGTERM: if victim_id in victim_dict: @@ -1504,12 +1654,12 @@ def implement_corrective_action(signal, mem_info): if dt > max_post_sigterm_victim_lifetime: print('max_post_sigterm_victim_lifetime exceeded: the ' 'victim will get SIGKILL') - signal = SIGKILL + threshold = SIGKILL # matching with re to customize corrective actions soft_match = False - if soft_actions and signal is SIGTERM: + if soft_actions and threshold is SIGTERM: name = pid_to_name(pid) cgroup_v1 = pid_to_cgroup_v1(pid) service = '' @@ -1530,9 +1680,7 @@ def implement_corrective_action(signal, mem_info): soft_match = True break - - - if soft_match: # ПЕРЕОПРЕДЕЛЕНИЕ МЯГКОГО КОРРЕКТИРУЮЩЕГО ДЕЙСТВИЯ + if soft_match: # ПЕРЕОПРЕДЕЛЕНИЕ МЯГКОГО КОРРЕКТИРУЮЩЕГО ДЕЙСТВИЯ # todo: make new func m = check_mem_and_swap() @@ -1588,16 +1736,11 @@ def implement_corrective_action(signal, mem_info): # обычное действие через сигнал - - # вот тут поработать. Тут ебаный цикл. Нахуй его. + try: # вот тут засрано, в блоке try должно быть только kill(), остальное ниже за пределами - - - try: - - os.kill(int(pid), signal) + os.kill(int(pid), threshold) kill_timestamp = time() response_time = kill_timestamp - time0 @@ -1625,10 +1768,7 @@ def implement_corrective_action(signal, mem_info): log('Process exited (VmRSS = 0) in {} sec'.format( round(dt, 5))) - - - - if signal is SIGKILL or victim_alive == 2: + if threshold is SIGKILL or victim_alive == 2: # жертва умирает от SIGKILL. Дожидаемся ее полной смерти. while True: @@ -1641,8 +1781,7 @@ def implement_corrective_action(signal, mem_info): log('The victim died in {} sec'.format( round(kill_duration, 3))) - - + """ mem_available, swap_total, swap_free = check_mem_and_swap() ma_mib = int(mem_available) / 1024.0 @@ -1653,17 +1792,18 @@ def implement_corrective_action(signal, mem_info): round(ma_mib, 1), round(sf_mib, 1) ) ) + """ send_result = 'total response time: {} ms'.format( round(response_time * 1000)) preventing_oom_message = 'Implement a corrective action:' \ '\n Send {} to the victim; {}'.format( - sig_dict[signal], send_result) + sig_dict[threshold], send_result) - key = 'Send {} to {}'.format(sig_dict[signal], name) + key = 'Send {} to {}'.format(sig_dict[threshold], name) - if signal is SIGKILL and post_kill_exe != '': + if threshold is SIGKILL and post_kill_exe != '': cmd = post_kill_exe.replace('$PID', pid).replace( '$NAME', pid_to_name(pid)) @@ -1673,7 +1813,7 @@ def implement_corrective_action(signal, mem_info): exe(cmd) if gui_notifications: - send_notify(signal, name, pid) + send_notify(threshold, name, pid) except FileNotFoundError: response_time = time() - time0 @@ -1696,10 +1836,11 @@ def implement_corrective_action(signal, mem_info): update_stat_dict_and_print(key) - # нехуй делать, бэднес жертвы слишком мал else: + # может эту часть наверх отправить через if + response_time = time() - time0 victim_badness_is_too_small = 'victim badness {} < min_b' \ 'adness {}; nothing to do; response time: {} ms'.format( @@ -1724,21 +1865,13 @@ def implement_corrective_action(signal, mem_info): print('##################################################################') + sleep(over_sleep) # Спать если бэднес жертвы мал + # Что делать с psi_t0 если у мертвы мал бэднес? Ничего, потому что кор действия не было. + # демон может жрать 10% цпу при этом. Можно отдельн парам ввести. А можно + # не вводить. кек - - - - - - - - - - - - - + return psi_t0 def sleep_after_check_mem(): @@ -1802,8 +1935,8 @@ def sleep_after_check_mem(): log( 'Sleep {} sec (t_mem={}, t_swap={}{})'.format( - round(t, 2),round(t_mem, 2),round(t_swap, 2), z) - ) + round(t, 2), round(t_mem, 2), round(t_swap, 2), z) + ) try: stdout.flush() @@ -1874,27 +2007,9 @@ def calculate_percent(arg_key): return mem_min_kb, mem_min_mb, mem_min_percent - - - - - - - - - - - - - - - - ########################################################################## - - victim_dict = dict() victim_id = None actions_time_dict = dict() @@ -1902,31 +2017,6 @@ actions_time_dict['action_handled'] = [time(), victim_id] # print(actions_time_dict) - - - - - - - - - - - - - - - - - - - - - - - - - start_time = time() @@ -1974,17 +2064,6 @@ else: notify_helper_path = '/usr/sbin/nohang_notify_helper' - - - - - - - - - - - # will store corrective actions stat stat_dict = dict() @@ -2319,8 +2398,6 @@ print_victim_info = conf_parse_bool('print_victim_info') print_victim_cmdline = conf_parse_bool('print_victim_cmdline') - - print_config = conf_parse_bool('print_config') print_mem_check_results = conf_parse_bool('print_mem_check_results') print_sleep_periods = conf_parse_bool('print_sleep_periods') @@ -2459,6 +2536,20 @@ else: exit(1) +if 'psi_avg_warnings' in config_dict: + psi_avg_warnings = string_to_float_convert_test( + config_dict['psi_avg_warnings']) + if psi_avg_warnings is None: + errprint('Invalid psi_avg_warnings value, not float\nExit') + exit(1) + if psi_avg_warnings < 0 or psi_avg_warnings > 100: + errprint('psi_avg_warnings must be in the range [0; 100]\nExit') + exit(1) +else: + errprint('psi_avg_warnings not in config\nExit') + exit(1) + + if 'min_badness' in config_dict: min_badness = string_to_int_convert_test( config_dict['min_badness']) @@ -2876,9 +2967,6 @@ psi_avg_string = '' # will be overwritten if PSI monitoring enabled mem_used_zram = 0 -if psi_support and not ignore_psi: - psi_t0 = time() - if print_mem_check_results: @@ -2895,10 +2983,25 @@ for i in sig_list: signal(i, signal_handler) +x0 = time() +delta0 = 0 + + +threshold = None +mem_info = None + + +#print(x0, 'x0') + CHECK_PSI = False if psi_support and not ignore_psi: CHECK_PSI = True +psi_kill_exceeded_timer = 0 +psi_term_exceeded_timer = 0 +psi_t0 = time() +psi_s = zram_s = zram_m = psi_m = None + CHECK_ZRAM = not ignore_zram @@ -2907,58 +3010,36 @@ log('Monitoring has started!') stdout.flush() -psi_kill_exceeded_timer = psi_term_exceeded_timer = delta0 = 0 - -x0 = time() - ########################################################################## while True: - delta0 = time() - x0 - x0 = time() - - # FIND VALUES: mem, swap, zram, psi - - mem_available, swap_total, swap_free = check_mem_and_swap() - - # if swap_min_sigkill is set in percent - if swap_kill_is_percent: - swap_min_sigkill_kb = swap_total * swap_min_sigkill_percent / 100.0 - - if swap_term_is_percent: - swap_min_sigterm_kb = swap_total * swap_min_sigterm_percent / 100.0 - - if swap_warn_is_percent: - swap_min_warnings_kb = swap_total * swap_min_warnings_percent / 100.0 - - if swap_total > swap_min_sigkill_kb: - swap_sigkill_pc = percent(swap_min_sigkill_kb / (swap_total + 0.1)) - else: - swap_sigkill_pc = '-' - - if swap_total > swap_min_sigterm_kb: - swap_sigterm_pc = percent(swap_min_sigterm_kb / (swap_total + 0.1)) - else: - swap_sigterm_pc = '-' + (masf_s, masf_m, mem_available, swap_min_sigkill_kb, swap_min_sigterm_kb, + swap_free, swap_total) = check_mem_swap_ex() if CHECK_ZRAM: - mem_used_zram = check_zram() + zram_s, zram_m, mem_used_zram = check_zram_ex() if CHECK_PSI: - psi_avg_value = find_psi_metrics_value(psi_path, psi_metrics) - if time() - psi_t0 >= psi_post_action_delay: - psi_post_action_delay_exceeded = True - else: - psi_post_action_delay_exceeded = False + (psi_s, psi_m, psi_t0, psi_kill_exceeded_timer, + psi_term_exceeded_timer, x0) = check_psi_ex( + psi_t0,psi_kill_exceeded_timer,psi_term_exceeded_timer,x0) - if print_mem_check_results: - psi_avg_string = 'PSI avg: {} | '.format( - str(psi_avg_value).rjust(6)) if print_mem_check_results: + if CHECK_PSI: + psi_avg_value = find_psi_metrics_value(psi_path, psi_metrics) + if time() - psi_t0 >= psi_post_action_delay: + psi_post_action_delay_exceeded = True + else: + psi_post_action_delay_exceeded = False + + if print_mem_check_results: + psi_avg_string = 'PSI avg: {} | '.format( + str(psi_avg_value).rjust(6)) + wt1 = time() delta = (mem_available + swap_free) - new_mem @@ -3024,168 +3105,64 @@ while True: ) ) - ########################################################################### - # CHECK HARD THRESHOLDS (SIGKILL LEVEL) + if masf_s is SIGKILL or zram_s is SIGKILL or psi_s is SIGKILL: - if (mem_available <= mem_min_sigkill_kb and - swap_free <= swap_min_sigkill_kb): + threshold = SIGKILL + mem_info_list = [] - mem_info = 'Hard threshold exceeded\nMemory status that requ' \ - 'ires corrective actions:' \ - '\n MemAvailable [{} MiB, {} %] <= mem_min_sig' \ - 'kill [{} MiB, {} %]\n SwapFree [{} MiB, {} %] <= swa' \ - 'p_min_sigkill [{} MiB, {} %]'.format( - kib_to_mib(mem_available), - percent(mem_available / mem_total), - kib_to_mib(mem_min_sigkill_kb), - percent(mem_min_sigkill_kb / mem_total), - kib_to_mib(swap_free), - percent(swap_free / (swap_total + 0.1)), - kib_to_mib(swap_min_sigkill_kb), - swap_sigkill_pc) + if masf_m is not None: + mem_info_list.append(masf_m) - implement_corrective_action(SIGKILL, mem_info) - psi_t0 = time() + if zram_m is not None: + mem_info_list.append(zram_m) + + if psi_m is not None: + mem_info_list.append(psi_m) + + psi_t0 = implement_corrective_action( + threshold, + mem_info_list, + psi_t0, + psi_kill_exceeded_timer, + psi_term_exceeded_timer, + x0, psi_s, zram_s, zram_m, psi_m) continue - if CHECK_ZRAM: - if mem_used_zram >= zram_max_sigkill_kb: + if masf_s is SIGTERM or zram_s is SIGTERM or psi_s is SIGTERM: - mem_info = 'Hard threshold exceeded\nMemory status that requir' \ - 'es corrective actions:' \ - '\n MemUsedZram [{} MiB, {} %] >= zram_max_sig' \ - 'kill [{} MiB, {} %]'.format( - kib_to_mib(mem_used_zram), - percent(mem_used_zram / mem_total), - kib_to_mib(zram_max_sigkill_kb), - percent(zram_max_sigkill_kb / mem_total)) + threshold = SIGTERM + mem_info_list = [] - implement_corrective_action(SIGKILL, mem_info) - psi_t0 = time() - continue + if masf_m is not None: + mem_info_list.append(masf_m) - if CHECK_PSI: + if zram_m is not None: + mem_info_list.append(zram_m) - if psi_avg_value >= sigkill_psi_threshold: - sigkill_psi_exceeded = True - psi_kill_exceeded_timer += delta0 - else: - sigkill_psi_exceeded = False - psi_kill_exceeded_timer = 0 + if psi_m is not None: + mem_info_list.append(psi_m) - if psi_debug: - - log('psi_post_action_delay_exceeded: {}\nsigkill_psi_exceeded' - ': {}\npsi_kill_exceeded_timer: {}'.format( - psi_post_action_delay_exceeded, - sigkill_psi_exceeded, - round(psi_kill_exceeded_timer, 1) - ) - ) - - if (psi_kill_exceeded_timer >= psi_excess_duration and - psi_post_action_delay_exceeded): - - mem_info = 'PSI avg ({}) > sigkill_psi_threshold ({})\n' \ - 'PSI avg exceeded psi_excess_duration (value' \ - ' = {} sec) for {} seconds'.format( - psi_avg_value, - sigkill_psi_threshold, - psi_excess_duration, - round(psi_kill_exceeded_timer, 1) - ) - - implement_corrective_action(SIGKILL, mem_info) - psi_t0 = time() - continue - - ########################################################################### - - # CHECK SOFT THRESHOLDS (SIGTERM LEVEL) - - if (mem_available <= mem_min_sigterm_kb and - swap_free <= swap_min_sigterm_kb): - - mem_info = 'Soft threshold exceeded\nMemory status that requi' \ - 'res corrective actions:' \ - '\n MemAvailable [{} MiB, {} %] <= mem_min_sig' \ - 'term [{} MiB, {} %]\n SwapFree [{} MiB, {} %] <= swa' \ - 'p_min_sigterm [{} MiB, {} %]'.format( - kib_to_mib(mem_available), - percent(mem_available / mem_total), - kib_to_mib(mem_min_sigterm_kb), - round(mem_min_sigterm_percent, 1), - kib_to_mib(swap_free), - percent(swap_free / (swap_total + 0.1)), - kib_to_mib(swap_min_sigterm_kb), - swap_sigterm_pc) - - implement_corrective_action(SIGTERM, mem_info) - psi_t0 = time() + psi_t0 = implement_corrective_action( + threshold, + mem_info_list, + psi_t0, + psi_kill_exceeded_timer, + psi_term_exceeded_timer, + x0, psi_s, zram_s, zram_m, psi_m) continue - if CHECK_ZRAM: - if mem_used_zram >= zram_max_sigterm_kb: - - mem_info = 'Soft threshold exceeded\nMemory status that require' \ - 's corrective actions:\n MemUsedZram [{} MiB, {} %] >= zra' \ - 'm_max_sigterm [{} M, {} %]'.format( - kib_to_mib(mem_used_zram), - percent(mem_used_zram / mem_total), - kib_to_mib(zram_max_sigterm_kb), - percent(zram_max_sigterm_kb / mem_total)) - - implement_corrective_action(SIGTERM, mem_info) - psi_t0 = time() - continue - - if CHECK_PSI: - if psi_avg_value >= sigterm_psi_threshold: - sigterm_psi_exceeded = True - psi_term_exceeded_timer += delta0 - else: - sigterm_psi_exceeded = False - psi_term_exceeded_timer = 0 - - if psi_debug: - - log('sigterm_psi_exceeded: {}\n' - 'psi_term_exceeded_timer: {}\n'.format( - sigterm_psi_exceeded, - round(psi_term_exceeded_timer, 1) - ) - ) - - if (psi_term_exceeded_timer >= psi_excess_duration and - psi_post_action_delay_exceeded): - - mem_info = 'PSI avg ({}) > sigterm_psi_threshold ({})\n' \ - 'PSI avg exceeded psi_excess_duration (value' \ - ' = {} sec) for {} seconds'.format( - psi_avg_value, - sigterm_psi_threshold, - psi_excess_duration, - round(psi_term_exceeded_timer, 1) - ) - - implement_corrective_action(SIGTERM, mem_info) - psi_t0 = time() - continue - - ########################################################################### - if gui_low_memory_warnings: - if (mem_available <= mem_min_warnings_kb and - swap_free <= swap_min_warnings_kb + 0.1 or - mem_used_zram >= zram_max_warnings_kb): + if masf_s == 'WARN' or zram_s == 'WARN' or psi_s == 'WARN': warn_time_delta = time() - warn_time_now warn_time_now = time() warn_timer += warn_time_delta if warn_timer > min_time_between_warnings: + send_notify_warn() + warn_timer = 0 sleep_after_check_mem() diff --git a/nohang.conf b/nohang.conf index ad3e0b9..a2ed25a 100644 --- a/nohang.conf +++ b/nohang.conf @@ -107,7 +107,7 @@ sigterm_psi_threshold = 60 sigkill_psi_threshold = 90 >= 0, float -psi_excess_duration = 30 +psi_excess_duration = 40 psi_post_action_delay = 20 @@ -289,6 +289,8 @@ swap_min_warnings = 50 % zram_max_warnings = 40 % +psi_avg_warnings = 60 + Valid values are floating-point numbers from the range [1; 300]. min_time_between_warnings = 15 @@ -336,7 +338,7 @@ print_victim_info = True print_victim_cmdline = False -max_ancestry_depth = 1 +max_ancestry_depth = 5 separate_log = False diff --git a/trash/psi_dummy b/trash/psi_dummy index 60b9136..f490e2e 100644 --- a/trash/psi_dummy +++ b/trash/psi_dummy @@ -1,2 +1,2 @@ -some avg10=29.70 avg60=51.59 avg300=22.92 total=195239452 -full avg10=28.82 avg60=49.77 avg300=21.83 total=182504463 +some avg10=56.70 avg60=51.59 avg300=22.92 total=195239452 +full avg10=28.82 avg60=49.77 avg300=21.83 total=182504463