diff --git a/nohang b/nohang
index 4f1aa9f..1e2b5ab 100755
--- a/nohang
+++ b/nohang
@@ -815,11 +815,11 @@ def send_notify_warn():
send_notification(title, body)
-def send_notify(signal, name, pid):
+def send_notify(threshold, name, pid):
"""
Notificate about OOM Preventing.
- signal: key for notify_sig_dict
+ threshold: key for notify_sig_dict
name: str process name
pid: str process pid
"""
@@ -831,7 +831,7 @@ def send_notify(signal, name, pid):
title = 'Freeze prevention'
body = '{} [{}] {}'.format(
- notify_sig_dict[signal],
+ notify_sig_dict[threshold],
pid,
name.replace(
# symbol '&' can break notifications in some themes,
@@ -1041,6 +1041,7 @@ def find_victim(_print_proc_table):
)[0]
pid = pid_tuple_list[0]
+ victim_id = get_victim_id(pid)
# Get maximum 'badness' value
victim_badness = pid_tuple_list[1]
@@ -1062,7 +1063,7 @@ def find_victim(_print_proc_table):
)
)
- return pid, victim_badness, victim_name
+ return pid, victim_badness, victim_name, victim_id
def find_victim_info(pid, victim_badness, name):
@@ -1271,18 +1272,6 @@ def find_victim_info(pid, victim_badness, name):
return victim_info
-
-
-
-
-
-
-
-
-
-
-
-
def check_mem_swap_ex():
"""
Check: is mem and swap threshold exceeded?
@@ -1314,8 +1303,8 @@ def check_mem_swap_ex():
if (mem_available <= mem_min_sigkill_kb and
swap_free <= swap_min_sigkill_kb):
- mem_info = 'Hard threshold exceeded\nMemory status that requ' \
- 'ires corrective actions:' \
+ mem_info = 'Memory status that requ' \
+ 'ires corrective actions (hard threshold exceeded):' \
'\n MemAvailable [{} MiB, {} %] <= mem_min_sig' \
'kill [{} MiB, {} %]\n SwapFree [{} MiB, {} %] <= swa' \
'p_min_sigkill [{} MiB, {} %]'.format(
@@ -1328,13 +1317,13 @@ def check_mem_swap_ex():
kib_to_mib(swap_min_sigkill_kb),
swap_sigkill_pc)
- return SIGKILL, mem_info
+ return SIGKILL, mem_info, mem_available, swap_min_sigkill_kb, swap_min_sigterm_kb, swap_free, swap_total
if (mem_available <= mem_min_sigterm_kb and
swap_free <= swap_min_sigterm_kb):
- mem_info = 'Soft threshold exceeded\nMemory status that requi' \
- 'res corrective actions:' \
+ mem_info = 'Memory status that requi' \
+ 'res corrective actions (soft threshold exceeded):' \
'\n MemAvailable [{} MiB, {} %] <= mem_min_sig' \
'term [{} MiB, {} %]\n SwapFree [{} MiB, {} %] <= swa' \
'p_min_sigterm [{} MiB, {} %]'.format(
@@ -1347,24 +1336,146 @@ def check_mem_swap_ex():
kib_to_mib(swap_min_sigterm_kb),
swap_sigterm_pc)
- return SIGTERM, mem_info
+ return SIGTERM, mem_info, mem_available, swap_min_sigkill_kb, swap_min_sigterm_kb, swap_free, swap_total
- return None, None
+ if gui_low_memory_warnings:
+
+ if (mem_available <= mem_min_warnings_kb and swap_free <=
+ swap_min_warnings_kb + 0.1):
+ return 'WARN', None, mem_available, swap_min_sigkill_kb, swap_min_sigterm_kb, swap_free, swap_total
+
+ return None, None, mem_available, swap_min_sigkill_kb, swap_min_sigterm_kb, swap_free, swap_total
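+def check_zram_ex():
+ """Check zram usage against the sigkill, sigterm and warning limits.
+ Return (SIGKILL | SIGTERM | 'WARN' | None, message or None, mem_used_zram)."""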
+ mem_used_zram = check_zram()
+
+ if mem_used_zram >= zram_max_sigkill_kb:
+
+ mem_info = 'Memory status that requir' \
+ 'es corrective actions (hard threshold exceeded):' \
+ '\n MemUsedZram [{} MiB, {} %] >= zram_max_sig' \
+ 'kill [{} MiB, {} %]'.format(
+ kib_to_mib(mem_used_zram),
+ percent(mem_used_zram / mem_total),
+ kib_to_mib(zram_max_sigkill_kb),
+ percent(zram_max_sigkill_kb / mem_total))
+
+ return SIGKILL, mem_info, mem_used_zram
+
+ if mem_used_zram >= zram_max_sigterm_kb:
+
+ mem_info = 'Memory status that require' \
+ 's corrective actions (soft threshold exceeded):\n MemUsedZram [{} MiB, {} %] >= zra' \
+ 'm_max_sigterm [{} M, {} %]'.format(
+ kib_to_mib(mem_used_zram),
+ percent(mem_used_zram / mem_total),
+ kib_to_mib(zram_max_sigterm_kb),
+ percent(zram_max_sigterm_kb / mem_total))
+
+ return SIGTERM, mem_info, mem_used_zram
+
+ if gui_low_memory_warnings:
+ if mem_used_zram >= zram_max_warnings_kb:
+ return 'WARN', None, mem_used_zram
+
+ return None, None, mem_used_zram
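+def check_psi_ex(psi_t0, psi_kill_exceeded_timer, psi_term_exceeded_timer, x0):
+ """Check the PSI average against the sigkill, sigterm and warning thresholds.
+ Return (SIGKILL | SIGTERM | 'WARN' | None, message or None, psi_t0, psi_kill_exceeded_timer, psi_term_exceeded_timer, x0)."""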
+ delta0 = time() - x0
+ x0 = time()
+ psi_avg_value = find_psi_metrics_value(psi_path, psi_metrics)
+ # print(psi_avg_value)
+ psi_post_action_delay_timer = time() - psi_t0
+ if psi_post_action_delay_timer >= psi_post_action_delay:
+ psi_post_action_delay_exceeded = True
+ else:
+ psi_post_action_delay_exceeded = False
+ if psi_avg_value >= sigkill_psi_threshold:
+ sigkill_psi_exceeded = True
+ psi_kill_exceeded_timer += delta0
+ else:
+ sigkill_psi_exceeded = False
+ psi_kill_exceeded_timer = 0
+ if psi_debug:
+ log('psi_post_action_delay_timer: {}'.format(
+ round(psi_post_action_delay_timer, 3)))
+ log('psi_post_action_delay_exceeded: {}\nsigkill_psi_exceeded'
+ ': {}\npsi_kill_exceeded_timer: {}'.format(
+ psi_post_action_delay_exceeded,
+ sigkill_psi_exceeded,
+ round(psi_kill_exceeded_timer, 1)
+ )
+ )
+ if (psi_kill_exceeded_timer >= psi_excess_duration and
+ psi_post_action_delay_exceeded):
+ mem_info = 'PSI avg ({}) > sigkill_psi_threshold ({})\n' \
+ 'PSI avg exceeded psi_excess_duration (value' \
+ ' = {} sec) for {} seconds'.format(
+ psi_avg_value,
+ sigkill_psi_threshold,
+ psi_excess_duration,
+ round(psi_kill_exceeded_timer, 1)
+ )
+ # psi_t0 = time() # THIS IS THE PROBLEM: the timer must be reset only AFTER the corrective action has been applied, not here.
+ # Reset it either after any attempted action or only after a successful one.
+ # If the victim died while it was being searched for, reset it. If a
+ # signal was sent, reset it.
+
+ return SIGKILL, mem_info, psi_t0, psi_kill_exceeded_timer, psi_term_exceeded_timer, x0
+
+ if psi_avg_value >= sigterm_psi_threshold:
+ sigterm_psi_exceeded = True
+ psi_term_exceeded_timer += delta0
+ else:
+ sigterm_psi_exceeded = False
+ psi_term_exceeded_timer = 0
+
+ if psi_debug:
+
+ log('sigterm_psi_exceeded: {}\n'
+ 'psi_term_exceeded_timer: {}\n'.format(
+ sigterm_psi_exceeded,
+ round(psi_term_exceeded_timer, 1)
+ )
+ )
+
+ if (psi_term_exceeded_timer >= psi_excess_duration and
+ psi_post_action_delay_exceeded):
+
+ mem_info = 'PSI avg ({}) > sigterm_psi_threshold ({})\n' \
+ 'PSI avg exceeded psi_excess_duration (value' \
+ ' = {} sec) for {} seconds'.format(
+ psi_avg_value,
+ sigterm_psi_threshold,
+ psi_excess_duration,
+ round(psi_term_exceeded_timer, 1)
+ )
+
+ return SIGTERM, mem_info, psi_t0, psi_kill_exceeded_timer, psi_term_exceeded_timer, x0
+
+ if gui_low_memory_warnings:
+
+ if psi_avg_value >= psi_avg_warnings:
+ return 'WARN', None, psi_t0, psi_kill_exceeded_timer, psi_term_exceeded_timer, x0
+
+ return None, None, psi_t0, psi_kill_exceeded_timer, psi_term_exceeded_timer, x0
def is_victim_alive(pid):
@@ -1384,31 +1495,26 @@ def is_victim_alive(pid):
return 0
-
-
-
-
-
-
-
-def implement_corrective_action(signal, mem_info):
+def implement_corrective_action(
+ threshold,
+ mem_info_list,
+ psi_t0,
+ psi_kill_exceeded_timer,
+ psi_term_exceeded_timer,
+ x0, psi_s, zram_s, zram_m, psi_m):
"""
Find victim with highest badness and send SIGTERM/SIGKILL
"""
-
-
# Ёбаная запутанная фция. Распутать всё нахуй. Выделить части в отдельн фции.
# Разбить саму фцию на части. Сделать ее структуру простой и понятной.
-
- time0 = time() # начало корр действия. Для вычисл времени действия.
-
+ time0 = time() # начало корр действия. Для вычисл времени действия.
# выходим из фции, если для SIGTERM порога не превышено время
# min_delay_after_sigterm и спим в течение over_sleep
# если хард порог превышен - идем дальше.
- if signal is SIGTERM:
+ if threshold is SIGTERM:
dt = time() - actions_time_dict['action_handled'][0]
@@ -1422,7 +1528,7 @@ def implement_corrective_action(signal, mem_info):
sleep(over_sleep)
- return None # время задержки между действиями не истекло
+ return psi_t0 # время задержки между действиями не истекло
else:
log('min_delay_after_sigterm IS EXCEEDED, it is time to action')
@@ -1442,60 +1548,104 @@ def implement_corrective_action(signal, mem_info):
"""
- log(mem_info)
+ for i in mem_info_list:
+ log(i)
# ищем жертву с ее бэднес.
- pid, victim_badness, name = find_victim(print_proc_table)
+ pid, victim_badness, name, victim_id = find_victim(print_proc_table)
# sleep(0.1)
- new_signal, mem_info = check_mem_swap_ex()
- #log(new_signal)
- #log(mem_info)
- if new_signal is None:
+
+
+
+ log('Recheck memory levels...')
+
+
+
+ # re-check the thresholds: they may have changed while the victim was being searched for
+ (masf_s, masf_m, mem_available, swap_min_sigkill_kb, swap_min_sigterm_kb,
+ swap_free, swap_total) = check_mem_swap_ex()
+
+
+ if CHECK_ZRAM:
+ zram_s, zram_m, mem_used_zram = check_zram_ex()
+
+ if CHECK_PSI:
+ (psi_s, psi_m, psi_t0, psi_kill_exceeded_timer,
+ psi_term_exceeded_timer, x0) = check_psi_ex(
+ psi_t0,psi_kill_exceeded_timer,psi_term_exceeded_timer,x0)
+
+
+
+
+
+
+
+
+
+ if masf_s is SIGKILL or zram_s is SIGKILL or psi_s is SIGKILL:
+
+ new_threshold = SIGKILL
+ mem_info_list = []
+
+ if masf_s is SIGKILL or masf_s is SIGTERM:
+ mem_info_list.append(masf_m)
+
+ if zram_s is SIGKILL or zram_s is SIGTERM:
+ mem_info_list.append(zram_m)
+
+ if psi_s is SIGKILL or psi_s is SIGTERM:
+ mem_info_list.append(psi_m)
+
+ elif masf_s is SIGTERM or zram_s is SIGTERM or psi_s is SIGTERM:
+
+ new_threshold = SIGTERM
+ mem_info_list = []
+
+ if masf_s is SIGKILL or masf_s is SIGTERM:
+ mem_info_list.append(masf_m)
+
+ if zram_s is SIGKILL or zram_s is SIGTERM:
+ mem_info_list.append(zram_m)
+
+ if psi_s is SIGKILL or psi_s is SIGTERM:
+ mem_info_list.append(psi_m)
+
+ else:
log('Thresholds is not exceeded now')
- return None
+ return psi_t0
- if new_signal is not signal:
- log(mem_info)
- signal = new_signal
- #log(mem_info)
+
+
+ # log the threshold messages
+ for i in mem_info_list:
+ log(i)
+
+ # this check may be redundant
+ if new_threshold is None or new_threshold == 'WARN':
+ log('Thresholds is not exceeded now')
+ return psi_t0
+
+ threshold = new_threshold
if victim_badness >= min_badness:
+ psi_t0 = time()
+
if print_victim_info:
victim_info = find_victim_info(pid, victim_badness, name)
log(victim_info)
- # пороги могли превысиься за время поиска жертвы (поиск может занимать
- # сотни миллисекунд)
- mem_available, swap_total, swap_free = check_mem_and_swap()
-
- ma_mib = int(mem_available) / 1024.0
- sf_mib = int(swap_free) / 1024.0
- log('Memory status before implementing a corrective act'
- 'ion:\n MemAvailable'
- ': {} MiB, SwapFree: {} MiB'.format(
- round(ma_mib, 1), round(sf_mib, 1)
- )
- )
-
- if (mem_available <= mem_min_sigkill_kb and
- swap_free <= swap_min_sigkill_kb):
- log('Hard threshold exceeded')
- signal = SIGKILL
-
- victim_id = get_victim_id(pid)
-
# kill the victim if it doesn't respond to SIGTERM В ТЕЧЕНИЕ
# ЗАДАННОГО ВРЕМЕНИ
# переопределяем сигнал для старых жертв
- if signal is SIGTERM:
+ if threshold is SIGTERM:
if victim_id in victim_dict:
@@ -1504,12 +1654,12 @@ def implement_corrective_action(signal, mem_info):
if dt > max_post_sigterm_victim_lifetime:
print('max_post_sigterm_victim_lifetime exceeded: the '
'victim will get SIGKILL')
- signal = SIGKILL
+ threshold = SIGKILL
# matching with re to customize corrective actions
soft_match = False
- if soft_actions and signal is SIGTERM:
+ if soft_actions and threshold is SIGTERM:
name = pid_to_name(pid)
cgroup_v1 = pid_to_cgroup_v1(pid)
service = ''
@@ -1530,9 +1680,7 @@ def implement_corrective_action(signal, mem_info):
soft_match = True
break
-
-
- if soft_match: # ПЕРЕОПРЕДЕЛЕНИЕ МЯГКОГО КОРРЕКТИРУЮЩЕГО ДЕЙСТВИЯ
+ if soft_match: # ПЕРЕОПРЕДЕЛЕНИЕ МЯГКОГО КОРРЕКТИРУЮЩЕГО ДЕЙСТВИЯ
# todo: make new func
m = check_mem_and_swap()
@@ -1588,16 +1736,11 @@ def implement_corrective_action(signal, mem_info):
# обычное действие через сигнал
-
-
# вот тут поработать. Тут ебаный цикл. Нахуй его.
+ try: # вот тут засрано, в блоке try должно быть только kill(), остальное ниже за пределами
-
-
- try:
-
- os.kill(int(pid), signal)
+ os.kill(int(pid), threshold)
kill_timestamp = time()
response_time = kill_timestamp - time0
@@ -1625,10 +1768,7 @@ def implement_corrective_action(signal, mem_info):
log('Process exited (VmRSS = 0) in {} sec'.format(
round(dt, 5)))
-
-
-
- if signal is SIGKILL or victim_alive == 2:
+ if threshold is SIGKILL or victim_alive == 2:
# жертва умирает от SIGKILL. Дожидаемся ее полной смерти.
while True:
@@ -1641,8 +1781,7 @@ def implement_corrective_action(signal, mem_info):
log('The victim died in {} sec'.format(
round(kill_duration, 3)))
-
-
+ """
mem_available, swap_total, swap_free = check_mem_and_swap()
ma_mib = int(mem_available) / 1024.0
@@ -1653,17 +1792,18 @@ def implement_corrective_action(signal, mem_info):
round(ma_mib, 1), round(sf_mib, 1)
)
)
+ """
send_result = 'total response time: {} ms'.format(
round(response_time * 1000))
preventing_oom_message = 'Implement a corrective action:' \
'\n Send {} to the victim; {}'.format(
- sig_dict[signal], send_result)
+ sig_dict[threshold], send_result)
- key = 'Send {} to {}'.format(sig_dict[signal], name)
+ key = 'Send {} to {}'.format(sig_dict[threshold], name)
- if signal is SIGKILL and post_kill_exe != '':
+ if threshold is SIGKILL and post_kill_exe != '':
cmd = post_kill_exe.replace('$PID', pid).replace(
'$NAME', pid_to_name(pid))
@@ -1673,7 +1813,7 @@ def implement_corrective_action(signal, mem_info):
exe(cmd)
if gui_notifications:
- send_notify(signal, name, pid)
+ send_notify(threshold, name, pid)
except FileNotFoundError:
response_time = time() - time0
@@ -1696,10 +1836,11 @@ def implement_corrective_action(signal, mem_info):
update_stat_dict_and_print(key)
-
# нехуй делать, бэднес жертвы слишком мал
else:
+ # maybe move this part to the top of the function behind an if
+
response_time = time() - time0
victim_badness_is_too_small = 'victim badness {} < min_b' \
'adness {}; nothing to do; response time: {} ms'.format(
@@ -1724,21 +1865,13 @@ def implement_corrective_action(signal, mem_info):
print('##################################################################')
+ sleep(over_sleep) # Спать если бэднес жертвы мал
+ # What to do with psi_t0 if the victim's badness is too small? Nothing, because no corrective action was taken.
+ # The daemon may use about 10% CPU in this situation. A separate parameter could be added for it. Or it
+ # could be left as is.
-
-
-
-
-
-
-
-
-
-
-
-
-
+ return psi_t0
def sleep_after_check_mem():
@@ -1802,8 +1935,8 @@ def sleep_after_check_mem():
log(
'Sleep {} sec (t_mem={}, t_swap={}{})'.format(
- round(t, 2),round(t_mem, 2),round(t_swap, 2), z)
- )
+ round(t, 2), round(t_mem, 2), round(t_swap, 2), z)
+ )
try:
stdout.flush()
@@ -1874,27 +2007,9 @@ def calculate_percent(arg_key):
return mem_min_kb, mem_min_mb, mem_min_percent
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
##########################################################################
-
-
victim_dict = dict()
victim_id = None
actions_time_dict = dict()
@@ -1902,31 +2017,6 @@ actions_time_dict['action_handled'] = [time(), victim_id]
# print(actions_time_dict)
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
start_time = time()
@@ -1974,17 +2064,6 @@ else:
notify_helper_path = '/usr/sbin/nohang_notify_helper'
-
-
-
-
-
-
-
-
-
-
-
# will store corrective actions stat
stat_dict = dict()
@@ -2319,8 +2398,6 @@ print_victim_info = conf_parse_bool('print_victim_info')
print_victim_cmdline = conf_parse_bool('print_victim_cmdline')
-
-
print_config = conf_parse_bool('print_config')
print_mem_check_results = conf_parse_bool('print_mem_check_results')
print_sleep_periods = conf_parse_bool('print_sleep_periods')
@@ -2459,6 +2536,20 @@ else:
exit(1)
+if 'psi_avg_warnings' in config_dict:
+ psi_avg_warnings = string_to_float_convert_test(
+ config_dict['psi_avg_warnings'])
+ if psi_avg_warnings is None:
+ errprint('Invalid psi_avg_warnings value, not float\nExit')
+ exit(1)
+ if psi_avg_warnings < 0 or psi_avg_warnings > 100:
+ errprint('psi_avg_warnings must be in the range [0; 100]\nExit')
+ exit(1)
+else:
+ errprint('psi_avg_warnings not in config\nExit')
+ exit(1)
+
+
if 'min_badness' in config_dict:
min_badness = string_to_int_convert_test(
config_dict['min_badness'])
@@ -2876,9 +2967,6 @@ psi_avg_string = '' # will be overwritten if PSI monitoring enabled
mem_used_zram = 0
-if psi_support and not ignore_psi:
- psi_t0 = time()
-
if print_mem_check_results:
@@ -2895,10 +2983,25 @@ for i in sig_list:
signal(i, signal_handler)
+x0 = time()
+delta0 = 0
+
+
+threshold = None
+mem_info = None
+
+
+#print(x0, 'x0')
+
CHECK_PSI = False
if psi_support and not ignore_psi:
CHECK_PSI = True
+psi_kill_exceeded_timer = 0
+psi_term_exceeded_timer = 0
+psi_t0 = time()
+psi_s = zram_s = zram_m = psi_m = None
+
CHECK_ZRAM = not ignore_zram
@@ -2907,58 +3010,36 @@ log('Monitoring has started!')
stdout.flush()
-psi_kill_exceeded_timer = psi_term_exceeded_timer = delta0 = 0
-
-x0 = time()
-
##########################################################################
while True:
- delta0 = time() - x0
- x0 = time()
-
- # FIND VALUES: mem, swap, zram, psi
-
- mem_available, swap_total, swap_free = check_mem_and_swap()
-
- # if swap_min_sigkill is set in percent
- if swap_kill_is_percent:
- swap_min_sigkill_kb = swap_total * swap_min_sigkill_percent / 100.0
-
- if swap_term_is_percent:
- swap_min_sigterm_kb = swap_total * swap_min_sigterm_percent / 100.0
-
- if swap_warn_is_percent:
- swap_min_warnings_kb = swap_total * swap_min_warnings_percent / 100.0
-
- if swap_total > swap_min_sigkill_kb:
- swap_sigkill_pc = percent(swap_min_sigkill_kb / (swap_total + 0.1))
- else:
- swap_sigkill_pc = '-'
-
- if swap_total > swap_min_sigterm_kb:
- swap_sigterm_pc = percent(swap_min_sigterm_kb / (swap_total + 0.1))
- else:
- swap_sigterm_pc = '-'
+ (masf_s, masf_m, mem_available, swap_min_sigkill_kb, swap_min_sigterm_kb,
+ swap_free, swap_total) = check_mem_swap_ex()
if CHECK_ZRAM:
- mem_used_zram = check_zram()
+ zram_s, zram_m, mem_used_zram = check_zram_ex()
if CHECK_PSI:
- psi_avg_value = find_psi_metrics_value(psi_path, psi_metrics)
- if time() - psi_t0 >= psi_post_action_delay:
- psi_post_action_delay_exceeded = True
- else:
- psi_post_action_delay_exceeded = False
+ (psi_s, psi_m, psi_t0, psi_kill_exceeded_timer,
+ psi_term_exceeded_timer, x0) = check_psi_ex(
+ psi_t0,psi_kill_exceeded_timer,psi_term_exceeded_timer,x0)
- if print_mem_check_results:
- psi_avg_string = 'PSI avg: {} | '.format(
- str(psi_avg_value).rjust(6))
if print_mem_check_results:
+ if CHECK_PSI:
+ psi_avg_value = find_psi_metrics_value(psi_path, psi_metrics)
+ if time() - psi_t0 >= psi_post_action_delay:
+ psi_post_action_delay_exceeded = True
+ else:
+ psi_post_action_delay_exceeded = False
+
+ if print_mem_check_results:
+ psi_avg_string = 'PSI avg: {} | '.format(
+ str(psi_avg_value).rjust(6))
+
wt1 = time()
delta = (mem_available + swap_free) - new_mem
@@ -3024,168 +3105,64 @@ while True:
)
)
- ###########################################################################
- # CHECK HARD THRESHOLDS (SIGKILL LEVEL)
+ if masf_s is SIGKILL or zram_s is SIGKILL or psi_s is SIGKILL:
- if (mem_available <= mem_min_sigkill_kb and
- swap_free <= swap_min_sigkill_kb):
+ threshold = SIGKILL
+ mem_info_list = []
- mem_info = 'Hard threshold exceeded\nMemory status that requ' \
- 'ires corrective actions:' \
- '\n MemAvailable [{} MiB, {} %] <= mem_min_sig' \
- 'kill [{} MiB, {} %]\n SwapFree [{} MiB, {} %] <= swa' \
- 'p_min_sigkill [{} MiB, {} %]'.format(
- kib_to_mib(mem_available),
- percent(mem_available / mem_total),
- kib_to_mib(mem_min_sigkill_kb),
- percent(mem_min_sigkill_kb / mem_total),
- kib_to_mib(swap_free),
- percent(swap_free / (swap_total + 0.1)),
- kib_to_mib(swap_min_sigkill_kb),
- swap_sigkill_pc)
+ if masf_m is not None:
+ mem_info_list.append(masf_m)
- implement_corrective_action(SIGKILL, mem_info)
- psi_t0 = time()
+ if zram_m is not None:
+ mem_info_list.append(zram_m)
+
+ if psi_m is not None:
+ mem_info_list.append(psi_m)
+
+ psi_t0 = implement_corrective_action(
+ threshold,
+ mem_info_list,
+ psi_t0,
+ psi_kill_exceeded_timer,
+ psi_term_exceeded_timer,
+ x0, psi_s, zram_s, zram_m, psi_m)
continue
- if CHECK_ZRAM:
- if mem_used_zram >= zram_max_sigkill_kb:
+ if masf_s is SIGTERM or zram_s is SIGTERM or psi_s is SIGTERM:
- mem_info = 'Hard threshold exceeded\nMemory status that requir' \
- 'es corrective actions:' \
- '\n MemUsedZram [{} MiB, {} %] >= zram_max_sig' \
- 'kill [{} MiB, {} %]'.format(
- kib_to_mib(mem_used_zram),
- percent(mem_used_zram / mem_total),
- kib_to_mib(zram_max_sigkill_kb),
- percent(zram_max_sigkill_kb / mem_total))
+ threshold = SIGTERM
+ mem_info_list = []
- implement_corrective_action(SIGKILL, mem_info)
- psi_t0 = time()
- continue
+ if masf_m is not None:
+ mem_info_list.append(masf_m)
- if CHECK_PSI:
+ if zram_m is not None:
+ mem_info_list.append(zram_m)
- if psi_avg_value >= sigkill_psi_threshold:
- sigkill_psi_exceeded = True
- psi_kill_exceeded_timer += delta0
- else:
- sigkill_psi_exceeded = False
- psi_kill_exceeded_timer = 0
+ if psi_m is not None:
+ mem_info_list.append(psi_m)
- if psi_debug:
-
- log('psi_post_action_delay_exceeded: {}\nsigkill_psi_exceeded'
- ': {}\npsi_kill_exceeded_timer: {}'.format(
- psi_post_action_delay_exceeded,
- sigkill_psi_exceeded,
- round(psi_kill_exceeded_timer, 1)
- )
- )
-
- if (psi_kill_exceeded_timer >= psi_excess_duration and
- psi_post_action_delay_exceeded):
-
- mem_info = 'PSI avg ({}) > sigkill_psi_threshold ({})\n' \
- 'PSI avg exceeded psi_excess_duration (value' \
- ' = {} sec) for {} seconds'.format(
- psi_avg_value,
- sigkill_psi_threshold,
- psi_excess_duration,
- round(psi_kill_exceeded_timer, 1)
- )
-
- implement_corrective_action(SIGKILL, mem_info)
- psi_t0 = time()
- continue
-
- ###########################################################################
-
- # CHECK SOFT THRESHOLDS (SIGTERM LEVEL)
-
- if (mem_available <= mem_min_sigterm_kb and
- swap_free <= swap_min_sigterm_kb):
-
- mem_info = 'Soft threshold exceeded\nMemory status that requi' \
- 'res corrective actions:' \
- '\n MemAvailable [{} MiB, {} %] <= mem_min_sig' \
- 'term [{} MiB, {} %]\n SwapFree [{} MiB, {} %] <= swa' \
- 'p_min_sigterm [{} MiB, {} %]'.format(
- kib_to_mib(mem_available),
- percent(mem_available / mem_total),
- kib_to_mib(mem_min_sigterm_kb),
- round(mem_min_sigterm_percent, 1),
- kib_to_mib(swap_free),
- percent(swap_free / (swap_total + 0.1)),
- kib_to_mib(swap_min_sigterm_kb),
- swap_sigterm_pc)
-
- implement_corrective_action(SIGTERM, mem_info)
- psi_t0 = time()
+ psi_t0 = implement_corrective_action(
+ threshold,
+ mem_info_list,
+ psi_t0,
+ psi_kill_exceeded_timer,
+ psi_term_exceeded_timer,
+ x0, psi_s, zram_s, zram_m, psi_m)
continue
- if CHECK_ZRAM:
- if mem_used_zram >= zram_max_sigterm_kb:
-
- mem_info = 'Soft threshold exceeded\nMemory status that require' \
- 's corrective actions:\n MemUsedZram [{} MiB, {} %] >= zra' \
- 'm_max_sigterm [{} M, {} %]'.format(
- kib_to_mib(mem_used_zram),
- percent(mem_used_zram / mem_total),
- kib_to_mib(zram_max_sigterm_kb),
- percent(zram_max_sigterm_kb / mem_total))
-
- implement_corrective_action(SIGTERM, mem_info)
- psi_t0 = time()
- continue
-
- if CHECK_PSI:
- if psi_avg_value >= sigterm_psi_threshold:
- sigterm_psi_exceeded = True
- psi_term_exceeded_timer += delta0
- else:
- sigterm_psi_exceeded = False
- psi_term_exceeded_timer = 0
-
- if psi_debug:
-
- log('sigterm_psi_exceeded: {}\n'
- 'psi_term_exceeded_timer: {}\n'.format(
- sigterm_psi_exceeded,
- round(psi_term_exceeded_timer, 1)
- )
- )
-
- if (psi_term_exceeded_timer >= psi_excess_duration and
- psi_post_action_delay_exceeded):
-
- mem_info = 'PSI avg ({}) > sigterm_psi_threshold ({})\n' \
- 'PSI avg exceeded psi_excess_duration (value' \
- ' = {} sec) for {} seconds'.format(
- psi_avg_value,
- sigterm_psi_threshold,
- psi_excess_duration,
- round(psi_term_exceeded_timer, 1)
- )
-
- implement_corrective_action(SIGTERM, mem_info)
- psi_t0 = time()
- continue
-
- ###########################################################################
-
if gui_low_memory_warnings:
- if (mem_available <= mem_min_warnings_kb and
- swap_free <= swap_min_warnings_kb + 0.1 or
- mem_used_zram >= zram_max_warnings_kb):
+ if masf_s == 'WARN' or zram_s == 'WARN' or psi_s == 'WARN':
warn_time_delta = time() - warn_time_now
warn_time_now = time()
warn_timer += warn_time_delta
if warn_timer > min_time_between_warnings:
+
send_notify_warn()
+
warn_timer = 0
sleep_after_check_mem()
diff --git a/nohang.conf b/nohang.conf
index ad3e0b9..a2ed25a 100644
--- a/nohang.conf
+++ b/nohang.conf
@@ -107,7 +107,7 @@ sigterm_psi_threshold = 60
sigkill_psi_threshold = 90
>= 0, float
-psi_excess_duration = 30
+psi_excess_duration = 40
psi_post_action_delay = 20
@@ -289,6 +289,8 @@ swap_min_warnings = 50 %
zram_max_warnings = 40 %
+psi_avg_warnings = 60
+
Valid values are floating-point numbers from the range [1; 300].
min_time_between_warnings = 15
@@ -336,7 +338,7 @@ print_victim_info = True
print_victim_cmdline = False
-max_ancestry_depth = 1
+max_ancestry_depth = 5
separate_log = False
diff --git a/trash/psi_dummy b/trash/psi_dummy
index 60b9136..f490e2e 100644
--- a/trash/psi_dummy
+++ b/trash/psi_dummy
@@ -1,2 +1,2 @@
-some avg10=29.70 avg60=51.59 avg300=22.92 total=195239452
-full avg10=28.82 avg60=49.77 avg300=21.83 total=182504463
+some avg10=56.70 avg60=51.59 avg300=22.92 total=195239452
+full avg10=28.82 avg60=49.77 avg300=21.83 total=182504463