fix broken psi and zram

This commit is contained in:
Alexey Avramov 2019-06-09 21:14:41 +09:00
parent fdf2a1bebf
commit 30132b3e03
3 changed files with 330 additions and 351 deletions

671
nohang
View File

@ -815,11 +815,11 @@ def send_notify_warn():
send_notification(title, body)
def send_notify(signal, name, pid):
def send_notify(threshold, name, pid):
"""
Notificate about OOM Preventing.
signal: key for notify_sig_dict
threshold: key for notify_sig_dict
name: str process name
pid: str process pid
"""
@ -831,7 +831,7 @@ def send_notify(signal, name, pid):
title = 'Freeze prevention'
body = '<b>{}</b> [{}] <b>{}</b>'.format(
notify_sig_dict[signal],
notify_sig_dict[threshold],
pid,
name.replace(
# symbol '&' can break notifications in some themes,
@ -1041,6 +1041,7 @@ def find_victim(_print_proc_table):
)[0]
pid = pid_tuple_list[0]
victim_id = get_victim_id(pid)
# Get maximum 'badness' value
victim_badness = pid_tuple_list[1]
@ -1062,7 +1063,7 @@ def find_victim(_print_proc_table):
)
)
return pid, victim_badness, victim_name
return pid, victim_badness, victim_name, victim_id
def find_victim_info(pid, victim_badness, name):
@ -1271,18 +1272,6 @@ def find_victim_info(pid, victim_badness, name):
return victim_info
def check_mem_swap_ex():
"""
Check: is mem and swap threshold exceeded?
@ -1314,8 +1303,8 @@ def check_mem_swap_ex():
if (mem_available <= mem_min_sigkill_kb and
swap_free <= swap_min_sigkill_kb):
mem_info = 'Hard threshold exceeded\nMemory status that requ' \
'ires corrective actions:' \
mem_info = 'Memory status that requ' \
'ires corrective actions (hard threshold exceeded):' \
'\n MemAvailable [{} MiB, {} %] <= mem_min_sig' \
'kill [{} MiB, {} %]\n SwapFree [{} MiB, {} %] <= swa' \
'p_min_sigkill [{} MiB, {} %]'.format(
@ -1328,13 +1317,13 @@ def check_mem_swap_ex():
kib_to_mib(swap_min_sigkill_kb),
swap_sigkill_pc)
return SIGKILL, mem_info
return SIGKILL, mem_info, mem_available, swap_min_sigkill_kb, swap_min_sigterm_kb, swap_free, swap_total
if (mem_available <= mem_min_sigterm_kb and
swap_free <= swap_min_sigterm_kb):
mem_info = 'Soft threshold exceeded\nMemory status that requi' \
'res corrective actions:' \
mem_info = 'Memory status that requi' \
'res corrective actions (soft threshold exceeded):' \
'\n MemAvailable [{} MiB, {} %] <= mem_min_sig' \
'term [{} MiB, {} %]\n SwapFree [{} MiB, {} %] <= swa' \
'p_min_sigterm [{} MiB, {} %]'.format(
@ -1347,24 +1336,146 @@ def check_mem_swap_ex():
kib_to_mib(swap_min_sigterm_kb),
swap_sigterm_pc)
return SIGTERM, mem_info
return SIGTERM, mem_info, mem_available, swap_min_sigkill_kb, swap_min_sigterm_kb, swap_free, swap_total
return None, None
if gui_low_memory_warnings:
if (mem_available <= mem_min_warnings_kb and swap_free <=
swap_min_warnings_kb + 0.1):
return 'WARN', None, mem_available, swap_min_sigkill_kb, swap_min_sigterm_kb, swap_free, swap_total
return None, None, mem_available, swap_min_sigkill_kb, swap_min_sigterm_kb, swap_free, swap_total
def check_zram_ex():
    """
    Check: is a zram threshold exceeded?

    Reads the current zram usage with check_zram() and compares it with
    zram_max_sigkill_kb, zram_max_sigterm_kb and (only if
    gui_low_memory_warnings is enabled) zram_max_warnings_kb, in that
    order. The first threshold that is reached wins.

    Returns a tuple (threshold, mem_info, mem_used_zram):
        threshold: SIGKILL, SIGTERM, 'WARN' or None
        mem_info: str message for the log (None for 'WARN' and None)
        mem_used_zram: the value returned by check_zram()
            (presumably KiB, judging by the *_kb thresholds - confirm)
    """
    mem_used_zram = check_zram()

    # Hard threshold has priority over the soft one.
    if mem_used_zram >= zram_max_sigkill_kb:
        mem_info = 'Memory status that requir' \
            'es corrective actions (hard threshold exceeded):' \
            '\n MemUsedZram [{} MiB, {} %] >= zram_max_sig' \
            'kill [{} MiB, {} %]'.format(
                kib_to_mib(mem_used_zram),
                percent(mem_used_zram / mem_total),
                kib_to_mib(zram_max_sigkill_kb),
                percent(zram_max_sigkill_kb / mem_total))
        return SIGKILL, mem_info, mem_used_zram

    if mem_used_zram >= zram_max_sigterm_kb:
        # The unit label is 'MiB' (was 'M'): the value is produced by
        # kib_to_mib() exactly like in the hard threshold message.
        mem_info = 'Memory status that require' \
            's corrective actions (soft threshold exceeded):' \
            '\n MemUsedZram [{} MiB, {} %] >= zra' \
            'm_max_sigterm [{} MiB, {} %]'.format(
                kib_to_mib(mem_used_zram),
                percent(mem_used_zram / mem_total),
                kib_to_mib(zram_max_sigterm_kb),
                percent(zram_max_sigterm_kb / mem_total))
        return SIGTERM, mem_info, mem_used_zram

    # Warnings are reported only when GUI low memory warnings are enabled.
    if gui_low_memory_warnings and mem_used_zram >= zram_max_warnings_kb:
        return 'WARN', None, mem_used_zram

    return None, None, mem_used_zram
def check_psi_ex(psi_t0, psi_kill_exceeded_timer, psi_term_exceeded_timer, x0):
    """
    Check: is a PSI threshold exceeded for long enough?

    psi_t0: timestamp from which psi_post_action_delay is measured;
        it is returned unchanged
    psi_kill_exceeded_timer: accumulated time during which the PSI value
        has been >= sigkill_psi_threshold
    psi_term_exceeded_timer: accumulated time during which the PSI value
        has been >= sigterm_psi_threshold
    x0: timestamp of the previous check, used to compute the elapsed time

    Returns a tuple (threshold, mem_info, psi_t0, psi_kill_exceeded_timer,
    psi_term_exceeded_timer, x0):
        threshold: SIGKILL, SIGTERM, 'WARN' or None
        mem_info: str message for the log (None for 'WARN' and None)
        the remaining items are the updated state to pass to the next call
    """
    # Read the clock once: two separate time() calls would drop the
    # interval between them from the accumulated timers.
    now = time()
    delta0 = now - x0
    x0 = now

    psi_avg_value = find_psi_metrics_value(psi_path, psi_metrics)

    psi_post_action_delay_timer = time() - psi_t0
    psi_post_action_delay_exceeded = (
        psi_post_action_delay_timer >= psi_post_action_delay)

    # Hard (SIGKILL) level: accumulate while exceeded, reset otherwise.
    sigkill_psi_exceeded = psi_avg_value >= sigkill_psi_threshold
    if sigkill_psi_exceeded:
        psi_kill_exceeded_timer += delta0
    else:
        psi_kill_exceeded_timer = 0

    if psi_debug:
        log('psi_post_action_delay_timer: {}'.format(
            round(psi_post_action_delay_timer, 3)))
        log('psi_post_action_delay_exceeded: {}\nsigkill_psi_exceeded'
            ': {}\npsi_kill_exceeded_timer: {}'.format(
                psi_post_action_delay_exceeded,
                sigkill_psi_exceeded,
                round(psi_kill_exceeded_timer, 1)
            )
            )

    # sigkill_psi_exceeded is checked explicitly: with
    # psi_excess_duration == 0 a freshly reset timer (0) would otherwise
    # satisfy "timer >= duration" while PSI is below the threshold.
    if (sigkill_psi_exceeded and
            psi_kill_exceeded_timer >= psi_excess_duration and
            psi_post_action_delay_exceeded):

        mem_info = 'PSI avg ({}) > sigkill_psi_threshold ({})\n' \
            'PSI avg exceeded psi_excess_duration (value' \
            ' = {} sec) for {} seconds'.format(
                psi_avg_value,
                sigkill_psi_threshold,
                psi_excess_duration,
                round(psi_kill_exceeded_timer, 1)
            )

        # NOTE: psi_t0 is intentionally NOT reset here. Per the original
        # author's note, the timer has to be reset AFTER a corrective
        # action has been applied (a signal was sent, or the victim died
        # while it was being searched for), so that is left to the caller.
        return (SIGKILL, mem_info, psi_t0, psi_kill_exceeded_timer,
                psi_term_exceeded_timer, x0)

    # Soft (SIGTERM) level: same accounting with its own timer.
    sigterm_psi_exceeded = psi_avg_value >= sigterm_psi_threshold
    if sigterm_psi_exceeded:
        psi_term_exceeded_timer += delta0
    else:
        psi_term_exceeded_timer = 0

    if psi_debug:
        log('sigterm_psi_exceeded: {}\n'
            'psi_term_exceeded_timer: {}\n'.format(
                sigterm_psi_exceeded,
                round(psi_term_exceeded_timer, 1)
            )
            )

    if (sigterm_psi_exceeded and
            psi_term_exceeded_timer >= psi_excess_duration and
            psi_post_action_delay_exceeded):

        mem_info = 'PSI avg ({}) > sigterm_psi_threshold ({})\n' \
            'PSI avg exceeded psi_excess_duration (value' \
            ' = {} sec) for {} seconds'.format(
                psi_avg_value,
                sigterm_psi_threshold,
                psi_excess_duration,
                round(psi_term_exceeded_timer, 1)
            )

        return (SIGTERM, mem_info, psi_t0, psi_kill_exceeded_timer,
                psi_term_exceeded_timer, x0)

    # Warnings are reported only when GUI low memory warnings are enabled.
    if gui_low_memory_warnings and psi_avg_value >= psi_avg_warnings:
        return ('WARN', None, psi_t0, psi_kill_exceeded_timer,
                psi_term_exceeded_timer, x0)

    return (None, None, psi_t0, psi_kill_exceeded_timer,
            psi_term_exceeded_timer, x0)
def is_victim_alive(pid):
@ -1384,31 +1495,26 @@ def is_victim_alive(pid):
return 0
def implement_corrective_action(signal, mem_info):
def implement_corrective_action(
threshold,
mem_info_list,
psi_t0,
psi_kill_exceeded_timer,
psi_term_exceeded_timer,
x0, psi_s, zram_s, zram_m, psi_m):
"""
Find victim with highest badness and send SIGTERM/SIGKILL
"""
# Ёбаная запутанная фция. Распутать всё нахуй. Выделить части в отдельн фции.
# Разбить саму фцию на части. Сделать ее структуру простой и понятной.
time0 = time() # начало корр действия. Для вычисл времени действия.
time0 = time() # начало корр действия. Для вычисл времени действия.
# выходим из фции, если для SIGTERM порога не превышено время
# min_delay_after_sigterm и спим в течение over_sleep
# если хард порог превышен - идем дальше.
if signal is SIGTERM:
if threshold is SIGTERM:
dt = time() - actions_time_dict['action_handled'][0]
@ -1422,7 +1528,7 @@ def implement_corrective_action(signal, mem_info):
sleep(over_sleep)
return None # время задержки между действиями не истекло
return psi_t0 # время задержки между действиями не истекло
else:
log('min_delay_after_sigterm IS EXCEEDED, it is time to action')
@ -1442,60 +1548,104 @@ def implement_corrective_action(signal, mem_info):
"""
log(mem_info)
for i in mem_info_list:
log(i)
# ищем жертву с ее бэднес.
pid, victim_badness, name = find_victim(print_proc_table)
pid, victim_badness, name, victim_id = find_victim(print_proc_table)
# sleep(0.1)
new_signal, mem_info = check_mem_swap_ex()
#log(new_signal)
#log(mem_info)
if new_signal is None:
log('Recheck memory levels...')
# перепроверяем пороги: они могли измениться за время поиска жертвы
(masf_s, masf_m, mem_available, swap_min_sigkill_kb, swap_min_sigterm_kb,
swap_free, swap_total) = check_mem_swap_ex()
if CHECK_ZRAM:
zram_s, zram_m, mem_used_zram = check_zram_ex()
if CHECK_PSI:
(psi_s, psi_m, psi_t0, psi_kill_exceeded_timer,
psi_term_exceeded_timer, x0) = check_psi_ex(
psi_t0,psi_kill_exceeded_timer,psi_term_exceeded_timer,x0)
if masf_s is SIGKILL or zram_s is SIGKILL or psi_s is SIGKILL:
new_threshold = SIGKILL
mem_info_list = []
if masf_s is SIGKILL or masf_s is SIGTERM:
mem_info_list.append(masf_m)
if zram_s is SIGKILL or zram_s is SIGTERM:
mem_info_list.append(zram_m)
if psi_s is SIGKILL or psi_s is SIGTERM:
mem_info_list.append(psi_m)
elif masf_s is SIGTERM or zram_s is SIGTERM or psi_s is SIGTERM:
new_threshold = SIGTERM
mem_info_list = []
if masf_s is SIGKILL or masf_s is SIGTERM:
mem_info_list.append(masf_m)
if zram_s is SIGKILL or zram_s is SIGTERM:
mem_info_list.append(zram_m)
if psi_s is SIGKILL or psi_s is SIGTERM:
mem_info_list.append(psi_m)
else:
log('Thresholds is not exceeded now')
return None
return psi_t0
if new_signal is not signal:
log(mem_info)
signal = new_signal
#log(mem_info)
# печать порогов
for i in mem_info_list:
log(i)
# может это излишне
if new_threshold is None or new_threshold == 'WARN':
log('Thresholds is not exceeded now')
return psi_t0
threshold = new_threshold
if victim_badness >= min_badness:
psi_t0 = time()
if print_victim_info:
victim_info = find_victim_info(pid, victim_badness, name)
log(victim_info)
# пороги могли превысиься за время поиска жертвы (поиск может занимать
# сотни миллисекунд)
mem_available, swap_total, swap_free = check_mem_and_swap()
ma_mib = int(mem_available) / 1024.0
sf_mib = int(swap_free) / 1024.0
log('Memory status before implementing a corrective act'
'ion:\n MemAvailable'
': {} MiB, SwapFree: {} MiB'.format(
round(ma_mib, 1), round(sf_mib, 1)
)
)
if (mem_available <= mem_min_sigkill_kb and
swap_free <= swap_min_sigkill_kb):
log('Hard threshold exceeded')
signal = SIGKILL
victim_id = get_victim_id(pid)
# kill the victim if it doesn't respond to SIGTERM В ТЕЧЕНИЕ
# ЗАДАННОГО ВРЕМЕНИ
# переопределяем сигнал для старых жертв
if signal is SIGTERM:
if threshold is SIGTERM:
if victim_id in victim_dict:
@ -1504,12 +1654,12 @@ def implement_corrective_action(signal, mem_info):
if dt > max_post_sigterm_victim_lifetime:
print('max_post_sigterm_victim_lifetime exceeded: the '
'victim will get SIGKILL')
signal = SIGKILL
threshold = SIGKILL
# matching with re to customize corrective actions
soft_match = False
if soft_actions and signal is SIGTERM:
if soft_actions and threshold is SIGTERM:
name = pid_to_name(pid)
cgroup_v1 = pid_to_cgroup_v1(pid)
service = ''
@ -1530,9 +1680,7 @@ def implement_corrective_action(signal, mem_info):
soft_match = True
break
if soft_match: # ПЕРЕОПРЕДЕЛЕНИЕ МЯГКОГО КОРРЕКТИРУЮЩЕГО ДЕЙСТВИЯ
if soft_match: # ПЕРЕОПРЕДЕЛЕНИЕ МЯГКОГО КОРРЕКТИРУЮЩЕГО ДЕЙСТВИЯ
# todo: make new func
m = check_mem_and_swap()
@ -1588,16 +1736,11 @@ def implement_corrective_action(signal, mem_info):
# обычное действие через сигнал
# вот тут поработать. Тут ебаный цикл. Нахуй его.
try: # вот тут засрано, в блоке try должно быть только kill(), остальное ниже за пределами
try:
os.kill(int(pid), signal)
os.kill(int(pid), threshold)
kill_timestamp = time()
response_time = kill_timestamp - time0
@ -1625,10 +1768,7 @@ def implement_corrective_action(signal, mem_info):
log('Process exited (VmRSS = 0) in {} sec'.format(
round(dt, 5)))
if signal is SIGKILL or victim_alive == 2:
if threshold is SIGKILL or victim_alive == 2:
# жертва умирает от SIGKILL. Дожидаемся ее полной смерти.
while True:
@ -1641,8 +1781,7 @@ def implement_corrective_action(signal, mem_info):
log('The victim died in {} sec'.format(
round(kill_duration, 3)))
"""
mem_available, swap_total, swap_free = check_mem_and_swap()
ma_mib = int(mem_available) / 1024.0
@ -1653,17 +1792,18 @@ def implement_corrective_action(signal, mem_info):
round(ma_mib, 1), round(sf_mib, 1)
)
)
"""
send_result = 'total response time: {} ms'.format(
round(response_time * 1000))
preventing_oom_message = 'Implement a corrective action:' \
'\n Send {} to the victim; {}'.format(
sig_dict[signal], send_result)
sig_dict[threshold], send_result)
key = 'Send {} to {}'.format(sig_dict[signal], name)
key = 'Send {} to {}'.format(sig_dict[threshold], name)
if signal is SIGKILL and post_kill_exe != '':
if threshold is SIGKILL and post_kill_exe != '':
cmd = post_kill_exe.replace('$PID', pid).replace(
'$NAME', pid_to_name(pid))
@ -1673,7 +1813,7 @@ def implement_corrective_action(signal, mem_info):
exe(cmd)
if gui_notifications:
send_notify(signal, name, pid)
send_notify(threshold, name, pid)
except FileNotFoundError:
response_time = time() - time0
@ -1696,10 +1836,11 @@ def implement_corrective_action(signal, mem_info):
update_stat_dict_and_print(key)
# нехуй делать, бэднес жертвы слишком мал
else:
# может эту часть наверх отправить через if
response_time = time() - time0
victim_badness_is_too_small = 'victim badness {} < min_b' \
'adness {}; nothing to do; response time: {} ms'.format(
@ -1724,21 +1865,13 @@ def implement_corrective_action(signal, mem_info):
print('##################################################################')
sleep(over_sleep) # Спать если бэднес жертвы мал
# Что делать с psi_t0 если у мертвы мал бэднес? Ничего, потому что кор действия не было.
# демон может жрать 10% цпу при этом. Можно отдельн парам ввести. А можно
# не вводить. кек
return psi_t0
def sleep_after_check_mem():
@ -1802,8 +1935,8 @@ def sleep_after_check_mem():
log(
'Sleep {} sec (t_mem={}, t_swap={}{})'.format(
round(t, 2),round(t_mem, 2),round(t_swap, 2), z)
)
round(t, 2), round(t_mem, 2), round(t_swap, 2), z)
)
try:
stdout.flush()
@ -1874,27 +2007,9 @@ def calculate_percent(arg_key):
return mem_min_kb, mem_min_mb, mem_min_percent
##########################################################################
victim_dict = dict()
victim_id = None
actions_time_dict = dict()
@ -1902,31 +2017,6 @@ actions_time_dict['action_handled'] = [time(), victim_id]
# print(actions_time_dict)
start_time = time()
@ -1974,17 +2064,6 @@ else:
notify_helper_path = '/usr/sbin/nohang_notify_helper'
# will store corrective actions stat
stat_dict = dict()
@ -2319,8 +2398,6 @@ print_victim_info = conf_parse_bool('print_victim_info')
print_victim_cmdline = conf_parse_bool('print_victim_cmdline')
print_config = conf_parse_bool('print_config')
print_mem_check_results = conf_parse_bool('print_mem_check_results')
print_sleep_periods = conf_parse_bool('print_sleep_periods')
@ -2459,6 +2536,20 @@ else:
exit(1)
# 'psi_avg_warnings' is a mandatory config key: it must be present, must be
# convertible to float and must be within [0; 100]; otherwise exit with 1.
if 'psi_avg_warnings' not in config_dict:
    errprint('psi_avg_warnings not in config\nExit')
    exit(1)
else:
    psi_avg_warnings = string_to_float_convert_test(
        config_dict['psi_avg_warnings'])
    if psi_avg_warnings is None:
        # the converter returned None: the value is not a float
        errprint('Invalid psi_avg_warnings value, not float\nExit')
        exit(1)
    elif psi_avg_warnings < 0 or psi_avg_warnings > 100:
        errprint('psi_avg_warnings must be in the range [0; 100]\nExit')
        exit(1)
if 'min_badness' in config_dict:
min_badness = string_to_int_convert_test(
config_dict['min_badness'])
@ -2876,9 +2967,6 @@ psi_avg_string = '' # will be overwritten if PSI monitoring enabled
mem_used_zram = 0
if psi_support and not ignore_psi:
psi_t0 = time()
if print_mem_check_results:
@ -2895,10 +2983,25 @@ for i in sig_list:
signal(i, signal_handler)
x0 = time()
delta0 = 0
threshold = None
mem_info = None
#print(x0, 'x0')
CHECK_PSI = False
if psi_support and not ignore_psi:
CHECK_PSI = True
psi_kill_exceeded_timer = 0
psi_term_exceeded_timer = 0
psi_t0 = time()
psi_s = zram_s = zram_m = psi_m = None
CHECK_ZRAM = not ignore_zram
@ -2907,58 +3010,36 @@ log('Monitoring has started!')
stdout.flush()
psi_kill_exceeded_timer = psi_term_exceeded_timer = delta0 = 0
x0 = time()
##########################################################################
while True:
delta0 = time() - x0
x0 = time()
# FIND VALUES: mem, swap, zram, psi
mem_available, swap_total, swap_free = check_mem_and_swap()
# if swap_min_sigkill is set in percent
if swap_kill_is_percent:
swap_min_sigkill_kb = swap_total * swap_min_sigkill_percent / 100.0
if swap_term_is_percent:
swap_min_sigterm_kb = swap_total * swap_min_sigterm_percent / 100.0
if swap_warn_is_percent:
swap_min_warnings_kb = swap_total * swap_min_warnings_percent / 100.0
if swap_total > swap_min_sigkill_kb:
swap_sigkill_pc = percent(swap_min_sigkill_kb / (swap_total + 0.1))
else:
swap_sigkill_pc = '-'
if swap_total > swap_min_sigterm_kb:
swap_sigterm_pc = percent(swap_min_sigterm_kb / (swap_total + 0.1))
else:
swap_sigterm_pc = '-'
(masf_s, masf_m, mem_available, swap_min_sigkill_kb, swap_min_sigterm_kb,
swap_free, swap_total) = check_mem_swap_ex()
if CHECK_ZRAM:
mem_used_zram = check_zram()
zram_s, zram_m, mem_used_zram = check_zram_ex()
if CHECK_PSI:
psi_avg_value = find_psi_metrics_value(psi_path, psi_metrics)
if time() - psi_t0 >= psi_post_action_delay:
psi_post_action_delay_exceeded = True
else:
psi_post_action_delay_exceeded = False
(psi_s, psi_m, psi_t0, psi_kill_exceeded_timer,
psi_term_exceeded_timer, x0) = check_psi_ex(
psi_t0,psi_kill_exceeded_timer,psi_term_exceeded_timer,x0)
if print_mem_check_results:
psi_avg_string = 'PSI avg: {} | '.format(
str(psi_avg_value).rjust(6))
if print_mem_check_results:
if CHECK_PSI:
psi_avg_value = find_psi_metrics_value(psi_path, psi_metrics)
if time() - psi_t0 >= psi_post_action_delay:
psi_post_action_delay_exceeded = True
else:
psi_post_action_delay_exceeded = False
if print_mem_check_results:
psi_avg_string = 'PSI avg: {} | '.format(
str(psi_avg_value).rjust(6))
wt1 = time()
delta = (mem_available + swap_free) - new_mem
@ -3024,168 +3105,64 @@ while True:
)
)
###########################################################################
# CHECK HARD THRESHOLDS (SIGKILL LEVEL)
if masf_s is SIGKILL or zram_s is SIGKILL or psi_s is SIGKILL:
if (mem_available <= mem_min_sigkill_kb and
swap_free <= swap_min_sigkill_kb):
threshold = SIGKILL
mem_info_list = []
mem_info = 'Hard threshold exceeded\nMemory status that requ' \
'ires corrective actions:' \
'\n MemAvailable [{} MiB, {} %] <= mem_min_sig' \
'kill [{} MiB, {} %]\n SwapFree [{} MiB, {} %] <= swa' \
'p_min_sigkill [{} MiB, {} %]'.format(
kib_to_mib(mem_available),
percent(mem_available / mem_total),
kib_to_mib(mem_min_sigkill_kb),
percent(mem_min_sigkill_kb / mem_total),
kib_to_mib(swap_free),
percent(swap_free / (swap_total + 0.1)),
kib_to_mib(swap_min_sigkill_kb),
swap_sigkill_pc)
if masf_m is not None:
mem_info_list.append(masf_m)
implement_corrective_action(SIGKILL, mem_info)
psi_t0 = time()
if zram_m is not None:
mem_info_list.append(zram_m)
if psi_m is not None:
mem_info_list.append(psi_m)
psi_t0 = implement_corrective_action(
threshold,
mem_info_list,
psi_t0,
psi_kill_exceeded_timer,
psi_term_exceeded_timer,
x0, psi_s, zram_s, zram_m, psi_m)
continue
if CHECK_ZRAM:
if mem_used_zram >= zram_max_sigkill_kb:
if masf_s is SIGTERM or zram_s is SIGTERM or psi_s is SIGTERM:
mem_info = 'Hard threshold exceeded\nMemory status that requir' \
'es corrective actions:' \
'\n MemUsedZram [{} MiB, {} %] >= zram_max_sig' \
'kill [{} MiB, {} %]'.format(
kib_to_mib(mem_used_zram),
percent(mem_used_zram / mem_total),
kib_to_mib(zram_max_sigkill_kb),
percent(zram_max_sigkill_kb / mem_total))
threshold = SIGTERM
mem_info_list = []
implement_corrective_action(SIGKILL, mem_info)
psi_t0 = time()
continue
if masf_m is not None:
mem_info_list.append(masf_m)
if CHECK_PSI:
if zram_m is not None:
mem_info_list.append(zram_m)
if psi_avg_value >= sigkill_psi_threshold:
sigkill_psi_exceeded = True
psi_kill_exceeded_timer += delta0
else:
sigkill_psi_exceeded = False
psi_kill_exceeded_timer = 0
if psi_m is not None:
mem_info_list.append(psi_m)
if psi_debug:
log('psi_post_action_delay_exceeded: {}\nsigkill_psi_exceeded'
': {}\npsi_kill_exceeded_timer: {}'.format(
psi_post_action_delay_exceeded,
sigkill_psi_exceeded,
round(psi_kill_exceeded_timer, 1)
)
)
if (psi_kill_exceeded_timer >= psi_excess_duration and
psi_post_action_delay_exceeded):
mem_info = 'PSI avg ({}) > sigkill_psi_threshold ({})\n' \
'PSI avg exceeded psi_excess_duration (value' \
' = {} sec) for {} seconds'.format(
psi_avg_value,
sigkill_psi_threshold,
psi_excess_duration,
round(psi_kill_exceeded_timer, 1)
)
implement_corrective_action(SIGKILL, mem_info)
psi_t0 = time()
continue
###########################################################################
# CHECK SOFT THRESHOLDS (SIGTERM LEVEL)
if (mem_available <= mem_min_sigterm_kb and
swap_free <= swap_min_sigterm_kb):
mem_info = 'Soft threshold exceeded\nMemory status that requi' \
'res corrective actions:' \
'\n MemAvailable [{} MiB, {} %] <= mem_min_sig' \
'term [{} MiB, {} %]\n SwapFree [{} MiB, {} %] <= swa' \
'p_min_sigterm [{} MiB, {} %]'.format(
kib_to_mib(mem_available),
percent(mem_available / mem_total),
kib_to_mib(mem_min_sigterm_kb),
round(mem_min_sigterm_percent, 1),
kib_to_mib(swap_free),
percent(swap_free / (swap_total + 0.1)),
kib_to_mib(swap_min_sigterm_kb),
swap_sigterm_pc)
implement_corrective_action(SIGTERM, mem_info)
psi_t0 = time()
psi_t0 = implement_corrective_action(
threshold,
mem_info_list,
psi_t0,
psi_kill_exceeded_timer,
psi_term_exceeded_timer,
x0, psi_s, zram_s, zram_m, psi_m)
continue
if CHECK_ZRAM:
if mem_used_zram >= zram_max_sigterm_kb:
mem_info = 'Soft threshold exceeded\nMemory status that require' \
's corrective actions:\n MemUsedZram [{} MiB, {} %] >= zra' \
'm_max_sigterm [{} M, {} %]'.format(
kib_to_mib(mem_used_zram),
percent(mem_used_zram / mem_total),
kib_to_mib(zram_max_sigterm_kb),
percent(zram_max_sigterm_kb / mem_total))
implement_corrective_action(SIGTERM, mem_info)
psi_t0 = time()
continue
if CHECK_PSI:
if psi_avg_value >= sigterm_psi_threshold:
sigterm_psi_exceeded = True
psi_term_exceeded_timer += delta0
else:
sigterm_psi_exceeded = False
psi_term_exceeded_timer = 0
if psi_debug:
log('sigterm_psi_exceeded: {}\n'
'psi_term_exceeded_timer: {}\n'.format(
sigterm_psi_exceeded,
round(psi_term_exceeded_timer, 1)
)
)
if (psi_term_exceeded_timer >= psi_excess_duration and
psi_post_action_delay_exceeded):
mem_info = 'PSI avg ({}) > sigterm_psi_threshold ({})\n' \
'PSI avg exceeded psi_excess_duration (value' \
' = {} sec) for {} seconds'.format(
psi_avg_value,
sigterm_psi_threshold,
psi_excess_duration,
round(psi_term_exceeded_timer, 1)
)
implement_corrective_action(SIGTERM, mem_info)
psi_t0 = time()
continue
###########################################################################
if gui_low_memory_warnings:
if (mem_available <= mem_min_warnings_kb and
swap_free <= swap_min_warnings_kb + 0.1 or
mem_used_zram >= zram_max_warnings_kb):
if masf_s == 'WARN' or zram_s == 'WARN' or psi_s == 'WARN':
warn_time_delta = time() - warn_time_now
warn_time_now = time()
warn_timer += warn_time_delta
if warn_timer > min_time_between_warnings:
send_notify_warn()
warn_timer = 0
sleep_after_check_mem()

View File

@ -107,7 +107,7 @@ sigterm_psi_threshold = 60
sigkill_psi_threshold = 90
>= 0, float
psi_excess_duration = 30
psi_excess_duration = 40
psi_post_action_delay = 20
@ -289,6 +289,8 @@ swap_min_warnings = 50 %
zram_max_warnings = 40 %
psi_avg_warnings = 60
Valid values are floating-point numbers from the range [1; 300].
min_time_between_warnings = 15
@ -336,7 +338,7 @@ print_victim_info = True
print_victim_cmdline = False
max_ancestry_depth = 1
max_ancestry_depth = 5
separate_log = False

View File

@ -1,2 +1,2 @@
some avg10=29.70 avg60=51.59 avg300=22.92 total=195239452
full avg10=28.82 avg60=49.77 avg300=21.83 total=182504463
some avg10=56.70 avg60=51.59 avg300=22.92 total=195239452
full avg10=28.82 avg60=49.77 avg300=21.83 total=182504463