fix broken psi and zram
This commit is contained in:
parent
fdf2a1bebf
commit
30132b3e03
671
nohang
671
nohang
@ -815,11 +815,11 @@ def send_notify_warn():
|
||||
send_notification(title, body)
|
||||
|
||||
|
||||
def send_notify(signal, name, pid):
|
||||
def send_notify(threshold, name, pid):
|
||||
"""
|
||||
Notify about OOM prevention.
|
||||
|
||||
signal: key for notify_sig_dict
|
||||
threshold: key for notify_sig_dict
|
||||
name: str process name
|
||||
pid: str process pid
|
||||
"""
|
||||
@ -831,7 +831,7 @@ def send_notify(signal, name, pid):
|
||||
|
||||
title = 'Freeze prevention'
|
||||
body = '<b>{}</b> [{}] <b>{}</b>'.format(
|
||||
notify_sig_dict[signal],
|
||||
notify_sig_dict[threshold],
|
||||
pid,
|
||||
name.replace(
|
||||
# symbol '&' can break notifications in some themes,
|
||||
@ -1041,6 +1041,7 @@ def find_victim(_print_proc_table):
|
||||
)[0]
|
||||
|
||||
pid = pid_tuple_list[0]
|
||||
victim_id = get_victim_id(pid)
|
||||
|
||||
# Get maximum 'badness' value
|
||||
victim_badness = pid_tuple_list[1]
|
||||
@ -1062,7 +1063,7 @@ def find_victim(_print_proc_table):
|
||||
)
|
||||
)
|
||||
|
||||
return pid, victim_badness, victim_name
|
||||
return pid, victim_badness, victim_name, victim_id
|
||||
|
||||
|
||||
def find_victim_info(pid, victim_badness, name):
|
||||
@ -1271,18 +1272,6 @@ def find_victim_info(pid, victim_badness, name):
|
||||
return victim_info
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
def check_mem_swap_ex():
|
||||
"""
|
||||
Check: is mem and swap threshold exceeded?
|
||||
@ -1314,8 +1303,8 @@ def check_mem_swap_ex():
|
||||
if (mem_available <= mem_min_sigkill_kb and
|
||||
swap_free <= swap_min_sigkill_kb):
|
||||
|
||||
mem_info = 'Hard threshold exceeded\nMemory status that requ' \
|
||||
'ires corrective actions:' \
|
||||
mem_info = 'Memory status that requ' \
|
||||
'ires corrective actions (hard threshold exceeded):' \
|
||||
'\n MemAvailable [{} MiB, {} %] <= mem_min_sig' \
|
||||
'kill [{} MiB, {} %]\n SwapFree [{} MiB, {} %] <= swa' \
|
||||
'p_min_sigkill [{} MiB, {} %]'.format(
|
||||
@ -1328,13 +1317,13 @@ def check_mem_swap_ex():
|
||||
kib_to_mib(swap_min_sigkill_kb),
|
||||
swap_sigkill_pc)
|
||||
|
||||
return SIGKILL, mem_info
|
||||
return SIGKILL, mem_info, mem_available, swap_min_sigkill_kb, swap_min_sigterm_kb, swap_free, swap_total
|
||||
|
||||
if (mem_available <= mem_min_sigterm_kb and
|
||||
swap_free <= swap_min_sigterm_kb):
|
||||
|
||||
mem_info = 'Soft threshold exceeded\nMemory status that requi' \
|
||||
'res corrective actions:' \
|
||||
mem_info = 'Memory status that requi' \
|
||||
'res corrective actions (soft threshold exceeded):' \
|
||||
'\n MemAvailable [{} MiB, {} %] <= mem_min_sig' \
|
||||
'term [{} MiB, {} %]\n SwapFree [{} MiB, {} %] <= swa' \
|
||||
'p_min_sigterm [{} MiB, {} %]'.format(
|
||||
@ -1347,24 +1336,146 @@ def check_mem_swap_ex():
|
||||
kib_to_mib(swap_min_sigterm_kb),
|
||||
swap_sigterm_pc)
|
||||
|
||||
return SIGTERM, mem_info
|
||||
return SIGTERM, mem_info, mem_available, swap_min_sigkill_kb, swap_min_sigterm_kb, swap_free, swap_total
|
||||
|
||||
return None, None
|
||||
if gui_low_memory_warnings:
|
||||
|
||||
if (mem_available <= mem_min_warnings_kb and swap_free <=
|
||||
swap_min_warnings_kb + 0.1):
|
||||
return 'WARN', None, mem_available, swap_min_sigkill_kb, swap_min_sigterm_kb, swap_free, swap_total
|
||||
|
||||
return None, None, mem_available, swap_min_sigkill_kb, swap_min_sigterm_kb, swap_free, swap_total
|
||||
|
||||
|
||||
def check_zram_ex():
    """
    Check whether the memory used by zram exceeds a configured threshold.

    The current usage is read with check_zram() and compared, in order,
    with zram_max_sigkill_kb (hard threshold), zram_max_sigterm_kb (soft
    threshold) and, only when gui_low_memory_warnings is enabled,
    zram_max_warnings_kb.

    Returns a tuple (threshold, mem_info, mem_used_zram):
        threshold: SIGKILL, SIGTERM, 'WARN' or None (nothing exceeded)
        mem_info: str describing the exceeded threshold for SIGKILL and
            SIGTERM, None otherwise
        mem_used_zram: the value returned by check_zram()
            (presumably KiB, since it is passed to kib_to_mib -- confirm)
    """
    mem_used_zram = check_zram()

    # The hard threshold is checked first so that it takes precedence over
    # the soft one when both are exceeded.
    if mem_used_zram >= zram_max_sigkill_kb:

        mem_info = (
            'Memory status that requires corrective actions '
            '(hard threshold exceeded):'
            '\n MemUsedZram [{} MiB, {} %] >= zram_max_sigkill '
            '[{} MiB, {} %]'
        ).format(
            kib_to_mib(mem_used_zram),
            percent(mem_used_zram / mem_total),
            kib_to_mib(zram_max_sigkill_kb),
            percent(zram_max_sigkill_kb / mem_total))

        return SIGKILL, mem_info, mem_used_zram

    if mem_used_zram >= zram_max_sigterm_kb:

        # The threshold value is converted with kib_to_mib(), so it is
        # labelled MiB like every other field in these messages.
        mem_info = (
            'Memory status that requires corrective actions '
            '(soft threshold exceeded):'
            '\n MemUsedZram [{} MiB, {} %] >= zram_max_sigterm '
            '[{} MiB, {} %]'
        ).format(
            kib_to_mib(mem_used_zram),
            percent(mem_used_zram / mem_total),
            kib_to_mib(zram_max_sigterm_kb),
            percent(zram_max_sigterm_kb / mem_total))

        return SIGTERM, mem_info, mem_used_zram

    # Warnings are reported only when GUI low-memory warnings are enabled.
    if gui_low_memory_warnings and mem_used_zram >= zram_max_warnings_kb:
        return 'WARN', None, mem_used_zram

    return None, None, mem_used_zram
|
||||
|
||||
|
||||
def check_psi_ex(psi_t0, psi_kill_exceeded_timer, psi_term_exceeded_timer, x0):
    """
    Check whether the PSI value has exceeded a threshold for long enough.

    psi_t0: timestamp used to measure psi_post_action_delay; it is
        returned unchanged (see the note in the SIGKILL branch)
    psi_kill_exceeded_timer: seconds accumulated so far with the PSI value
        at or above sigkill_psi_threshold
    psi_term_exceeded_timer: seconds accumulated so far with the PSI value
        at or above sigterm_psi_threshold
    x0: timestamp of the previous check, used to compute the elapsed time

    Returns a tuple
    (threshold, mem_info, psi_t0, psi_kill_exceeded_timer,
     psi_term_exceeded_timer, x0):
        threshold: SIGKILL, SIGTERM, 'WARN' or None (nothing exceeded)
        mem_info: str describing the exceeded threshold for SIGKILL and
            SIGTERM, None otherwise
        the remaining items are the updated timer state for the next call
    """
    # Read the clock once: taking the delta and resetting x0 from two
    # separate time() calls loses the interval between them, so the
    # exceed timers would slightly undercount.
    now = time()
    delta0 = now - x0
    x0 = now

    psi_avg_value = find_psi_metrics_value(psi_path, psi_metrics)

    # A threshold may trigger only after psi_post_action_delay seconds
    # have passed since psi_t0.
    psi_post_action_delay_timer = time() - psi_t0
    psi_post_action_delay_exceeded = (
        psi_post_action_delay_timer >= psi_post_action_delay)

    # Accumulate the time spent above the hard threshold; any sample below
    # it resets the timer.
    if psi_avg_value >= sigkill_psi_threshold:
        sigkill_psi_exceeded = True
        psi_kill_exceeded_timer += delta0
    else:
        sigkill_psi_exceeded = False
        psi_kill_exceeded_timer = 0

    if psi_debug:

        log('psi_post_action_delay_timer: {}'.format(
            round(psi_post_action_delay_timer, 3)))

        log('psi_post_action_delay_exceeded: {}\nsigkill_psi_exceeded'
            ': {}\npsi_kill_exceeded_timer: {}'.format(
                psi_post_action_delay_exceeded,
                sigkill_psi_exceeded,
                round(psi_kill_exceeded_timer, 1)
            )
            )

    if (psi_kill_exceeded_timer >= psi_excess_duration and
            psi_post_action_delay_exceeded):

        mem_info = 'PSI avg ({}) > sigkill_psi_threshold ({})\n' \
            'PSI avg exceeded psi_excess_duration (value' \
            ' = {} sec) for {} seconds'.format(
                psi_avg_value,
                sigkill_psi_threshold,
                psi_excess_duration,
                round(psi_kill_exceeded_timer, 1)
            )

        # NOTE: psi_t0 is deliberately NOT reset here. The timer must be
        # reset only AFTER a corrective action has been applied (either
        # after any attempt or after a successful one): reset it if the
        # victim died while it was being searched for, and reset it if a
        # signal was sent.
        #
        # The soft-threshold timer is not updated on this early return.
        return (SIGKILL, mem_info, psi_t0, psi_kill_exceeded_timer,
                psi_term_exceeded_timer, x0)

    # Same accumulation logic for the soft threshold.
    if psi_avg_value >= sigterm_psi_threshold:
        sigterm_psi_exceeded = True
        psi_term_exceeded_timer += delta0
    else:
        sigterm_psi_exceeded = False
        psi_term_exceeded_timer = 0

    if psi_debug:

        log('sigterm_psi_exceeded: {}\n'
            'psi_term_exceeded_timer: {}\n'.format(
                sigterm_psi_exceeded,
                round(psi_term_exceeded_timer, 1)
            )
            )

    if (psi_term_exceeded_timer >= psi_excess_duration and
            psi_post_action_delay_exceeded):

        mem_info = 'PSI avg ({}) > sigterm_psi_threshold ({})\n' \
            'PSI avg exceeded psi_excess_duration (value' \
            ' = {} sec) for {} seconds'.format(
                psi_avg_value,
                sigterm_psi_threshold,
                psi_excess_duration,
                round(psi_term_exceeded_timer, 1)
            )

        return (SIGTERM, mem_info, psi_t0, psi_kill_exceeded_timer,
                psi_term_exceeded_timer, x0)

    # Warnings are reported only when GUI low-memory warnings are enabled.
    if gui_low_memory_warnings and psi_avg_value >= psi_avg_warnings:
        return ('WARN', None, psi_t0, psi_kill_exceeded_timer,
                psi_term_exceeded_timer, x0)

    return (None, None, psi_t0, psi_kill_exceeded_timer,
            psi_term_exceeded_timer, x0)
|
||||
|
||||
|
||||
def is_victim_alive(pid):
|
||||
@ -1384,31 +1495,26 @@ def is_victim_alive(pid):
|
||||
return 0
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
def implement_corrective_action(signal, mem_info):
|
||||
def implement_corrective_action(
|
||||
threshold,
|
||||
mem_info_list,
|
||||
psi_t0,
|
||||
psi_kill_exceeded_timer,
|
||||
psi_term_exceeded_timer,
|
||||
x0, psi_s, zram_s, zram_m, psi_m):
|
||||
"""
|
||||
Find victim with highest badness and send SIGTERM/SIGKILL
|
||||
"""
|
||||
|
||||
|
||||
|
||||
# TODO: this function is badly tangled. Untangle it: extract its parts into
# separate functions and make its structure simple and easy to follow.
|
||||
|
||||
|
||||
time0 = time() # начало корр действия. Для вычисл времени действия.
|
||||
|
||||
time0 = time() # начало корр действия. Для вычисл времени действия.
|
||||
|
||||
# выходим из фции, если для SIGTERM порога не превышено время
|
||||
# min_delay_after_sigterm и спим в течение over_sleep
|
||||
# если хард порог превышен - идем дальше.
|
||||
if signal is SIGTERM:
|
||||
if threshold is SIGTERM:
|
||||
|
||||
dt = time() - actions_time_dict['action_handled'][0]
|
||||
|
||||
@ -1422,7 +1528,7 @@ def implement_corrective_action(signal, mem_info):
|
||||
|
||||
sleep(over_sleep)
|
||||
|
||||
return None # время задержки между действиями не истекло
|
||||
return psi_t0 # время задержки между действиями не истекло
|
||||
else:
|
||||
log('min_delay_after_sigterm IS EXCEEDED, it is time to action')
|
||||
|
||||
@ -1442,60 +1548,104 @@ def implement_corrective_action(signal, mem_info):
|
||||
|
||||
"""
|
||||
|
||||
log(mem_info)
|
||||
for i in mem_info_list:
|
||||
log(i)
|
||||
|
||||
# ищем жертву с ее бэднес.
|
||||
pid, victim_badness, name = find_victim(print_proc_table)
|
||||
pid, victim_badness, name, victim_id = find_victim(print_proc_table)
|
||||
# sleep(0.1)
|
||||
|
||||
new_signal, mem_info = check_mem_swap_ex()
|
||||
|
||||
|
||||
|
||||
#log(new_signal)
|
||||
#log(mem_info)
|
||||
|
||||
if new_signal is None:
|
||||
|
||||
|
||||
|
||||
log('Recheck memory levels...')
|
||||
|
||||
|
||||
|
||||
# перепроверяем пороги: они могли измениться за время поиска жертвы
|
||||
(masf_s, masf_m, mem_available, swap_min_sigkill_kb, swap_min_sigterm_kb,
|
||||
swap_free, swap_total) = check_mem_swap_ex()
|
||||
|
||||
|
||||
if CHECK_ZRAM:
|
||||
zram_s, zram_m, mem_used_zram = check_zram_ex()
|
||||
|
||||
if CHECK_PSI:
|
||||
(psi_s, psi_m, psi_t0, psi_kill_exceeded_timer,
|
||||
psi_term_exceeded_timer, x0) = check_psi_ex(
|
||||
psi_t0,psi_kill_exceeded_timer,psi_term_exceeded_timer,x0)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
if masf_s is SIGKILL or zram_s is SIGKILL or psi_s is SIGKILL:
|
||||
|
||||
new_threshold = SIGKILL
|
||||
mem_info_list = []
|
||||
|
||||
if masf_s is SIGKILL or masf_s is SIGTERM:
|
||||
mem_info_list.append(masf_m)
|
||||
|
||||
if zram_s is SIGKILL or zram_s is SIGTERM:
|
||||
mem_info_list.append(zram_m)
|
||||
|
||||
if psi_s is SIGKILL or psi_s is SIGTERM:
|
||||
mem_info_list.append(psi_m)
|
||||
|
||||
elif masf_s is SIGTERM or zram_s is SIGTERM or psi_s is SIGTERM:
|
||||
|
||||
new_threshold = SIGTERM
|
||||
mem_info_list = []
|
||||
|
||||
if masf_s is SIGKILL or masf_s is SIGTERM:
|
||||
mem_info_list.append(masf_m)
|
||||
|
||||
if zram_s is SIGKILL or zram_s is SIGTERM:
|
||||
mem_info_list.append(zram_m)
|
||||
|
||||
if psi_s is SIGKILL or psi_s is SIGTERM:
|
||||
mem_info_list.append(psi_m)
|
||||
|
||||
else:
|
||||
log('Thresholds is not exceeded now')
|
||||
return None
|
||||
return psi_t0
|
||||
|
||||
if new_signal is not signal:
|
||||
log(mem_info)
|
||||
signal = new_signal
|
||||
|
||||
#log(mem_info)
|
||||
|
||||
|
||||
# печать порогов
|
||||
for i in mem_info_list:
|
||||
log(i)
|
||||
|
||||
# может это излишне
|
||||
if new_threshold is None or new_threshold == 'WARN':
|
||||
log('Thresholds is not exceeded now')
|
||||
return psi_t0
|
||||
|
||||
threshold = new_threshold
|
||||
|
||||
if victim_badness >= min_badness:
|
||||
|
||||
psi_t0 = time()
|
||||
|
||||
if print_victim_info:
|
||||
victim_info = find_victim_info(pid, victim_badness, name)
|
||||
log(victim_info)
|
||||
|
||||
# пороги могли превысиься за время поиска жертвы (поиск может занимать
|
||||
# сотни миллисекунд)
|
||||
mem_available, swap_total, swap_free = check_mem_and_swap()
|
||||
|
||||
ma_mib = int(mem_available) / 1024.0
|
||||
sf_mib = int(swap_free) / 1024.0
|
||||
log('Memory status before implementing a corrective act'
|
||||
'ion:\n MemAvailable'
|
||||
': {} MiB, SwapFree: {} MiB'.format(
|
||||
round(ma_mib, 1), round(sf_mib, 1)
|
||||
)
|
||||
)
|
||||
|
||||
if (mem_available <= mem_min_sigkill_kb and
|
||||
swap_free <= swap_min_sigkill_kb):
|
||||
log('Hard threshold exceeded')
|
||||
signal = SIGKILL
|
||||
|
||||
victim_id = get_victim_id(pid)
|
||||
|
||||
# kill the victim if it doesn't respond to SIGTERM В ТЕЧЕНИЕ
|
||||
# ЗАДАННОГО ВРЕМЕНИ
|
||||
|
||||
# переопределяем сигнал для старых жертв
|
||||
if signal is SIGTERM:
|
||||
if threshold is SIGTERM:
|
||||
|
||||
if victim_id in victim_dict:
|
||||
|
||||
@ -1504,12 +1654,12 @@ def implement_corrective_action(signal, mem_info):
|
||||
if dt > max_post_sigterm_victim_lifetime:
|
||||
print('max_post_sigterm_victim_lifetime exceeded: the '
|
||||
'victim will get SIGKILL')
|
||||
signal = SIGKILL
|
||||
threshold = SIGKILL
|
||||
|
||||
# matching with re to customize corrective actions
|
||||
soft_match = False
|
||||
|
||||
if soft_actions and signal is SIGTERM:
|
||||
if soft_actions and threshold is SIGTERM:
|
||||
name = pid_to_name(pid)
|
||||
cgroup_v1 = pid_to_cgroup_v1(pid)
|
||||
service = ''
|
||||
@ -1530,9 +1680,7 @@ def implement_corrective_action(signal, mem_info):
|
||||
soft_match = True
|
||||
break
|
||||
|
||||
|
||||
|
||||
if soft_match: # ПЕРЕОПРЕДЕЛЕНИЕ МЯГКОГО КОРРЕКТИРУЮЩЕГО ДЕЙСТВИЯ
|
||||
if soft_match: # ПЕРЕОПРЕДЕЛЕНИЕ МЯГКОГО КОРРЕКТИРУЮЩЕГО ДЕЙСТВИЯ
|
||||
|
||||
# todo: make new func
|
||||
m = check_mem_and_swap()
|
||||
@ -1588,16 +1736,11 @@ def implement_corrective_action(signal, mem_info):
|
||||
|
||||
# обычное действие через сигнал
|
||||
|
||||
|
||||
|
||||
# вот тут поработать. Тут ебаный цикл. Нахуй его.
|
||||
|
||||
try: # вот тут засрано, в блоке try должно быть только kill(), остальное ниже за пределами
|
||||
|
||||
|
||||
|
||||
try:
|
||||
|
||||
os.kill(int(pid), signal)
|
||||
os.kill(int(pid), threshold)
|
||||
kill_timestamp = time()
|
||||
response_time = kill_timestamp - time0
|
||||
|
||||
@ -1625,10 +1768,7 @@ def implement_corrective_action(signal, mem_info):
|
||||
log('Process exited (VmRSS = 0) in {} sec'.format(
|
||||
round(dt, 5)))
|
||||
|
||||
|
||||
|
||||
|
||||
if signal is SIGKILL or victim_alive == 2:
|
||||
if threshold is SIGKILL or victim_alive == 2:
|
||||
# жертва умирает от SIGKILL. Дожидаемся ее полной смерти.
|
||||
|
||||
while True:
|
||||
@ -1641,8 +1781,7 @@ def implement_corrective_action(signal, mem_info):
|
||||
log('The victim died in {} sec'.format(
|
||||
round(kill_duration, 3)))
|
||||
|
||||
|
||||
|
||||
"""
|
||||
mem_available, swap_total, swap_free = check_mem_and_swap()
|
||||
|
||||
ma_mib = int(mem_available) / 1024.0
|
||||
@ -1653,17 +1792,18 @@ def implement_corrective_action(signal, mem_info):
|
||||
round(ma_mib, 1), round(sf_mib, 1)
|
||||
)
|
||||
)
|
||||
"""
|
||||
|
||||
send_result = 'total response time: {} ms'.format(
|
||||
round(response_time * 1000))
|
||||
|
||||
preventing_oom_message = 'Implement a corrective action:' \
|
||||
'\n Send {} to the victim; {}'.format(
|
||||
sig_dict[signal], send_result)
|
||||
sig_dict[threshold], send_result)
|
||||
|
||||
key = 'Send {} to {}'.format(sig_dict[signal], name)
|
||||
key = 'Send {} to {}'.format(sig_dict[threshold], name)
|
||||
|
||||
if signal is SIGKILL and post_kill_exe != '':
|
||||
if threshold is SIGKILL and post_kill_exe != '':
|
||||
|
||||
cmd = post_kill_exe.replace('$PID', pid).replace(
|
||||
'$NAME', pid_to_name(pid))
|
||||
@ -1673,7 +1813,7 @@ def implement_corrective_action(signal, mem_info):
|
||||
exe(cmd)
|
||||
|
||||
if gui_notifications:
|
||||
send_notify(signal, name, pid)
|
||||
send_notify(threshold, name, pid)
|
||||
|
||||
except FileNotFoundError:
|
||||
response_time = time() - time0
|
||||
@ -1696,10 +1836,11 @@ def implement_corrective_action(signal, mem_info):
|
||||
|
||||
update_stat_dict_and_print(key)
|
||||
|
||||
|
||||
# нехуй делать, бэднес жертвы слишком мал
|
||||
else:
|
||||
|
||||
# может эту часть наверх отправить через if
|
||||
|
||||
response_time = time() - time0
|
||||
victim_badness_is_too_small = 'victim badness {} < min_b' \
|
||||
'adness {}; nothing to do; response time: {} ms'.format(
|
||||
@ -1724,21 +1865,13 @@ def implement_corrective_action(signal, mem_info):
|
||||
|
||||
print('##################################################################')
|
||||
|
||||
sleep(over_sleep) # Спать если бэднес жертвы мал
|
||||
|
||||
# Что делать с psi_t0 если у мертвы мал бэднес? Ничего, потому что кор действия не было.
|
||||
# демон может жрать 10% цпу при этом. Можно отдельн парам ввести. А можно
|
||||
# не вводить. кек
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
return psi_t0
|
||||
|
||||
|
||||
def sleep_after_check_mem():
|
||||
@ -1802,8 +1935,8 @@ def sleep_after_check_mem():
|
||||
|
||||
log(
|
||||
'Sleep {} sec (t_mem={}, t_swap={}{})'.format(
|
||||
round(t, 2),round(t_mem, 2),round(t_swap, 2), z)
|
||||
)
|
||||
round(t, 2), round(t_mem, 2), round(t_swap, 2), z)
|
||||
)
|
||||
|
||||
try:
|
||||
stdout.flush()
|
||||
@ -1874,27 +2007,9 @@ def calculate_percent(arg_key):
|
||||
return mem_min_kb, mem_min_mb, mem_min_percent
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
##########################################################################
|
||||
|
||||
|
||||
|
||||
|
||||
victim_dict = dict()
|
||||
victim_id = None
|
||||
actions_time_dict = dict()
|
||||
@ -1902,31 +2017,6 @@ actions_time_dict['action_handled'] = [time(), victim_id]
|
||||
# print(actions_time_dict)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
start_time = time()
|
||||
|
||||
|
||||
@ -1974,17 +2064,6 @@ else:
|
||||
notify_helper_path = '/usr/sbin/nohang_notify_helper'
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# will store corrective actions stat
|
||||
stat_dict = dict()
|
||||
|
||||
@ -2319,8 +2398,6 @@ print_victim_info = conf_parse_bool('print_victim_info')
|
||||
print_victim_cmdline = conf_parse_bool('print_victim_cmdline')
|
||||
|
||||
|
||||
|
||||
|
||||
print_config = conf_parse_bool('print_config')
|
||||
print_mem_check_results = conf_parse_bool('print_mem_check_results')
|
||||
print_sleep_periods = conf_parse_bool('print_sleep_periods')
|
||||
@ -2459,6 +2536,20 @@ else:
|
||||
exit(1)
|
||||
|
||||
|
||||
if 'psi_avg_warnings' in config_dict:
|
||||
psi_avg_warnings = string_to_float_convert_test(
|
||||
config_dict['psi_avg_warnings'])
|
||||
if psi_avg_warnings is None:
|
||||
errprint('Invalid psi_avg_warnings value, not float\nExit')
|
||||
exit(1)
|
||||
if psi_avg_warnings < 0 or psi_avg_warnings > 100:
|
||||
errprint('psi_avg_warnings must be in the range [0; 100]\nExit')
|
||||
exit(1)
|
||||
else:
|
||||
errprint('psi_avg_warnings not in config\nExit')
|
||||
exit(1)
|
||||
|
||||
|
||||
if 'min_badness' in config_dict:
|
||||
min_badness = string_to_int_convert_test(
|
||||
config_dict['min_badness'])
|
||||
@ -2876,9 +2967,6 @@ psi_avg_string = '' # will be overwritten if PSI monitoring enabled
|
||||
|
||||
mem_used_zram = 0
|
||||
|
||||
if psi_support and not ignore_psi:
|
||||
psi_t0 = time()
|
||||
|
||||
|
||||
if print_mem_check_results:
|
||||
|
||||
@ -2895,10 +2983,25 @@ for i in sig_list:
|
||||
signal(i, signal_handler)
|
||||
|
||||
|
||||
x0 = time()
|
||||
delta0 = 0
|
||||
|
||||
|
||||
threshold = None
|
||||
mem_info = None
|
||||
|
||||
|
||||
#print(x0, 'x0')
|
||||
|
||||
CHECK_PSI = False
|
||||
if psi_support and not ignore_psi:
|
||||
CHECK_PSI = True
|
||||
|
||||
psi_kill_exceeded_timer = 0
|
||||
psi_term_exceeded_timer = 0
|
||||
psi_t0 = time()
|
||||
psi_s = zram_s = zram_m = psi_m = None
|
||||
|
||||
|
||||
CHECK_ZRAM = not ignore_zram
|
||||
|
||||
@ -2907,58 +3010,36 @@ log('Monitoring has started!')
|
||||
stdout.flush()
|
||||
|
||||
|
||||
psi_kill_exceeded_timer = psi_term_exceeded_timer = delta0 = 0
|
||||
|
||||
x0 = time()
|
||||
|
||||
##########################################################################
|
||||
|
||||
|
||||
while True:
|
||||
|
||||
delta0 = time() - x0
|
||||
x0 = time()
|
||||
|
||||
# FIND VALUES: mem, swap, zram, psi
|
||||
|
||||
mem_available, swap_total, swap_free = check_mem_and_swap()
|
||||
|
||||
# if swap_min_sigkill is set in percent
|
||||
if swap_kill_is_percent:
|
||||
swap_min_sigkill_kb = swap_total * swap_min_sigkill_percent / 100.0
|
||||
|
||||
if swap_term_is_percent:
|
||||
swap_min_sigterm_kb = swap_total * swap_min_sigterm_percent / 100.0
|
||||
|
||||
if swap_warn_is_percent:
|
||||
swap_min_warnings_kb = swap_total * swap_min_warnings_percent / 100.0
|
||||
|
||||
if swap_total > swap_min_sigkill_kb:
|
||||
swap_sigkill_pc = percent(swap_min_sigkill_kb / (swap_total + 0.1))
|
||||
else:
|
||||
swap_sigkill_pc = '-'
|
||||
|
||||
if swap_total > swap_min_sigterm_kb:
|
||||
swap_sigterm_pc = percent(swap_min_sigterm_kb / (swap_total + 0.1))
|
||||
else:
|
||||
swap_sigterm_pc = '-'
|
||||
(masf_s, masf_m, mem_available, swap_min_sigkill_kb, swap_min_sigterm_kb,
|
||||
swap_free, swap_total) = check_mem_swap_ex()
|
||||
|
||||
if CHECK_ZRAM:
|
||||
mem_used_zram = check_zram()
|
||||
zram_s, zram_m, mem_used_zram = check_zram_ex()
|
||||
|
||||
if CHECK_PSI:
|
||||
psi_avg_value = find_psi_metrics_value(psi_path, psi_metrics)
|
||||
if time() - psi_t0 >= psi_post_action_delay:
|
||||
psi_post_action_delay_exceeded = True
|
||||
else:
|
||||
psi_post_action_delay_exceeded = False
|
||||
(psi_s, psi_m, psi_t0, psi_kill_exceeded_timer,
|
||||
psi_term_exceeded_timer, x0) = check_psi_ex(
|
||||
psi_t0,psi_kill_exceeded_timer,psi_term_exceeded_timer,x0)
|
||||
|
||||
if print_mem_check_results:
|
||||
psi_avg_string = 'PSI avg: {} | '.format(
|
||||
str(psi_avg_value).rjust(6))
|
||||
|
||||
if print_mem_check_results:
|
||||
|
||||
if CHECK_PSI:
|
||||
psi_avg_value = find_psi_metrics_value(psi_path, psi_metrics)
|
||||
if time() - psi_t0 >= psi_post_action_delay:
|
||||
psi_post_action_delay_exceeded = True
|
||||
else:
|
||||
psi_post_action_delay_exceeded = False
|
||||
|
||||
if print_mem_check_results:
|
||||
psi_avg_string = 'PSI avg: {} | '.format(
|
||||
str(psi_avg_value).rjust(6))
|
||||
|
||||
wt1 = time()
|
||||
|
||||
delta = (mem_available + swap_free) - new_mem
|
||||
@ -3024,168 +3105,64 @@ while True:
|
||||
)
|
||||
)
|
||||
|
||||
###########################################################################
|
||||
|
||||
# CHECK HARD THRESHOLDS (SIGKILL LEVEL)
|
||||
if masf_s is SIGKILL or zram_s is SIGKILL or psi_s is SIGKILL:
|
||||
|
||||
if (mem_available <= mem_min_sigkill_kb and
|
||||
swap_free <= swap_min_sigkill_kb):
|
||||
threshold = SIGKILL
|
||||
mem_info_list = []
|
||||
|
||||
mem_info = 'Hard threshold exceeded\nMemory status that requ' \
|
||||
'ires corrective actions:' \
|
||||
'\n MemAvailable [{} MiB, {} %] <= mem_min_sig' \
|
||||
'kill [{} MiB, {} %]\n SwapFree [{} MiB, {} %] <= swa' \
|
||||
'p_min_sigkill [{} MiB, {} %]'.format(
|
||||
kib_to_mib(mem_available),
|
||||
percent(mem_available / mem_total),
|
||||
kib_to_mib(mem_min_sigkill_kb),
|
||||
percent(mem_min_sigkill_kb / mem_total),
|
||||
kib_to_mib(swap_free),
|
||||
percent(swap_free / (swap_total + 0.1)),
|
||||
kib_to_mib(swap_min_sigkill_kb),
|
||||
swap_sigkill_pc)
|
||||
if masf_m is not None:
|
||||
mem_info_list.append(masf_m)
|
||||
|
||||
implement_corrective_action(SIGKILL, mem_info)
|
||||
psi_t0 = time()
|
||||
if zram_m is not None:
|
||||
mem_info_list.append(zram_m)
|
||||
|
||||
if psi_m is not None:
|
||||
mem_info_list.append(psi_m)
|
||||
|
||||
psi_t0 = implement_corrective_action(
|
||||
threshold,
|
||||
mem_info_list,
|
||||
psi_t0,
|
||||
psi_kill_exceeded_timer,
|
||||
psi_term_exceeded_timer,
|
||||
x0, psi_s, zram_s, zram_m, psi_m)
|
||||
continue
|
||||
|
||||
if CHECK_ZRAM:
|
||||
if mem_used_zram >= zram_max_sigkill_kb:
|
||||
if masf_s is SIGTERM or zram_s is SIGTERM or psi_s is SIGTERM:
|
||||
|
||||
mem_info = 'Hard threshold exceeded\nMemory status that requir' \
|
||||
'es corrective actions:' \
|
||||
'\n MemUsedZram [{} MiB, {} %] >= zram_max_sig' \
|
||||
'kill [{} MiB, {} %]'.format(
|
||||
kib_to_mib(mem_used_zram),
|
||||
percent(mem_used_zram / mem_total),
|
||||
kib_to_mib(zram_max_sigkill_kb),
|
||||
percent(zram_max_sigkill_kb / mem_total))
|
||||
threshold = SIGTERM
|
||||
mem_info_list = []
|
||||
|
||||
implement_corrective_action(SIGKILL, mem_info)
|
||||
psi_t0 = time()
|
||||
continue
|
||||
if masf_m is not None:
|
||||
mem_info_list.append(masf_m)
|
||||
|
||||
if CHECK_PSI:
|
||||
if zram_m is not None:
|
||||
mem_info_list.append(zram_m)
|
||||
|
||||
if psi_avg_value >= sigkill_psi_threshold:
|
||||
sigkill_psi_exceeded = True
|
||||
psi_kill_exceeded_timer += delta0
|
||||
else:
|
||||
sigkill_psi_exceeded = False
|
||||
psi_kill_exceeded_timer = 0
|
||||
if psi_m is not None:
|
||||
mem_info_list.append(psi_m)
|
||||
|
||||
if psi_debug:
|
||||
|
||||
log('psi_post_action_delay_exceeded: {}\nsigkill_psi_exceeded'
|
||||
': {}\npsi_kill_exceeded_timer: {}'.format(
|
||||
psi_post_action_delay_exceeded,
|
||||
sigkill_psi_exceeded,
|
||||
round(psi_kill_exceeded_timer, 1)
|
||||
)
|
||||
)
|
||||
|
||||
if (psi_kill_exceeded_timer >= psi_excess_duration and
|
||||
psi_post_action_delay_exceeded):
|
||||
|
||||
mem_info = 'PSI avg ({}) > sigkill_psi_threshold ({})\n' \
|
||||
'PSI avg exceeded psi_excess_duration (value' \
|
||||
' = {} sec) for {} seconds'.format(
|
||||
psi_avg_value,
|
||||
sigkill_psi_threshold,
|
||||
psi_excess_duration,
|
||||
round(psi_kill_exceeded_timer, 1)
|
||||
)
|
||||
|
||||
implement_corrective_action(SIGKILL, mem_info)
|
||||
psi_t0 = time()
|
||||
continue
|
||||
|
||||
###########################################################################
|
||||
|
||||
# CHECK SOFT THRESHOLDS (SIGTERM LEVEL)
|
||||
|
||||
if (mem_available <= mem_min_sigterm_kb and
|
||||
swap_free <= swap_min_sigterm_kb):
|
||||
|
||||
mem_info = 'Soft threshold exceeded\nMemory status that requi' \
|
||||
'res corrective actions:' \
|
||||
'\n MemAvailable [{} MiB, {} %] <= mem_min_sig' \
|
||||
'term [{} MiB, {} %]\n SwapFree [{} MiB, {} %] <= swa' \
|
||||
'p_min_sigterm [{} MiB, {} %]'.format(
|
||||
kib_to_mib(mem_available),
|
||||
percent(mem_available / mem_total),
|
||||
kib_to_mib(mem_min_sigterm_kb),
|
||||
round(mem_min_sigterm_percent, 1),
|
||||
kib_to_mib(swap_free),
|
||||
percent(swap_free / (swap_total + 0.1)),
|
||||
kib_to_mib(swap_min_sigterm_kb),
|
||||
swap_sigterm_pc)
|
||||
|
||||
implement_corrective_action(SIGTERM, mem_info)
|
||||
psi_t0 = time()
|
||||
psi_t0 = implement_corrective_action(
|
||||
threshold,
|
||||
mem_info_list,
|
||||
psi_t0,
|
||||
psi_kill_exceeded_timer,
|
||||
psi_term_exceeded_timer,
|
||||
x0, psi_s, zram_s, zram_m, psi_m)
|
||||
continue
|
||||
|
||||
if CHECK_ZRAM:
|
||||
if mem_used_zram >= zram_max_sigterm_kb:
|
||||
|
||||
mem_info = 'Soft threshold exceeded\nMemory status that require' \
|
||||
's corrective actions:\n MemUsedZram [{} MiB, {} %] >= zra' \
|
||||
'm_max_sigterm [{} M, {} %]'.format(
|
||||
kib_to_mib(mem_used_zram),
|
||||
percent(mem_used_zram / mem_total),
|
||||
kib_to_mib(zram_max_sigterm_kb),
|
||||
percent(zram_max_sigterm_kb / mem_total))
|
||||
|
||||
implement_corrective_action(SIGTERM, mem_info)
|
||||
psi_t0 = time()
|
||||
continue
|
||||
|
||||
if CHECK_PSI:
|
||||
if psi_avg_value >= sigterm_psi_threshold:
|
||||
sigterm_psi_exceeded = True
|
||||
psi_term_exceeded_timer += delta0
|
||||
else:
|
||||
sigterm_psi_exceeded = False
|
||||
psi_term_exceeded_timer = 0
|
||||
|
||||
if psi_debug:
|
||||
|
||||
log('sigterm_psi_exceeded: {}\n'
|
||||
'psi_term_exceeded_timer: {}\n'.format(
|
||||
sigterm_psi_exceeded,
|
||||
round(psi_term_exceeded_timer, 1)
|
||||
)
|
||||
)
|
||||
|
||||
if (psi_term_exceeded_timer >= psi_excess_duration and
|
||||
psi_post_action_delay_exceeded):
|
||||
|
||||
mem_info = 'PSI avg ({}) > sigterm_psi_threshold ({})\n' \
|
||||
'PSI avg exceeded psi_excess_duration (value' \
|
||||
' = {} sec) for {} seconds'.format(
|
||||
psi_avg_value,
|
||||
sigterm_psi_threshold,
|
||||
psi_excess_duration,
|
||||
round(psi_term_exceeded_timer, 1)
|
||||
)
|
||||
|
||||
implement_corrective_action(SIGTERM, mem_info)
|
||||
psi_t0 = time()
|
||||
continue
|
||||
|
||||
###########################################################################
|
||||
|
||||
if gui_low_memory_warnings:
|
||||
|
||||
if (mem_available <= mem_min_warnings_kb and
|
||||
swap_free <= swap_min_warnings_kb + 0.1 or
|
||||
mem_used_zram >= zram_max_warnings_kb):
|
||||
if masf_s == 'WARN' or zram_s == 'WARN' or psi_s == 'WARN':
|
||||
|
||||
warn_time_delta = time() - warn_time_now
|
||||
warn_time_now = time()
|
||||
warn_timer += warn_time_delta
|
||||
if warn_timer > min_time_between_warnings:
|
||||
|
||||
send_notify_warn()
|
||||
|
||||
warn_timer = 0
|
||||
|
||||
sleep_after_check_mem()
|
||||
|
@ -107,7 +107,7 @@ sigterm_psi_threshold = 60
|
||||
sigkill_psi_threshold = 90
|
||||
|
||||
>= 0, float
|
||||
psi_excess_duration = 30
|
||||
psi_excess_duration = 40
|
||||
|
||||
psi_post_action_delay = 20
|
||||
|
||||
@ -289,6 +289,8 @@ swap_min_warnings = 50 %
|
||||
|
||||
zram_max_warnings = 40 %
|
||||
|
||||
psi_avg_warnings = 60
|
||||
|
||||
Valid values are floating-point numbers from the range [1; 300].
|
||||
|
||||
min_time_between_warnings = 15
|
||||
@ -336,7 +338,7 @@ print_victim_info = True
|
||||
|
||||
print_victim_cmdline = False
|
||||
|
||||
max_ancestry_depth = 1
|
||||
max_ancestry_depth = 5
|
||||
|
||||
separate_log = False
|
||||
|
||||
|
@ -1,2 +1,2 @@
|
||||
some avg10=29.70 avg60=51.59 avg300=22.92 total=195239452
|
||||
full avg10=28.82 avg60=49.77 avg300=21.83 total=182504463
|
||||
some avg10=56.70 avg60=51.59 avg300=22.92 total=195239452
|
||||
full avg10=28.82 avg60=49.77 avg300=21.83 total=182504463
|
||||
|
Loading…
Reference in New Issue
Block a user