do not check zram by default

This commit is contained in:
Alexey Avramov 2019-05-21 12:25:31 +09:00
parent 77da0efb9a
commit 944c13be7e
3 changed files with 332 additions and 391 deletions

595
nohang
View File

@ -11,92 +11,6 @@ from sre_constants import error as invalid_re
from signal import signal, SIGKILL, SIGTERM, SIGINT, SIGQUIT, SIGHUP from signal import signal, SIGKILL, SIGTERM, SIGINT, SIGQUIT, SIGHUP
start_time = time()
help_mess = """usage: nohang [-h] [-v] [-t] [-p] [-c CONFIG]
optional arguments:
-h, --help show this help message and exit
-v, --version print version
-t, --test print some tests
-p, --print-proc-table
print table of processes with their badness values
-c CONFIG, --config CONFIG
path to the config file, default values:
./nohang.conf, /etc/nohang/nohang.conf"""
# System configuration values queried once at start-up.
SC_CLK_TCK = os.sysconf(os.sysconf_names['SC_CLK_TCK'])
SC_PAGESIZE = os.sysconf(os.sysconf_names['SC_PAGESIZE'])

# Message used when the config file is rejected.
conf_err_mess = 'Invalid config. Exit.'

# Signals listed for handling, and printable names for the signals in use.
sig_list = [SIGTERM, SIGINT, SIGQUIT, SIGHUP]
sig_dict = {
    SIGKILL: 'SIGKILL',
    SIGINT: 'SIGINT',
    SIGQUIT: 'SIGQUIT',
    SIGHUP: 'SIGHUP',
    SIGTERM: 'SIGTERM',
}

# Identity of the current process; root is True when the effective UID is 0.
self_pid = str(os.getpid())
self_uid = os.geteuid()
root = self_uid == 0

# Prefer a notify helper in the working directory over the installed one.
notify_helper_path = (
    './nohang_notify_helper'
    if os.path.exists('./nohang_notify_helper')
    else '/usr/sbin/nohang_notify_helper'
)

# Bookkeeping for victims and for the last handled action
# (stored as [timestamp, victim_id]).
victim_dict = {}
victim_id = None
actions_time_dict = {'action_handled': [time(), victim_id]}

# will store corrective actions stat
stat_dict = {}

separate_log = False  # will be overwritten after parse config
def find_cgroup_indexes():
    """Locate the cgroup v1 and v2 lines in /proc/self/cgroup.

    Returns a tuple (cgroup_v1_index, cgroup_v2_index): the zero-based
    position of the last line containing ':name=' and of the last line
    starting with '0::'. An index is None when no such line exists.
    """
    with open('/proc/self/cgroup') as cgroup_file:
        entries = cgroup_file.readlines()

    v1_position = None
    v2_position = None
    for position, entry in enumerate(entries):
        if ':name=' in entry:
            v1_position = position
        if entry.startswith('0::'):
            v2_position = position
    return v1_position, v2_position
cgroup_v1_index, cgroup_v2_index = find_cgroup_indexes()
########################################################################## ##########################################################################
# define functions # define functions
@ -115,7 +29,62 @@ def print_self_rss():
''' '''
def get_swap_threshold_tuple(string):
    """Parse a swap threshold string taken from the config.

    Returns a tuple (value, is_percent):
    - string ending with '%': (percent as float, True); the number must
      be in the range [0; 100];
    - string ending with 'M': (number multiplied by 1024, i.e. MiB
      converted to KiB, False); the number must not be negative.

    If the unit is unknown, if string_to_float_convert_test() returns
    None for the numeric part, or if the number is out of range, an error
    is reported via errprint() and exit(1) is called.
    """
    is_percent = string.endswith('%')

    # Reject everything that is neither a percentage nor a MiB value.
    if not is_percent and not string.endswith('M'):
        errprint(
            'Invalid config file. There are invalid units somewhere\nExit')
        exit(1)

    # Everything before the one-character unit suffix.
    number = string[:-1]

    if is_percent:
        if string_to_float_convert_test(number) is None:
            errprint('somewhere swap unit is not float_%')
            exit(1)
        percent_value = float(number.strip())
        if percent_value > 100 or percent_value < 0:
            errprint('invalid value, must be from the range[0; 100] %')
            exit(1)
        return percent_value, True

    if string_to_float_convert_test(number) is None:
        errprint('somewhere swap unit is not float_M')
        exit(1)
    kib_value = 1024 * float(number.strip())
    if kib_value < 0:
        errprint('invalid unit in config (negative value)')
        exit(1)
    return kib_value, False
def find_cgroup_indexes():
    """Find cgroup-line positions in /proc/self/cgroup.

    Scans the file line by line and remembers the zero-based index of the
    last line containing ':name=' (cgroup v1) and of the last line that
    starts with '0::' (cgroup v2). Returns them as a tuple
    (cgroup_v1_index, cgroup_v2_index); a missing line gives None.
    """
    found = {'v1': None, 'v2': None}
    with open('/proc/self/cgroup') as cgroup_file:
        for line_number, text in enumerate(cgroup_file):
            if text.startswith('0::'):
                found['v2'] = line_number
            if ':name=' in text:
                found['v1'] = line_number
    return found['v1'], found['v2']
def pid_to_rss(pid): def pid_to_rss(pid):
"""
"""
try: try:
rss = int(rline1( rss = int(rline1(
'/proc/{}/statm'.format(pid)).split(' ')[1]) * SC_PAGESIZE '/proc/{}/statm'.format(pid)).split(' ')[1]) * SC_PAGESIZE
@ -129,6 +98,8 @@ def pid_to_rss(pid):
def pid_to_vm_size(pid): def pid_to_vm_size(pid):
"""
"""
try: try:
vm_size = int(rline1( vm_size = int(rline1(
'/proc/{}/statm'.format(pid)).partition(' ')[0]) * SC_PAGESIZE '/proc/{}/statm'.format(pid)).partition(' ')[0]) * SC_PAGESIZE
@ -141,12 +112,6 @@ def pid_to_vm_size(pid):
return vm_size return vm_size
def signal_handler(signum, frame): def signal_handler(signum, frame):
""" """
""" """
@ -193,13 +158,6 @@ def write_self_oom_score_adj(new_value):
write('/proc/self/oom_score_adj', new_value) write('/proc/self/oom_score_adj', new_value)
self_oom_score_adj_min = '-600'
self_oom_score_adj_max = '-6'
write_self_oom_score_adj(self_oom_score_adj_min)
def valid_re(reg_exp): def valid_re(reg_exp):
"""Validate regular expression. """Validate regular expression.
""" """
@ -431,6 +389,8 @@ def pid_to_environ(pid):
def pid_to_realpath(pid): def pid_to_realpath(pid):
"""
"""
try: try:
return os.path.realpath('/proc/' + pid + '/exe') return os.path.realpath('/proc/' + pid + '/exe')
except FileNotFoundError: except FileNotFoundError:
@ -615,9 +575,6 @@ def pid_to_status_unicode(pid):
return None return None
##########################################################################
def uptime(): def uptime():
""" """
""" """
@ -993,9 +950,6 @@ def get_pid_list():
return pid_list return pid_list
pid_list = get_pid_list()
def get_non_decimal_pids(): def get_non_decimal_pids():
""" """
""" """
@ -1362,23 +1316,14 @@ def find_victim_info(pid, victim_badness, name):
return victim_info return victim_info
def implement_corrective_action(signal): def implement_corrective_action(signal):
""" """
Find victim with highest badness and send SIGTERM/SIGKILL Find victim with highest badness and send SIGTERM/SIGKILL
""" """
time0 = time()
# выходим из фции, если для SIGTERM порога не превышено время
# выходим из фции, если для SIGTERM порога не превышено время min_delay_after_sigterm и спим в течение over_sleep # min_delay_after_sigterm и спим в течение over_sleep
if signal is SIGTERM: if signal is SIGTERM:
dt = time() - actions_time_dict['action_handled'][0] dt = time() - actions_time_dict['action_handled'][0]
@ -1397,13 +1342,12 @@ def implement_corrective_action(signal):
else: else:
print('min_delay_after_sigterm IS EXCEEDED, it is time to action') print('min_delay_after_sigterm IS EXCEEDED, it is time to action')
""" """
При заходе в фцию проверяем права на сигтерм. Права на сигкилл у нас всегда есть При заходе в фцию проверяем права на сигтерм. Права на сигкилл у нас
(потому что идем дальше только после полн освободж памяти после смерти жертвы) всегда есть
(потому что идем дальше только после полн освободж памяти после
смерти жертвы)
actions_time_dict[action_handled] = time() actions_time_dict[action_handled] = time()
actions_time_dict[veto] = True actions_time_dict[veto] = True
@ -1414,7 +1358,6 @@ def implement_corrective_action(signal):
""" """
log(mem_info) log(mem_info)
pid, victim_badness, name = find_victim(print_proc_table) pid, victim_badness, name = find_victim(print_proc_table)
@ -1425,10 +1368,8 @@ def implement_corrective_action(signal):
victim_info = find_victim_info(pid, victim_badness, name) victim_info = find_victim_info(pid, victim_badness, name)
log(victim_info) log(victim_info)
# пороги могли превысиься за время поиска жертвы (поиск может занимать
# сотни миллисекунд)
# пороги могли превысиься за время поиска жертвы (поиск может занимать сотни миллисекунд)
mem_available, swap_total, swap_free = check_mem_and_swap() mem_available, swap_total, swap_free = check_mem_and_swap()
ma_mib = int(mem_available) / 1024.0 ma_mib = int(mem_available) / 1024.0
@ -1445,15 +1386,8 @@ def implement_corrective_action(signal):
log('Hard threshold exceeded') log('Hard threshold exceeded')
signal = SIGKILL signal = SIGKILL
victim_id = get_victim_id(pid) victim_id = get_victim_id(pid)
# kill the victim if it doesn't respond to SIGTERM В ТЕЧЕНИЕ # kill the victim if it doesn't respond to SIGTERM В ТЕЧЕНИЕ
# ЗАДАННОГО ВРЕМЕНИ # ЗАДАННОГО ВРЕМЕНИ
@ -1465,19 +1399,10 @@ def implement_corrective_action(signal):
dt = time() - victim_dict[victim_id] dt = time() - victim_dict[victim_id]
if dt > max_post_sigterm_victim_lifetime: if dt > max_post_sigterm_victim_lifetime:
print('max_post_sigterm_victim_lifetime exceeded: the victim will get SIGKILL') print('max_post_sigterm_victim_lifetime exceeded: the '
'victim will get SIGKILL')
signal = SIGKILL signal = SIGKILL
# matching with re to customize corrective actions # matching with re to customize corrective actions
soft_match = False soft_match = False
@ -1529,8 +1454,10 @@ def implement_corrective_action(signal):
response_time = time() - time0 response_time = time() - time0
# тут надо, как и при дефолтном действии, проверять существование жертвы, ее реакцию на действие, # тут надо, как и при дефолтном действии, проверять существование
# и время ее смерти в случае успеха, о обновление таймстемпов действия # жертвы, ее реакцию на действие,
# и время ее смерти в случае успеха, о обновление таймстемпов
# действия
etc_info = 'Implement a corrective act' \ etc_info = 'Implement a corrective act' \
'ion:\n Run the command: {}' \ 'ion:\n Run the command: {}' \
@ -1552,71 +1479,49 @@ def implement_corrective_action(signal):
command.replace('$PID', pid).replace( command.replace('$PID', pid).replace(
'$NAME', pid_to_name(pid))) '$NAME', pid_to_name(pid)))
else: else:
# обычное действие через сигнал # обычное действие через сигнал
try: try:
os.kill(int(pid), signal) os.kill(int(pid), signal)
kill_timestamp = time() kill_timestamp = time()
response_time = kill_timestamp - time0 response_time = kill_timestamp - time0
while True: while True:
exe_exists = os.path.exists('/proc/{}/exe'.format(pid)) exe_exists = os.path.exists('/proc/{}/exe'.format(pid))
rss = pid_to_rss(pid) rss = pid_to_rss(pid)
dt = time() - kill_timestamp dt = time() - kill_timestamp
log('Victim VmRSS: {} KiB'.format(rss)) log('Victim VmRSS: {} KiB'.format(rss))
if not exe_exists or rss == 0 or dt > 0.01: if not exe_exists or rss == 0 or dt > 0.01:
#print(dt) # print(dt)
break break
sleep(0.001) sleep(0.001)
if dt > 0.01: if dt > 0.01:
log('Timer (value = 0.01 sec) expired; seems' \ log('Timer (value = 0.01 sec) expired; seems'
' like the victim handles signal') ' like the victim handles signal')
actions_time_dict['action_handled'] = [time(), get_victim_id(pid)] actions_time_dict['action_handled'] = [
time(), get_victim_id(pid)]
if victim_id not in victim_dict: # хз как надо. if victim_id not in victim_dict: # хз как надо.
victim_dict.update({victim_id: time()}) victim_dict.update({victim_id: time()})
# log('actions_time_dict', actions_time_dict) # log('actions_time_dict', actions_time_dict)
# log('victim_dict', victim_dict) # log('victim_dict', victim_dict)
else: else:
log('Process exited (VmRSS = 0) in {} sec'.format( log('Process exited (VmRSS = 0) in {} sec'.format(
round(dt, 5))) round(dt, 5)))
if signal is SIGKILL or not exe_exists or rss == 0: if signal is SIGKILL or not exe_exists or rss == 0:
while True: while True:
sleep(0.001) sleep(0.001)
rss = pid_to_rss(pid) # рсс не важен когда путь не существует. Проверяй просто существование пид. # рсс не важен когда путь не существует. Проверяй
# просто существование пид.
rss = pid_to_rss(pid)
if rss is None: if rss is None:
break break
t1 = time() t1 = time()
@ -1624,7 +1529,6 @@ def implement_corrective_action(signal):
log('The victim died in {} sec'.format( log('The victim died in {} sec'.format(
round(kill_duration, 3))) round(kill_duration, 3)))
mem_available, swap_total, swap_free = check_mem_and_swap() mem_available, swap_total, swap_free = check_mem_and_swap()
ma_mib = int(mem_available) / 1024.0 ma_mib = int(mem_available) / 1024.0
@ -1636,12 +1540,6 @@ def implement_corrective_action(signal):
) )
) )
send_result = 'total response time: {} ms'.format( send_result = 'total response time: {} ms'.format(
round(response_time * 1000)) round(response_time * 1000))
@ -1700,11 +1598,12 @@ def implement_corrective_action(signal):
update_stat_dict_and_print(key) update_stat_dict_and_print(key)
# тут надо поспать хорошенько. а может и счетчики поправить. # тут надо поспать хорошенько. а может и счетчики поправить.
# херню несу. во-первых, внезапно может кто-то появиться c блльшим бэднес.. Далее надо минимизировать аутпут спам. # херню несу. во-первых, внезапно может кто-то появиться c блльшим
# бэднес.. Далее надо минимизировать аутпут спам.
sleep(over_sleep) sleep(over_sleep)
# обновлять время не на каждый кил, а только на килл той жертвы,
# обновлять время не на каждый кил, а только на килл той жертвы, которая не отвечала на софт экшн. # которая не отвечала на софт экшн.
# Вывод: ко времени действия прилагать также виктим айди. # Вывод: ко времени действия прилагать также виктим айди.
print('##################################################################') print('##################################################################')
@ -1739,17 +1638,23 @@ def sleep_after_check_mem():
t_mem = mem_point / rate_mem t_mem = mem_point / rate_mem
t_swap = swap_point / rate_swap t_swap = swap_point / rate_swap
if CHECK_ZRAM:
t_zram = (mem_total * 0.8 - mem_used_zram) / rate_zram t_zram = (mem_total * 0.8 - mem_used_zram) / rate_zram
if t_zram < 0: if t_zram < 0:
t_zram = 0 t_zram = 0
t_mem_zram = t_mem + t_zram
t_mem_swap = t_mem + t_swap t_mem_swap = t_mem + t_swap
t_mem_zram = t_mem + t_zram
if CHECK_ZRAM:
if t_mem_swap <= t_mem_zram: if t_mem_swap <= t_mem_zram:
t = t_mem_swap t = t_mem_swap
else: else:
t = t_mem_zram t = t_mem_zram
else:
t = t_mem_swap
if t > max_sleep: if t > max_sleep:
t = max_sleep t = max_sleep
@ -1841,6 +1746,83 @@ def calculate_percent(arg_key):
########################################################################## ##########################################################################
start_time = time()
help_mess = """usage: nohang [-h] [-v] [-t] [-p] [-c CONFIG]
optional arguments:
-h, --help show this help message and exit
-v, --version print version
-t, --test print some tests
-p, --print-proc-table
print table of processes with their badness values
-c CONFIG, --config CONFIG
path to the config file, default values:
./nohang.conf, /etc/nohang/nohang.conf"""
# System configuration values queried once at start-up.
SC_CLK_TCK = os.sysconf(os.sysconf_names['SC_CLK_TCK'])
SC_PAGESIZE = os.sysconf(os.sysconf_names['SC_PAGESIZE'])

# Message used when the config file is rejected.
conf_err_mess = 'Invalid config. Exit.'

# Signals listed for handling, and printable names for the signals in use.
sig_list = [SIGTERM, SIGINT, SIGQUIT, SIGHUP]
sig_dict = {
    SIGKILL: 'SIGKILL',
    SIGINT: 'SIGINT',
    SIGQUIT: 'SIGQUIT',
    SIGHUP: 'SIGHUP',
    SIGTERM: 'SIGTERM',
}

# Identity of the current process; root is True when the effective UID is 0.
self_pid = str(os.getpid())
self_uid = os.geteuid()
root = self_uid == 0

# Prefer a notify helper in the working directory over the installed one.
notify_helper_path = (
    './nohang_notify_helper'
    if os.path.exists('./nohang_notify_helper')
    else '/usr/sbin/nohang_notify_helper'
)

# Bookkeeping for victims and for the last handled action
# (stored as [timestamp, victim_id]).
victim_dict = {}
victim_id = None
actions_time_dict = {'action_handled': [time(), victim_id]}

# will store corrective actions stat
stat_dict = {}

separate_log = False  # will be overwritten after parse config
cgroup_v1_index, cgroup_v2_index = find_cgroup_indexes()
self_oom_score_adj_min = '-600'
self_oom_score_adj_max = '-6'
write_self_oom_score_adj(self_oom_score_adj_min)
pid_list = get_pid_list()
print_proc_table_flag = False print_proc_table_flag = False
if len(argv) == 1: if len(argv) == 1:
@ -1879,9 +1861,6 @@ else:
exit(1) exit(1)
##########################################################################
# find mem_total # find mem_total
# find positions of SwapFree and SwapTotal in /proc/meminfo # find positions of SwapFree and SwapTotal in /proc/meminfo
@ -1928,8 +1907,6 @@ except ValueError:
detailed_rss = False detailed_rss = False
# print('It is not Linux 4.5+') # print('It is not Linux 4.5+')
##########################################################################
log('Config: ' + config) log('Config: ' + config)
@ -2167,6 +2144,8 @@ gui_low_memory_warnings = conf_parse_bool('gui_low_memory_warnings')
gui_notifications = conf_parse_bool('gui_notifications') gui_notifications = conf_parse_bool('gui_notifications')
decrease_oom_score_adj = conf_parse_bool('decrease_oom_score_adj') decrease_oom_score_adj = conf_parse_bool('decrease_oom_score_adj')
ignore_psi = conf_parse_bool('ignore_psi') ignore_psi = conf_parse_bool('ignore_psi')
ignore_zram = conf_parse_bool('ignore_zram')
(mem_min_sigterm_kb, mem_min_sigterm_mb, mem_min_sigterm_percent (mem_min_sigterm_kb, mem_min_sigterm_mb, mem_min_sigterm_percent
) = calculate_percent('mem_min_sigterm') ) = calculate_percent('mem_min_sigterm')
@ -2559,43 +2538,6 @@ psi_support = os.path.exists(psi_path)
# Get KiB levels if it's possible. # Get KiB levels if it's possible.
def get_swap_threshold_tuple(string):
    """Parse a swap threshold string taken from the config.

    Returns a tuple (value, is_percent):
    - string ending with '%': (percent as float, True); the number must
      be in the range [0; 100];
    - string ending with 'M': (number multiplied by 1024, i.e. MiB
      converted to KiB, False); the number must not be negative.

    If the unit is unknown, if string_to_float_convert_test() returns
    None for the numeric part, or if the number is out of range, an error
    is reported via errprint() and exit(1) is called.
    """
    is_percent = string.endswith('%')

    # Reject everything that is neither a percentage nor a MiB value.
    if not is_percent and not string.endswith('M'):
        errprint(
            'Invalid config file. There are invalid units somewhere\nExit')
        exit(1)

    # Everything before the one-character unit suffix.
    number = string[:-1]

    if is_percent:
        if string_to_float_convert_test(number) is None:
            errprint('somewhere swap unit is not float_%')
            exit(1)
        percent_value = float(number.strip())
        if percent_value > 100 or percent_value < 0:
            errprint('invalid value, must be from the range[0; 100] %')
            exit(1)
        return percent_value, True

    if string_to_float_convert_test(number) is None:
        errprint('somewhere swap unit is not float_M')
        exit(1)
    kib_value = 1024 * float(number.strip())
    if kib_value < 0:
        errprint('invalid unit in config (negative value)')
        exit(1)
    return kib_value, False
swap_min_sigterm_tuple = get_swap_threshold_tuple(swap_min_sigterm) swap_min_sigterm_tuple = get_swap_threshold_tuple(swap_min_sigterm)
swap_min_sigkill_tuple = get_swap_threshold_tuple(swap_min_sigkill) swap_min_sigkill_tuple = get_swap_threshold_tuple(swap_min_sigkill)
swap_min_warnings_tuple = get_swap_threshold_tuple(swap_min_warnings) swap_min_warnings_tuple = get_swap_threshold_tuple(swap_min_warnings)
@ -2732,14 +2674,9 @@ mlockall()
# print_self_rss() # print_self_rss()
log('Monitoring has started!')
stdout.flush()
##########################################################################
psi_avg_string = '' # will be overwritten if PSI monitoring enabled psi_avg_string = '' # will be overwritten if PSI monitoring enabled
mem_used_zram = 0
if psi_support and not ignore_psi: if psi_support and not ignore_psi:
psi_t0 = time() psi_t0 = time()
@ -2760,58 +2697,26 @@ for i in sig_list:
signal(i, signal_handler) signal(i, signal_handler)
CHECK_PSI = False
if psi_support and not ignore_psi:
CHECK_PSI = True
CHECK_ZRAM = not ignore_zram
log('Monitoring has started!')
stdout.flush()
##########################################################################
while True: while True:
if psi_support and not ignore_psi: # Q = time()
psi_avg_value = find_psi_metrics_value(psi_path, psi_metrics) # FIND VALUES: mem, swap, zram, psi
if print_mem_check_results:
psi_avg_string = 'PSI avg value: {} | '.format(
str(psi_avg_value).rjust(6))
if psi_avg_value >= sigkill_psi_threshold:
sigkill_psi_exceeded = True
else:
sigkill_psi_exceeded = False
if psi_avg_value >= sigterm_psi_threshold:
sigterm_psi_exceeded = True
else:
sigterm_psi_exceeded = False
if time() - psi_t0 >= psi_post_action_delay:
psi_post_action_delay_exceeded = True
else:
psi_post_action_delay_exceeded = False
if psi_debug:
log('sigterm_psi_exceeded: {}, sigkill_psi_exceeded: {}, ps'
'i_post_action_delay_exceeded: {}'.format(
sigterm_psi_exceeded,
sigkill_psi_exceeded,
psi_post_action_delay_exceeded))
if sigkill_psi_exceeded and psi_post_action_delay_exceeded:
time0 = time()
mem_info = 'PSI avg value ({}) > sigkill_psi_thresh' \
'old ({})'.format(
psi_avg_value, sigkill_psi_threshold)
implement_corrective_action(SIGKILL)
psi_t0 = time()
continue
if sigterm_psi_exceeded and psi_post_action_delay_exceeded:
time0 = time()
mem_info = 'PSI avg value ({}) > sigterm_psi_thre' \
'shold ({})'.format(psi_avg_value, sigterm_psi_threshold)
implement_corrective_action(SIGTERM)
psi_t0 = time()
continue
mem_available, swap_total, swap_free = check_mem_and_swap() mem_available, swap_total, swap_free = check_mem_and_swap()
@ -2825,8 +2730,30 @@ while True:
if swap_warn_is_percent: if swap_warn_is_percent:
swap_min_warnings_kb = swap_total * swap_min_warnings_percent / 100.0 swap_min_warnings_kb = swap_total * swap_min_warnings_percent / 100.0
if swap_total > swap_min_sigkill_kb:
swap_sigkill_pc = percent(swap_min_sigkill_kb / (swap_total + 0.1))
else:
swap_sigkill_pc = '-'
if swap_total > swap_min_sigterm_kb:
swap_sigterm_pc = percent(swap_min_sigterm_kb / (swap_total + 0.1))
else:
swap_sigterm_pc = '-'
if CHECK_ZRAM:
mem_used_zram = check_zram() mem_used_zram = check_zram()
if CHECK_PSI:
psi_avg_value = find_psi_metrics_value(psi_path, psi_metrics)
if time() - psi_t0 >= psi_post_action_delay:
psi_post_action_delay_exceeded = True
else:
psi_post_action_delay_exceeded = False
if print_mem_check_results:
psi_avg_string = 'PSI avg value: {} | '.format(
str(psi_avg_value).rjust(6))
if print_mem_check_results: if print_mem_check_results:
wt1 = time() wt1 = time()
@ -2894,20 +2821,12 @@ while True:
) )
) )
if swap_total > swap_min_sigkill_kb: ###########################################################################
swap_sigkill_pc = percent(swap_min_sigkill_kb / (swap_total + 0.1))
else:
swap_sigkill_pc = '-'
if swap_total > swap_min_sigterm_kb: # CHECK HARD THRESHOLDS (SIGKILL LEVEL)
swap_sigterm_pc = percent(swap_min_sigterm_kb / (swap_total + 0.1))
else:
swap_sigterm_pc = '-'
# MEM SWAP KILL
if (mem_available <= mem_min_sigkill_kb and if (mem_available <= mem_min_sigkill_kb and
swap_free <= swap_min_sigkill_kb): swap_free <= swap_min_sigkill_kb):
time0 = time()
mem_info = 'Hard threshold exceeded\nMemory status that requ' \ mem_info = 'Hard threshold exceeded\nMemory status that requ' \
'ires corrective actions:' \ 'ires corrective actions:' \
@ -2924,13 +2843,11 @@ while True:
swap_sigkill_pc) swap_sigkill_pc)
implement_corrective_action(SIGKILL) implement_corrective_action(SIGKILL)
psi_t0 = time() psi_t0 = time()
continue continue
# ZRAM KILL if CHECK_ZRAM:
if mem_used_zram >= zram_max_sigkill_kb: if mem_used_zram >= zram_max_sigkill_kb:
time0 = time()
mem_info = 'Hard threshold exceeded\nMemory status that requir' \ mem_info = 'Hard threshold exceeded\nMemory status that requir' \
'es corrective actions:' \ 'es corrective actions:' \
@ -2942,15 +2859,31 @@ while True:
percent(zram_max_sigkill_kb / mem_total)) percent(zram_max_sigkill_kb / mem_total))
implement_corrective_action(SIGKILL) implement_corrective_action(SIGKILL)
psi_t0 = time() psi_t0 = time()
continue continue
# MEM SWAP TERM if CHECK_PSI:
if mem_available <= mem_min_sigterm_kb and \ if psi_avg_value >= sigkill_psi_threshold:
swap_free <= swap_min_sigterm_kb: sigkill_psi_exceeded = True
else:
sigkill_psi_exceeded = False
time0 = time() if sigkill_psi_exceeded and psi_post_action_delay_exceeded:
mem_info = 'PSI avg value ({}) > sigkill_psi_thresh' \
'old ({})'.format(
psi_avg_value, sigkill_psi_threshold)
implement_corrective_action(SIGKILL)
psi_t0 = time()
continue
###########################################################################
# CHECK SOFT THRESHOLDS (SIGTERM LEVEL)
if (mem_available <= mem_min_sigterm_kb and
swap_free <= swap_min_sigterm_kb):
mem_info = 'Soft threshold exceeded\nMemory status that requi' \ mem_info = 'Soft threshold exceeded\nMemory status that requi' \
'res corrective actions:' \ 'res corrective actions:' \
@ -2967,34 +2900,54 @@ while True:
swap_sigterm_pc) swap_sigterm_pc)
implement_corrective_action(SIGTERM) implement_corrective_action(SIGTERM)
psi_t0 = time() psi_t0 = time()
continue continue
# ZRAM TERM if CHECK_ZRAM:
if mem_used_zram >= zram_max_sigterm_kb: if mem_used_zram >= zram_max_sigterm_kb:
time0 = time()
mem_info = 'Soft threshold exceeded\nMemory status that requ' \ mem_info = 'Soft threshold exceeded\nMemory status that require' \
'ires corrective actions:' \ 's corrective actions:\n MemUsedZram [{} MiB, {} %] >= zra' \
'\n MemUsedZram [{} MiB, {} %] >= ' \ 'm_max_sigterm [{} M, {} %]'.format(
'zram_max_sigterm [{} M, {} %]'.format(
kib_to_mib(mem_used_zram), kib_to_mib(mem_used_zram),
percent(mem_used_zram / mem_total), percent(mem_used_zram / mem_total),
kib_to_mib(zram_max_sigterm_kb), kib_to_mib(zram_max_sigterm_kb),
percent(zram_max_sigterm_kb / mem_total)) percent(zram_max_sigterm_kb / mem_total))
implement_corrective_action(SIGTERM) implement_corrective_action(SIGTERM)
psi_t0 = time() psi_t0 = time()
continue continue
# LOW MEMORY WARNINGS if CHECK_PSI:
if psi_avg_value >= sigterm_psi_threshold:
sigterm_psi_exceeded = True
else:
sigterm_psi_exceeded = False
if psi_debug:
log('sigterm_psi_exceeded: {}, sigkill_psi_exceeded: {}, ps'
'i_post_action_delay_exceeded: {}'.format(
sigterm_psi_exceeded,
sigkill_psi_exceeded,
psi_post_action_delay_exceeded))
if sigterm_psi_exceeded and psi_post_action_delay_exceeded:
mem_info = 'PSI avg value ({}) > sigterm_psi_thre' \
'shold ({})'.format(psi_avg_value, sigterm_psi_threshold)
implement_corrective_action(SIGTERM)
psi_t0 = time()
continue
###########################################################################
if gui_low_memory_warnings: if gui_low_memory_warnings:
if mem_available <= mem_min_warnings_kb and \ if (mem_available <= mem_min_warnings_kb and
swap_free <= swap_min_warnings_kb + 0.1 or \ swap_free <= swap_min_warnings_kb + 0.1 or
mem_used_zram >= zram_max_warnings_kb: mem_used_zram >= zram_max_warnings_kb):
warn_time_delta = time() - warn_time_now warn_time_delta = time() - warn_time_now
warn_time_now = time() warn_time_now = time()
warn_timer += warn_time_delta warn_timer += warn_time_delta
@ -3003,17 +2956,7 @@ while True:
warn_timer = 0 warn_timer = 0
# x = time() - Q
# print(x * 1000)
# SLEEP BETWEEN MEM CHECKS
sleep_after_check_mem() sleep_after_check_mem()

View File

@ -1,34 +1,38 @@
This is nohang config file. This is nohang config file.
Lines starting with #, tabs and spaces are comments. Lines starting with #, tabs and spaces are comments.
Lines starting with @ contain optional parameters. Lines starting with @ contain optional parameters.
All values are case sensitive.
Be careful: nohang doesn't forbid you to shoot yourself in the foot.
The configuration includes the following sections: The configuration includes the following sections:
0. Common zram settings
1. Memory levels to respond to as an OOM threat 1. Memory levels to respond to as an OOM threat
2. Response on PSI memory metrics 2. Response on PSI memory metrics
3. The frequency of checking the level of available memory 3. The frequency of checking the level of available memory
(and CPU usage) (and CPU usage)
4. The prevention of killing innocent victims 4. The prevention of killing innocent victims
5. Impact on the badness of processes via matching their 5. Impact on the badness of processes via matching their names, cgroups and
- names, cmdlines with specified regular expressions
- cgroups,
- cmdlines and
- UIDs
with regular expressions
6. Customize corrective actions: the execution of a specific command 6. Customize corrective actions: the execution of a specific command
instead of sending the SIGTERM signal instead of sending the SIGTERM signal
7. GUI notifications: 7. GUI notifications:
- OOM prevention results and
- low memory warnings - low memory warnings
- OOM prevention results
8. Output verbosity 8. Output verbosity
9. Misc 9. Misc
Just read the description of the parameters and edit the values. Just read the description of the parameters and edit the values.
Please restart the program after editing the config. Please restart the program after editing the config.
Bool values are case sensitive. ###############################################################################
##################################################################### 0. Common zram settings
See https://www.kernel.org/doc/Documentation/blockdev/zram.txt
You may need to set `ignore_zram = False` if you have a large zram disksize.
ignore_zram = False
1. Thresholds below which a signal should be sent to the victim 1. Thresholds below which a signal should be sent to the victim
@ -57,9 +61,9 @@ swap_min_sigkill = 5 %
numbers from the range [0; 90] %. numbers from the range [0; 90] %.
zram_max_sigterm = 50 % zram_max_sigterm = 50 %
zram_max_sigkill = 55 % zram_max_sigkill = 60 %
##################################################################### ###############################################################################
2. Response on PSI memory metrics (it needs Linux 4.20 and up) 2. Response on PSI memory metrics (it needs Linux 4.20 and up)
@ -102,7 +106,7 @@ sigkill_psi_threshold = 90
psi_post_action_delay = 60 psi_post_action_delay = 60
##################################################################### ###############################################################################
3. The frequency of checking the amount of available memory 3. The frequency of checking the amount of available memory
(and CPU usage) (and CPU usage)
@ -124,7 +128,7 @@ psi_post_action_delay = 60
rate_mem = 4000 rate_mem = 4000
rate_swap = 1500 rate_swap = 1500
rate_zram = 500 rate_zram = 6000
See also https://github.com/rfjakob/earlyoom/issues/61 See also https://github.com/rfjakob/earlyoom/issues/61
@ -135,7 +139,7 @@ min_sleep = 0.1
over_sleep = 0.05 over_sleep = 0.05
##################################################################### ###############################################################################
4. The prevention of killing innocent victims 4. The prevention of killing innocent victims
@ -144,7 +148,7 @@ over_sleep = 0.05
min_badness = 20 min_badness = 20
Valid values are non-negative floating-point numbers. Valid values are non-negative floating-point numbers.
Min delay if a victim does not respond to SIGTERM in 10 ms. Min delay if a victim doesn't respond to SIGTERM in 10 ms.
min_delay_after_sigterm = 3 min_delay_after_sigterm = 3
@ -157,7 +161,7 @@ decrease_oom_score_adj = False
oom_score_adj_max = 0 oom_score_adj_max = 0
##################################################################### ###############################################################################
5. Impact on the badness of processes via matching their names, 5. Impact on the badness of processes via matching their names,
cmdlines or UIDs with regular expressions using re.search(). cmdlines or UIDs with regular expressions using re.search().
@ -194,21 +198,15 @@ oom_score_adj_max = 0
A good option that allows fine adjustment. A good option that allows fine adjustment.
Prefer electron-based apps and chromium tabs Prefer chromium tabs and electron-based apps
@CMDLINE_RE 200 /// --type=renderer @CMDLINE_RE 200 /// --type=renderer
Prefer firefox tabs Prefer firefox tabs (Web Content and WebExtensions)
@CMDLINE_RE 100 /// -greomni|-childID @CMDLINE_RE 100 /// -appomni
@CMDLINE_RE -500 /// python
@CMDLINE_RE -200 /// ^/usr/lib/virtualbox @CMDLINE_RE -200 /// ^/usr/lib/virtualbox
5.3 Matching UIDs with RE patterns 5.3 Matching eUIDs with RE patterns
The slowest option The slowest option
@ -232,10 +230,11 @@ oom_score_adj_max = 0
@ENVIRON_RE 100 /// USER=user @ENVIRON_RE 100 /// USER=user
Note that you can control badness also via systemd units via OOMScoreAdjust, see Note that you can control badness also via systemd units via
https://www.freedesktop.org/software/systemd/man/systemd.exec.html#OOMScoreAdjust= OOMScoreAdjust, see
www.freedesktop.org/software/systemd/man/systemd.exec.html#OOMScoreAdjust=
##################################################################### ###############################################################################
6. Customize corrective actions. 6. Customize corrective actions.
@ -252,9 +251,10 @@ oom_score_adj_max = 0
$PID will be replaced by process PID. $PID will be replaced by process PID.
$NAME will be replaced by process name. $NAME will be replaced by process name.
$SERVICE will be replaced by .service if it exists (otherwise it will be replaced by empty line). $SERVICE will be replaced by .service if it exists (otherwise it will be
replaced by empty line)
##################################################################### ###############################################################################
7. GUI notifications: 7. GUI notifications:
- OOM prevention results and - OOM prevention results and
@ -289,7 +289,7 @@ min_time_between_warnings = 15
Ampersands (&) will be replaced with asterisks (*) in process Ampersands (&) will be replaced with asterisks (*) in process
names and in commands. names and in commands.
##################################################################### ###############################################################################
8. Verbosity 8. Verbosity
@ -303,7 +303,7 @@ print_config = False
print_mem_check_results = False print_mem_check_results = False
min_mem_report_interval = 60 min_mem_report_interval = 300
Print sleep periods between memory checks. Print sleep periods between memory checks.
Valid values are True and False. Valid values are True and False.
@ -327,15 +327,13 @@ extra_table_info = cgroup_v1
print_victim_info = False print_victim_info = False
# print_victim_cmdline max_ancestry_depth = 10
max_ancestry_depth = 1
separate_log = False separate_log = False
psi_debug = False psi_debug = False
##################################################################### ###############################################################################
9. Misc 9. Misc

View File

@ -122,7 +122,7 @@ send_signal = SIGTERM
# os.kill(int(pid), SIGCONT) # os.kill(int(pid), SIGCONT)
os.kill(int(pid), send_signal) # os.kill(int(pid), send_signal)
t0 = time() t0 = time()