do not check zram by default

This commit is contained in:
Alexey Avramov 2019-05-21 12:25:31 +09:00
parent 77da0efb9a
commit 944c13be7e
3 changed files with 332 additions and 391 deletions

645
nohang
View File

@ -11,92 +11,6 @@ from sre_constants import error as invalid_re
from signal import signal, SIGKILL, SIGTERM, SIGINT, SIGQUIT, SIGHUP
# Timestamp taken when the module starts executing.
start_time = time()

# Usage text describing the command-line options.
help_mess = """usage: nohang [-h] [-v] [-t] [-p] [-c CONFIG]
optional arguments:
-h, --help show this help message and exit
-v, --version print version
-t, --test print some tests
-p, --print-proc-table
print table of processes with their badness values
-c CONFIG, --config CONFIG
path to the config file, default values:
./nohang.conf, /etc/nohang/nohang.conf"""

# System constants reported by sysconf(): clock ticks per second and the
# memory page size in bytes.
SC_CLK_TCK = os.sysconf(os.sysconf_names['SC_CLK_TCK'])
SC_PAGESIZE = os.sysconf(os.sysconf_names['SC_PAGESIZE'])

conf_err_mess = 'Invalid config. Exit.'

# Signals for which signal_handler is installed before the main loop.
sig_list = [SIGTERM, SIGINT, SIGQUIT, SIGHUP]

# Human-readable names of the signals used by the program.
sig_dict = {
    SIGKILL: 'SIGKILL',
    SIGINT: 'SIGINT',
    SIGQUIT: 'SIGQUIT',
    SIGHUP: 'SIGHUP',
    SIGTERM: 'SIGTERM'
}

self_pid = str(os.getpid())
self_uid = os.geteuid()

# True when the effective UID is 0.
root = self_uid == 0

# Prefer a notify helper located in the current directory; otherwise fall
# back to the system-wide path.
if os.path.exists('./nohang_notify_helper'):
    notify_helper_path = './nohang_notify_helper'
else:
    notify_helper_path = '/usr/sbin/nohang_notify_helper'

# Maps a victim id to the time the victim was first recorded.
victim_dict = {}
victim_id = None

# 'action_handled' holds [timestamp, victim id] of the last handled action.
actions_time_dict = {'action_handled': [time(), victim_id]}

# will store corrective actions stat
stat_dict = {}

separate_log = False  # will be overwritten after parse config
def find_cgroup_indexes():
    """Locate the cgroup v1 and v2 lines in /proc/self/cgroup.

    Returns a (v1_index, v2_index) tuple. v1_index is the zero-based number
    of the last line containing ':name=', v2_index is that of the last line
    starting with '0::'. An element is None when no such line exists.
    """
    with open('/proc/self/cgroup') as cgroup_file:
        entries = cgroup_file.readlines()

    v1_hits = [pos for pos, entry in enumerate(entries) if ':name=' in entry]
    v2_hits = [pos for pos, entry in enumerate(entries)
               if entry.startswith('0::')]

    # Later matches win, exactly as repeated assignment in a loop would do.
    v1_pos = v1_hits[-1] if v1_hits else None
    v2_pos = v2_hits[-1] if v2_hits else None
    return v1_pos, v2_pos
cgroup_v1_index, cgroup_v2_index = find_cgroup_indexes()
##########################################################################
# define functions
@ -115,7 +29,62 @@ def print_self_rss():
'''
def get_swap_threshold_tuple(string):
    """Turn a swap threshold string from the config into a pair.

    Returns (percent, True) when the string ends with '%' and
    (KiB, False) when it ends with 'M' (the number is multiplied by 1024).
    A non-numeric value, a percentage outside [0; 100], a negative absolute
    value or an unknown suffix is reported through errprint() and the
    program exits with status 1.
    """
    if string.endswith('%'):
        number = string[:-1]
        if string_to_float_convert_test(number) is None:
            errprint('somewhere swap unit is not float_%')
            exit(1)
        percent_value = float(number.strip())
        # Written as two comparisons on purpose: NaN passes both of them.
        out_of_range = percent_value < 0 or percent_value > 100
        if out_of_range:
            errprint('invalid value, must be from the range[0; 100] %')
            exit(1)
        return percent_value, True

    if string.endswith('M'):
        number = string[:-1]
        if string_to_float_convert_test(number) is None:
            errprint('somewhere swap unit is not float_M')
            exit(1)
        kib_value = float(number.strip()) * 1024
        if kib_value < 0:
            errprint('invalid unit in config (negative value)')
            exit(1)
        return kib_value, False

    # Neither '%' nor 'M' suffix: the unit is unknown.
    errprint(
        'Invalid config file. There are invalid units somewhere\nExit')
    exit(1)
def find_cgroup_indexes():
    """Locate the cgroup v1 and v2 lines in /proc/self/cgroup.

    Returns a (v1_index, v2_index) tuple. v1_index is the zero-based number
    of the last line containing ':name=', v2_index is that of the last line
    starting with '0::'. An element is None when no such line exists.
    """
    with open('/proc/self/cgroup') as cgroup_file:
        entries = cgroup_file.readlines()

    v1_hits = [pos for pos, entry in enumerate(entries) if ':name=' in entry]
    v2_hits = [pos for pos, entry in enumerate(entries)
               if entry.startswith('0::')]

    # Later matches win, exactly as repeated assignment in a loop would do.
    v1_pos = v1_hits[-1] if v1_hits else None
    v2_pos = v2_hits[-1] if v2_hits else None
    return v1_pos, v2_pos
def pid_to_rss(pid):
"""
"""
try:
rss = int(rline1(
'/proc/{}/statm'.format(pid)).split(' ')[1]) * SC_PAGESIZE
@ -129,6 +98,8 @@ def pid_to_rss(pid):
def pid_to_vm_size(pid):
"""
"""
try:
vm_size = int(rline1(
'/proc/{}/statm'.format(pid)).partition(' ')[0]) * SC_PAGESIZE
@ -141,12 +112,6 @@ def pid_to_vm_size(pid):
return vm_size
def signal_handler(signum, frame):
"""
"""
@ -193,13 +158,6 @@ def write_self_oom_score_adj(new_value):
write('/proc/self/oom_score_adj', new_value)
self_oom_score_adj_min = '-600'
self_oom_score_adj_max = '-6'
write_self_oom_score_adj(self_oom_score_adj_min)
def valid_re(reg_exp):
"""Validate regular expression.
"""
@ -431,6 +389,8 @@ def pid_to_environ(pid):
def pid_to_realpath(pid):
"""
"""
try:
return os.path.realpath('/proc/' + pid + '/exe')
except FileNotFoundError:
@ -615,9 +575,6 @@ def pid_to_status_unicode(pid):
return None
##########################################################################
def uptime():
"""
"""
@ -993,9 +950,6 @@ def get_pid_list():
return pid_list
pid_list = get_pid_list()
def get_non_decimal_pids():
"""
"""
@ -1362,23 +1316,14 @@ def find_victim_info(pid, victim_badness, name):
return victim_info
def implement_corrective_action(signal):
"""
Find victim with highest badness and send SIGTERM/SIGKILL
"""
time0 = time()
# выходим из фции, если для SIGTERM порога не превышено время min_delay_after_sigterm и спим в течение over_sleep
# выходим из фции, если для SIGTERM порога не превышено время
# min_delay_after_sigterm и спим в течение over_sleep
if signal is SIGTERM:
dt = time() - actions_time_dict['action_handled'][0]
@ -1397,13 +1342,12 @@ def implement_corrective_action(signal):
else:
print('min_delay_after_sigterm IS EXCEEDED, it is time to action')
"""
При заходе в фцию проверяем права на сигтерм. Права на сигкилл у нас всегда есть
(потому что идем дальше только после полн освободж памяти после смерти жертвы)
При заходе в фцию проверяем права на сигтерм. Права на сигкилл у нас
всегда есть
(потому что идем дальше только после полн освободж памяти после
смерти жертвы)
actions_time_dict[action_handled] = time()
actions_time_dict[veto] = True
@ -1414,7 +1358,6 @@ def implement_corrective_action(signal):
"""
log(mem_info)
pid, victim_badness, name = find_victim(print_proc_table)
@ -1425,10 +1368,8 @@ def implement_corrective_action(signal):
victim_info = find_victim_info(pid, victim_badness, name)
log(victim_info)
# пороги могли превысиься за время поиска жертвы (поиск может занимать сотни миллисекунд)
# пороги могли превысиься за время поиска жертвы (поиск может занимать
# сотни миллисекунд)
mem_available, swap_total, swap_free = check_mem_and_swap()
ma_mib = int(mem_available) / 1024.0
@ -1445,15 +1386,8 @@ def implement_corrective_action(signal):
log('Hard threshold exceeded')
signal = SIGKILL
victim_id = get_victim_id(pid)
# kill the victim if it doesn't respond to SIGTERM В ТЕЧЕНИЕ
# ЗАДАННОГО ВРЕМЕНИ
@ -1465,19 +1399,10 @@ def implement_corrective_action(signal):
dt = time() - victim_dict[victim_id]
if dt > max_post_sigterm_victim_lifetime:
print('max_post_sigterm_victim_lifetime exceeded: the victim will get SIGKILL')
print('max_post_sigterm_victim_lifetime exceeded: the '
'victim will get SIGKILL')
signal = SIGKILL
# matching with re to customize corrective actions
soft_match = False
@ -1529,8 +1454,10 @@ def implement_corrective_action(signal):
response_time = time() - time0
# тут надо, как и при дефолтном действии, проверять существование жертвы, ее реакцию на действие,
# и время ее смерти в случае успеха, о обновление таймстемпов действия
# тут надо, как и при дефолтном действии, проверять существование
# жертвы, ее реакцию на действие,
# и время ее смерти в случае успеха, о обновление таймстемпов
# действия
etc_info = 'Implement a corrective act' \
'ion:\n Run the command: {}' \
@ -1552,71 +1479,49 @@ def implement_corrective_action(signal):
command.replace('$PID', pid).replace(
'$NAME', pid_to_name(pid)))
else:
# обычное действие через сигнал
try:
os.kill(int(pid), signal)
kill_timestamp = time()
response_time = kill_timestamp - time0
while True:
exe_exists = os.path.exists('/proc/{}/exe'.format(pid))
rss = pid_to_rss(pid)
dt = time() - kill_timestamp
log('Victim VmRSS: {} KiB'.format(rss))
if not exe_exists or rss == 0 or dt > 0.01:
#print(dt)
# print(dt)
break
sleep(0.001)
if dt > 0.01:
log('Timer (value = 0.01 sec) expired; seems' \
log('Timer (value = 0.01 sec) expired; seems'
' like the victim handles signal')
actions_time_dict['action_handled'] = [time(), get_victim_id(pid)]
actions_time_dict['action_handled'] = [
time(), get_victim_id(pid)]
if victim_id not in victim_dict: # хз как надо.
victim_dict.update({victim_id: time()})
# log('actions_time_dict', actions_time_dict)
# log('victim_dict', victim_dict)
else:
log('Process exited (VmRSS = 0) in {} sec'.format(
round(dt, 5)))
if signal is SIGKILL or not exe_exists or rss == 0:
while True:
sleep(0.001)
rss = pid_to_rss(pid) # рсс не важен когда путь не существует. Проверяй просто существование пид.
# рсс не важен когда путь не существует. Проверяй
# просто существование пид.
rss = pid_to_rss(pid)
if rss is None:
break
t1 = time()
@ -1624,7 +1529,6 @@ def implement_corrective_action(signal):
log('The victim died in {} sec'.format(
round(kill_duration, 3)))
mem_available, swap_total, swap_free = check_mem_and_swap()
ma_mib = int(mem_available) / 1024.0
@ -1636,12 +1540,6 @@ def implement_corrective_action(signal):
)
)
send_result = 'total response time: {} ms'.format(
round(response_time * 1000))
@ -1700,11 +1598,12 @@ def implement_corrective_action(signal):
update_stat_dict_and_print(key)
# тут надо поспать хорошенько. а может и счетчики поправить.
# херню несу. во-первых, внезапно может кто-то появиться c блльшим бэднес.. Далее надо минимизировать аутпут спам.
# херню несу. во-первых, внезапно может кто-то появиться c блльшим
# бэднес.. Далее надо минимизировать аутпут спам.
sleep(over_sleep)
# обновлять время не на каждый кил, а только на килл той жертвы, которая не отвечала на софт экшн.
# обновлять время не на каждый кил, а только на килл той жертвы,
# которая не отвечала на софт экшн.
# Вывод: ко времени действия прилагать также виктим айди.
print('##################################################################')
@ -1739,17 +1638,23 @@ def sleep_after_check_mem():
t_mem = mem_point / rate_mem
t_swap = swap_point / rate_swap
t_zram = (mem_total * 0.8 - mem_used_zram) / rate_zram
if t_zram < 0:
t_zram = 0
if CHECK_ZRAM:
t_zram = (mem_total * 0.8 - mem_used_zram) / rate_zram
if t_zram < 0:
t_zram = 0
t_mem_zram = t_mem + t_zram
t_mem_swap = t_mem + t_swap
t_mem_zram = t_mem + t_zram
if t_mem_swap <= t_mem_zram:
t = t_mem_swap
if CHECK_ZRAM:
if t_mem_swap <= t_mem_zram:
t = t_mem_swap
else:
t = t_mem_zram
else:
t = t_mem_zram
t = t_mem_swap
if t > max_sleep:
t = max_sleep
@ -1841,6 +1746,83 @@ def calculate_percent(arg_key):
##########################################################################
# Timestamp taken when this part of the module starts executing.
start_time = time()

# Usage text describing the command-line options.
help_mess = """usage: nohang [-h] [-v] [-t] [-p] [-c CONFIG]
optional arguments:
-h, --help show this help message and exit
-v, --version print version
-t, --test print some tests
-p, --print-proc-table
print table of processes with their badness values
-c CONFIG, --config CONFIG
path to the config file, default values:
./nohang.conf, /etc/nohang/nohang.conf"""

# System constants reported by sysconf(): clock ticks per second and the
# memory page size in bytes.
SC_CLK_TCK = os.sysconf(os.sysconf_names['SC_CLK_TCK'])
SC_PAGESIZE = os.sysconf(os.sysconf_names['SC_PAGESIZE'])

conf_err_mess = 'Invalid config. Exit.'

# Signals for which signal_handler is installed before the main loop.
sig_list = [SIGTERM, SIGINT, SIGQUIT, SIGHUP]

# Human-readable names of the signals used by the program.
sig_dict = {
    SIGKILL: 'SIGKILL',
    SIGINT: 'SIGINT',
    SIGQUIT: 'SIGQUIT',
    SIGHUP: 'SIGHUP',
    SIGTERM: 'SIGTERM'
}

self_pid = str(os.getpid())
self_uid = os.geteuid()

# True when the effective UID is 0.
root = self_uid == 0

# Prefer a notify helper located in the current directory; otherwise fall
# back to the system-wide path.
if os.path.exists('./nohang_notify_helper'):
    notify_helper_path = './nohang_notify_helper'
else:
    notify_helper_path = '/usr/sbin/nohang_notify_helper'

# Maps a victim id to the time the victim was first recorded.
victim_dict = {}
victim_id = None

# 'action_handled' holds [timestamp, victim id] of the last handled action.
actions_time_dict = {'action_handled': [time(), victim_id]}

# will store corrective actions stat
stat_dict = {}

separate_log = False  # will be overwritten after parse config

# Line positions of the cgroup v1 / v2 entries in /proc/*/cgroup.
cgroup_v1_index, cgroup_v2_index = find_cgroup_indexes()

# Values written to /proc/self/oom_score_adj; the minimum is applied at
# start-up.
self_oom_score_adj_min = '-600'
self_oom_score_adj_max = '-6'
write_self_oom_score_adj(self_oom_score_adj_min)

pid_list = get_pid_list()
print_proc_table_flag = False
if len(argv) == 1:
@ -1879,9 +1861,6 @@ else:
exit(1)
##########################################################################
# find mem_total
# find positions of SwapFree and SwapTotal in /proc/meminfo
@ -1928,8 +1907,6 @@ except ValueError:
detailed_rss = False
# print('It is not Linux 4.5+')
##########################################################################
log('Config: ' + config)
@ -2167,6 +2144,8 @@ gui_low_memory_warnings = conf_parse_bool('gui_low_memory_warnings')
gui_notifications = conf_parse_bool('gui_notifications')
decrease_oom_score_adj = conf_parse_bool('decrease_oom_score_adj')
ignore_psi = conf_parse_bool('ignore_psi')
ignore_zram = conf_parse_bool('ignore_zram')
(mem_min_sigterm_kb, mem_min_sigterm_mb, mem_min_sigterm_percent
) = calculate_percent('mem_min_sigterm')
@ -2559,43 +2538,6 @@ psi_support = os.path.exists(psi_path)
# Get KiB levels if it's possible.
def get_swap_threshold_tuple(string):
    """Turn a swap threshold string from the config into a pair.

    Returns (percent, True) when the string ends with '%' and
    (KiB, False) when it ends with 'M' (the number is multiplied by 1024).
    A non-numeric value, a percentage outside [0; 100], a negative absolute
    value or an unknown suffix is reported through errprint() and the
    program exits with status 1.
    """
    if string.endswith('%'):
        number = string[:-1]
        if string_to_float_convert_test(number) is None:
            errprint('somewhere swap unit is not float_%')
            exit(1)
        percent_value = float(number.strip())
        # Written as two comparisons on purpose: NaN passes both of them.
        out_of_range = percent_value < 0 or percent_value > 100
        if out_of_range:
            errprint('invalid value, must be from the range[0; 100] %')
            exit(1)
        return percent_value, True

    if string.endswith('M'):
        number = string[:-1]
        if string_to_float_convert_test(number) is None:
            errprint('somewhere swap unit is not float_M')
            exit(1)
        kib_value = float(number.strip()) * 1024
        if kib_value < 0:
            errprint('invalid unit in config (negative value)')
            exit(1)
        return kib_value, False

    # Neither '%' nor 'M' suffix: the unit is unknown.
    errprint(
        'Invalid config file. There are invalid units somewhere\nExit')
    exit(1)
swap_min_sigterm_tuple = get_swap_threshold_tuple(swap_min_sigterm)
swap_min_sigkill_tuple = get_swap_threshold_tuple(swap_min_sigkill)
swap_min_warnings_tuple = get_swap_threshold_tuple(swap_min_warnings)
@ -2732,14 +2674,9 @@ mlockall()
# print_self_rss()
log('Monitoring has started!')
stdout.flush()
##########################################################################
psi_avg_string = '' # will be overwritten if PSI monitoring enabled
mem_used_zram = 0
if psi_support and not ignore_psi:
psi_t0 = time()
@ -2760,58 +2697,26 @@ for i in sig_list:
signal(i, signal_handler)
CHECK_PSI = False
if psi_support and not ignore_psi:
CHECK_PSI = True
CHECK_ZRAM = not ignore_zram
log('Monitoring has started!')
stdout.flush()
##########################################################################
while True:
if psi_support and not ignore_psi:
# Q = time()
psi_avg_value = find_psi_metrics_value(psi_path, psi_metrics)
if print_mem_check_results:
psi_avg_string = 'PSI avg value: {} | '.format(
str(psi_avg_value).rjust(6))
if psi_avg_value >= sigkill_psi_threshold:
sigkill_psi_exceeded = True
else:
sigkill_psi_exceeded = False
if psi_avg_value >= sigterm_psi_threshold:
sigterm_psi_exceeded = True
else:
sigterm_psi_exceeded = False
if time() - psi_t0 >= psi_post_action_delay:
psi_post_action_delay_exceeded = True
else:
psi_post_action_delay_exceeded = False
if psi_debug:
log('sigterm_psi_exceeded: {}, sigkill_psi_exceeded: {}, ps'
'i_post_action_delay_exceeded: {}'.format(
sigterm_psi_exceeded,
sigkill_psi_exceeded,
psi_post_action_delay_exceeded))
if sigkill_psi_exceeded and psi_post_action_delay_exceeded:
time0 = time()
mem_info = 'PSI avg value ({}) > sigkill_psi_thresh' \
'old ({})'.format(
psi_avg_value, sigkill_psi_threshold)
implement_corrective_action(SIGKILL)
psi_t0 = time()
continue
if sigterm_psi_exceeded and psi_post_action_delay_exceeded:
time0 = time()
mem_info = 'PSI avg value ({}) > sigterm_psi_thre' \
'shold ({})'.format(psi_avg_value, sigterm_psi_threshold)
implement_corrective_action(SIGTERM)
psi_t0 = time()
continue
# FIND VALUES: mem, swap, zram, psi
mem_available, swap_total, swap_free = check_mem_and_swap()
@ -2825,7 +2730,29 @@ while True:
if swap_warn_is_percent:
swap_min_warnings_kb = swap_total * swap_min_warnings_percent / 100.0
mem_used_zram = check_zram()
if swap_total > swap_min_sigkill_kb:
swap_sigkill_pc = percent(swap_min_sigkill_kb / (swap_total + 0.1))
else:
swap_sigkill_pc = '-'
if swap_total > swap_min_sigterm_kb:
swap_sigterm_pc = percent(swap_min_sigterm_kb / (swap_total + 0.1))
else:
swap_sigterm_pc = '-'
if CHECK_ZRAM:
mem_used_zram = check_zram()
if CHECK_PSI:
psi_avg_value = find_psi_metrics_value(psi_path, psi_metrics)
if time() - psi_t0 >= psi_post_action_delay:
psi_post_action_delay_exceeded = True
else:
psi_post_action_delay_exceeded = False
if print_mem_check_results:
psi_avg_string = 'PSI avg value: {} | '.format(
str(psi_avg_value).rjust(6))
if print_mem_check_results:
@ -2894,20 +2821,12 @@ while True:
)
)
if swap_total > swap_min_sigkill_kb:
swap_sigkill_pc = percent(swap_min_sigkill_kb / (swap_total + 0.1))
else:
swap_sigkill_pc = '-'
###########################################################################
if swap_total > swap_min_sigterm_kb:
swap_sigterm_pc = percent(swap_min_sigterm_kb / (swap_total + 0.1))
else:
swap_sigterm_pc = '-'
# CHECK HARD THRESHOLDS (SIGKILL LEVEL)
# MEM SWAP KILL
if (mem_available <= mem_min_sigkill_kb and
swap_free <= swap_min_sigkill_kb):
time0 = time()
mem_info = 'Hard threshold exceeded\nMemory status that requ' \
'ires corrective actions:' \
@ -2924,33 +2843,47 @@ while True:
swap_sigkill_pc)
implement_corrective_action(SIGKILL)
psi_t0 = time()
continue
# ZRAM KILL
if mem_used_zram >= zram_max_sigkill_kb:
time0 = time()
if CHECK_ZRAM:
if mem_used_zram >= zram_max_sigkill_kb:
mem_info = 'Hard threshold exceeded\nMemory status that requir' \
'es corrective actions:' \
'\n MemUsedZram [{} MiB, {} %] >= zram_max_sig' \
'kill [{} MiB, {} %]'.format(
kib_to_mib(mem_used_zram),
percent(mem_used_zram / mem_total),
kib_to_mib(zram_max_sigkill_kb),
percent(zram_max_sigkill_kb / mem_total))
mem_info = 'Hard threshold exceeded\nMemory status that requir' \
'es corrective actions:' \
'\n MemUsedZram [{} MiB, {} %] >= zram_max_sig' \
'kill [{} MiB, {} %]'.format(
kib_to_mib(mem_used_zram),
percent(mem_used_zram / mem_total),
kib_to_mib(zram_max_sigkill_kb),
percent(zram_max_sigkill_kb / mem_total))
implement_corrective_action(SIGKILL)
implement_corrective_action(SIGKILL)
psi_t0 = time()
continue
psi_t0 = time()
continue
if CHECK_PSI:
if psi_avg_value >= sigkill_psi_threshold:
sigkill_psi_exceeded = True
else:
sigkill_psi_exceeded = False
# MEM SWAP TERM
if mem_available <= mem_min_sigterm_kb and \
swap_free <= swap_min_sigterm_kb:
if sigkill_psi_exceeded and psi_post_action_delay_exceeded:
time0 = time()
mem_info = 'PSI avg value ({}) > sigkill_psi_thresh' \
'old ({})'.format(
psi_avg_value, sigkill_psi_threshold)
implement_corrective_action(SIGKILL)
psi_t0 = time()
continue
###########################################################################
# CHECK SOFT THRESHOLDS (SIGTERM LEVEL)
if (mem_available <= mem_min_sigterm_kb and
swap_free <= swap_min_sigterm_kb):
mem_info = 'Soft threshold exceeded\nMemory status that requi' \
'res corrective actions:' \
@ -2967,34 +2900,54 @@ while True:
swap_sigterm_pc)
implement_corrective_action(SIGTERM)
psi_t0 = time()
continue
# ZRAM TERM
if mem_used_zram >= zram_max_sigterm_kb:
time0 = time()
if CHECK_ZRAM:
if mem_used_zram >= zram_max_sigterm_kb:
mem_info = 'Soft threshold exceeded\nMemory status that requ' \
'ires corrective actions:' \
'\n MemUsedZram [{} MiB, {} %] >= ' \
'zram_max_sigterm [{} M, {} %]'.format(
kib_to_mib(mem_used_zram),
percent(mem_used_zram / mem_total),
kib_to_mib(zram_max_sigterm_kb),
percent(zram_max_sigterm_kb / mem_total))
mem_info = 'Soft threshold exceeded\nMemory status that require' \
's corrective actions:\n MemUsedZram [{} MiB, {} %] >= zra' \
'm_max_sigterm [{} M, {} %]'.format(
kib_to_mib(mem_used_zram),
percent(mem_used_zram / mem_total),
kib_to_mib(zram_max_sigterm_kb),
percent(zram_max_sigterm_kb / mem_total))
implement_corrective_action(SIGTERM)
implement_corrective_action(SIGTERM)
psi_t0 = time()
continue
psi_t0 = time()
continue
if CHECK_PSI:
if psi_avg_value >= sigterm_psi_threshold:
sigterm_psi_exceeded = True
else:
sigterm_psi_exceeded = False
if psi_debug:
log('sigterm_psi_exceeded: {}, sigkill_psi_exceeded: {}, ps'
'i_post_action_delay_exceeded: {}'.format(
sigterm_psi_exceeded,
sigkill_psi_exceeded,
psi_post_action_delay_exceeded))
if sigterm_psi_exceeded and psi_post_action_delay_exceeded:
mem_info = 'PSI avg value ({}) > sigterm_psi_thre' \
'shold ({})'.format(psi_avg_value, sigterm_psi_threshold)
implement_corrective_action(SIGTERM)
psi_t0 = time()
continue
###########################################################################
# LOW MEMORY WARNINGS
if gui_low_memory_warnings:
if mem_available <= mem_min_warnings_kb and \
swap_free <= swap_min_warnings_kb + 0.1 or \
mem_used_zram >= zram_max_warnings_kb:
if (mem_available <= mem_min_warnings_kb and
swap_free <= swap_min_warnings_kb + 0.1 or
mem_used_zram >= zram_max_warnings_kb):
warn_time_delta = time() - warn_time_now
warn_time_now = time()
warn_timer += warn_time_delta
@ -3003,17 +2956,7 @@ while True:
warn_timer = 0
# x = time() - Q
# print(x * 1000)
# SLEEP BETWEEN MEM CHECKS
sleep_after_check_mem()

View File

@ -1,34 +1,38 @@
This is nohang config file.
Lines starting with #, tabs and spaces are comments.
Lines starting with @ contain optional parameters.
All values are case sensitive.
Be careful: nohang doesn't forbid you to shoot yourself in the foot.
The configuration includes the following sections:
0. Common zram settings
1. Memory levels to respond to as an OOM threat
2. Response on PSI memory metrics
3. The frequency of checking the level of available memory
(and CPU usage)
4. The prevention of killing innocent victims
5. Impact on the badness of processes via matching their
- names,
- cgroups,
- cmdlines and
- UIDs
with regular expressions
5. Impact on the badness of processes via matching their names, cgroups and
cmdlines with specified regular expressions
6. Customize corrective actions: the execution of a specific command
instead of sending the SIGTERM signal
7. GUI notifications:
- OOM prevention results and
- low memory warnings
- OOM prevention results
8. Output verbosity
9. Misc
Just read the description of the parameters and edit the values.
Please restart the program after editing the config.
Bool values are case sensitive.
###############################################################################
#####################################################################
0. Common zram settings
See https://www.kernel.org/doc/Documentation/blockdev/zram.txt
You may need to set `ignore_zram = False` if you have a large zram disksize.
ignore_zram = False
1. Thresholds below which a signal should be sent to the victim
@ -57,9 +61,9 @@ swap_min_sigkill = 5 %
numbers from the range [0; 90] %.
zram_max_sigterm = 50 %
zram_max_sigkill = 55 %
zram_max_sigkill = 60 %
#####################################################################
###############################################################################
2. Response on PSI memory metrics (it needs Linux 4.20 and up)
@ -102,7 +106,7 @@ sigkill_psi_threshold = 90
psi_post_action_delay = 60
#####################################################################
###############################################################################
3. The frequency of checking the amount of available memory
(and CPU usage)
@ -124,7 +128,7 @@ psi_post_action_delay = 60
rate_mem = 4000
rate_swap = 1500
rate_zram = 500
rate_zram = 6000
See also https://github.com/rfjakob/earlyoom/issues/61
@ -135,7 +139,7 @@ min_sleep = 0.1
over_sleep = 0.05
#####################################################################
###############################################################################
4. The prevention of killing innocent victims
@ -144,7 +148,7 @@ over_sleep = 0.05
min_badness = 20
Valid values are non-negative floating-point numbers.
Min delay if a victim does not respond to SIGTERM in 10 ms.
Min delay if a victim doesn't respond to SIGTERM in 10 ms.
min_delay_after_sigterm = 3
@ -157,7 +161,7 @@ decrease_oom_score_adj = False
oom_score_adj_max = 0
#####################################################################
###############################################################################
5. Impact on the badness of processes via matching their names,
cmdlines or UIDs with regular expressions using re.search().
@ -194,21 +198,15 @@ oom_score_adj_max = 0
A good option that allows fine adjustment.
Prefer electron-based apps and chromium tabs
Prefer chromium tabs and electron-based apps
@CMDLINE_RE 200 /// --type=renderer
Prefer firefox tabs
@CMDLINE_RE 100 /// -greomni|-childID
@CMDLINE_RE -500 /// python
Prefer firefox tabs (Web Content and WebExtensions)
@CMDLINE_RE 100 /// -appomni
@CMDLINE_RE -200 /// ^/usr/lib/virtualbox
5.3 Matching UIDs with RE patterns
5.3 Matching eUIDs with RE patterns
The slowest option
@ -232,10 +230,11 @@ oom_score_adj_max = 0
@ENVIRON_RE 100 /// USER=user
Note that you can control badness also via systemd units via OOMScoreAdjust, see
https://www.freedesktop.org/software/systemd/man/systemd.exec.html#OOMScoreAdjust=
Note that you can control badness also via systemd units via
OOMScoreAdjust, see
www.freedesktop.org/software/systemd/man/systemd.exec.html#OOMScoreAdjust=
#####################################################################
###############################################################################
6. Customize corrective actions.
@ -247,14 +246,15 @@ oom_score_adj_max = 0
@SOFT_ACTION_RE_NAME ^foo$ /// kill -SEGV $PID
@SOFT_ACTION_RE_NAME ^bash$ /// kill -9 $PID
@SOFT_ACTION_RE_CGROUP_V1 ^/system.slice/ /// systemctl restart $SERVICE
@SOFT_ACTION_RE_CGROUP_V1 foo.service$ /// systemctl restart $SERVICE
@SOFT_ACTION_RE_CGROUP_V1 ^/system.slice/ /// systemctl restart $SERVICE
@SOFT_ACTION_RE_CGROUP_V1 foo.service$ /// systemctl restart $SERVICE
$PID will be replaced by process PID.
$NAME will be replaced by process name.
$SERVICE will be replaced by .service if it exists (otherwise it will be replaced by an empty line).
$SERVICE will be replaced by .service if it exists (otherwise it will be
replaced by an empty line)
#####################################################################
###############################################################################
7. GUI notifications:
- OOM prevention results and
@ -289,7 +289,7 @@ min_time_between_warnings = 15
Ampersands (&) will be replaced with asterisks (*) in process
names and in commands.
#####################################################################
###############################################################################
8. Verbosity
@ -303,7 +303,7 @@ print_config = False
print_mem_check_results = False
min_mem_report_interval = 60
min_mem_report_interval = 300
Print sleep periods between memory checks.
Valid values are True and False.
@ -327,15 +327,13 @@ extra_table_info = cgroup_v1
print_victim_info = False
# print_victim_cmdline
max_ancestry_depth = 1
max_ancestry_depth = 10
separate_log = False
psi_debug = False
#####################################################################
###############################################################################
9. Misc

View File

@ -122,7 +122,7 @@ send_signal = SIGTERM
# os.kill(int(pid), SIGCONT)
os.kill(int(pid), send_signal)
# os.kill(int(pid), send_signal)
t0 = time()