do not check zram by default
This commit is contained in:
parent
77da0efb9a
commit
944c13be7e
645
nohang
645
nohang
@ -11,92 +11,6 @@ from sre_constants import error as invalid_re
|
||||
from signal import signal, SIGKILL, SIGTERM, SIGINT, SIGQUIT, SIGHUP
|
||||
|
||||
|
||||
start_time = time()
|
||||
|
||||
|
||||
help_mess = """usage: nohang [-h] [-v] [-t] [-p] [-c CONFIG]
|
||||
|
||||
optional arguments:
|
||||
-h, --help show this help message and exit
|
||||
-v, --version print version
|
||||
-t, --test print some tests
|
||||
-p, --print-proc-table
|
||||
print table of processes with their badness values
|
||||
-c CONFIG, --config CONFIG
|
||||
path to the config file, default values:
|
||||
./nohang.conf, /etc/nohang/nohang.conf"""
|
||||
|
||||
|
||||
SC_CLK_TCK = os.sysconf(os.sysconf_names['SC_CLK_TCK'])
|
||||
|
||||
SC_PAGESIZE = os.sysconf(os.sysconf_names['SC_PAGESIZE'])
|
||||
|
||||
conf_err_mess = 'Invalid config. Exit.'
|
||||
|
||||
sig_list = [SIGTERM, SIGINT, SIGQUIT, SIGHUP]
|
||||
|
||||
sig_dict = {
|
||||
SIGKILL: 'SIGKILL',
|
||||
SIGINT: 'SIGINT',
|
||||
SIGQUIT: 'SIGQUIT',
|
||||
SIGHUP: 'SIGHUP',
|
||||
SIGTERM: 'SIGTERM'
|
||||
}
|
||||
|
||||
self_pid = str(os.getpid())
|
||||
|
||||
self_uid = os.geteuid()
|
||||
|
||||
if self_uid == 0:
|
||||
root = True
|
||||
else:
|
||||
root = False
|
||||
|
||||
|
||||
if os.path.exists('./nohang_notify_helper'):
|
||||
notify_helper_path = './nohang_notify_helper'
|
||||
else:
|
||||
notify_helper_path = '/usr/sbin/nohang_notify_helper'
|
||||
|
||||
|
||||
victim_dict = dict()
|
||||
|
||||
|
||||
|
||||
victim_id = None
|
||||
actions_time_dict = dict()
|
||||
actions_time_dict['action_handled'] = [time(), victim_id]
|
||||
# print(actions_time_dict)
|
||||
|
||||
|
||||
|
||||
# will store corrective actions stat
|
||||
stat_dict = dict()
|
||||
|
||||
|
||||
separate_log = False # will be overwritten after parse config
|
||||
|
||||
|
||||
def find_cgroup_indexes():
    """Locate the cgroup lines of interest in /proc/self/cgroup.

    Returns a tuple ``(cgroup_v1_index, cgroup_v2_index)`` of zero-based
    line numbers:

    * the first element is the number of the last line that contains
      ``':name='``;
    * the second element is the number of the last line that starts
      with ``'0::'``.

    An element is None when no line matched.
    """
    with open('/proc/self/cgroup') as cgroup_file:
        entries = cgroup_file.readlines()

    v1_position = None
    v2_position = None

    for position, entry in enumerate(entries):
        # The two checks are independent: one line may satisfy both.
        if ':name=' in entry:
            v1_position = position
        if entry.startswith('0::'):
            v2_position = position

    return v1_position, v2_position
|
||||
|
||||
|
||||
cgroup_v1_index, cgroup_v2_index = find_cgroup_indexes()
|
||||
|
||||
|
||||
##########################################################################
|
||||
|
||||
# define functions
|
||||
@ -115,7 +29,62 @@ def print_self_rss():
|
||||
'''
|
||||
|
||||
|
||||
def get_swap_threshold_tuple(string):
    """Convert a swap threshold string from the config into a tuple.

    Two suffixes are accepted:

    * ``'<number>%'`` gives ``(number, True)``; the number must lie in
      the range [0; 100].
    * ``'<number>M'`` gives ``(number * 1024, False)``; the number must
      not be negative. The original comment describes this value as KiB.

    The boolean tells the caller whether the first element is a
    percentage. On invalid input a message is passed to errprint() and
    exit(1) is called.
    """
    if string.endswith('%'):
        number_text = string[:-1]

        # The helper returns None when the text is not a valid float.
        if string_to_float_convert_test(number_text) is None:
            errprint('somewhere swap unit is not float_%')
            exit(1)

        percent_value = float(number_text.strip())
        out_of_range = percent_value < 0 or percent_value > 100
        if out_of_range:
            errprint('invalid value, must be from the range[0; 100] %')
            exit(1)

        return percent_value, True

    if string.endswith('M'):
        number_text = string[:-1]

        if string_to_float_convert_test(number_text) is None:
            errprint('somewhere swap unit is not float_M')
            exit(1)

        scaled_value = float(number_text.strip()) * 1024
        if scaled_value < 0:
            errprint('invalid unit in config (negative value)')
            exit(1)

        return scaled_value, False

    # Neither '%' nor 'M' suffix: the unit is unknown.
    errprint(
        'Invalid config file. There are invalid units somewhere\nExit')
    exit(1)
|
||||
|
||||
|
||||
def find_cgroup_indexes():
    """Locate the cgroup lines of interest in /proc/self/cgroup.

    Returns a tuple ``(cgroup_v1_index, cgroup_v2_index)`` of zero-based
    line numbers:

    * the first element is the number of the last line that contains
      ``':name='``;
    * the second element is the number of the last line that starts
      with ``'0::'``.

    An element is None when no line matched.
    """
    with open('/proc/self/cgroup') as cgroup_file:
        entries = cgroup_file.readlines()

    v1_position = None
    v2_position = None

    for position, entry in enumerate(entries):
        # The two checks are independent: one line may satisfy both.
        if ':name=' in entry:
            v1_position = position
        if entry.startswith('0::'):
            v2_position = position

    return v1_position, v2_position
|
||||
|
||||
|
||||
def pid_to_rss(pid):
|
||||
"""
|
||||
"""
|
||||
try:
|
||||
rss = int(rline1(
|
||||
'/proc/{}/statm'.format(pid)).split(' ')[1]) * SC_PAGESIZE
|
||||
@ -129,6 +98,8 @@ def pid_to_rss(pid):
|
||||
|
||||
|
||||
def pid_to_vm_size(pid):
|
||||
"""
|
||||
"""
|
||||
try:
|
||||
vm_size = int(rline1(
|
||||
'/proc/{}/statm'.format(pid)).partition(' ')[0]) * SC_PAGESIZE
|
||||
@ -141,12 +112,6 @@ def pid_to_vm_size(pid):
|
||||
return vm_size
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
def signal_handler(signum, frame):
|
||||
"""
|
||||
"""
|
||||
@ -193,13 +158,6 @@ def write_self_oom_score_adj(new_value):
|
||||
write('/proc/self/oom_score_adj', new_value)
|
||||
|
||||
|
||||
self_oom_score_adj_min = '-600'
|
||||
self_oom_score_adj_max = '-6'
|
||||
|
||||
|
||||
write_self_oom_score_adj(self_oom_score_adj_min)
|
||||
|
||||
|
||||
def valid_re(reg_exp):
|
||||
"""Validate regular expression.
|
||||
"""
|
||||
@ -431,6 +389,8 @@ def pid_to_environ(pid):
|
||||
|
||||
|
||||
def pid_to_realpath(pid):
|
||||
"""
|
||||
"""
|
||||
try:
|
||||
return os.path.realpath('/proc/' + pid + '/exe')
|
||||
except FileNotFoundError:
|
||||
@ -615,9 +575,6 @@ def pid_to_status_unicode(pid):
|
||||
return None
|
||||
|
||||
|
||||
##########################################################################
|
||||
|
||||
|
||||
def uptime():
|
||||
"""
|
||||
"""
|
||||
@ -993,9 +950,6 @@ def get_pid_list():
|
||||
return pid_list
|
||||
|
||||
|
||||
pid_list = get_pid_list()
|
||||
|
||||
|
||||
def get_non_decimal_pids():
|
||||
"""
|
||||
"""
|
||||
@ -1362,23 +1316,14 @@ def find_victim_info(pid, victim_badness, name):
|
||||
return victim_info
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
def implement_corrective_action(signal):
|
||||
"""
|
||||
Find victim with highest badness and send SIGTERM/SIGKILL
|
||||
"""
|
||||
time0 = time()
|
||||
|
||||
|
||||
# выходим из фции, если для SIGTERM порога не превышено время min_delay_after_sigterm и спим в течение over_sleep
|
||||
# выходим из фции, если для SIGTERM порога не превышено время
|
||||
# min_delay_after_sigterm и спим в течение over_sleep
|
||||
if signal is SIGTERM:
|
||||
|
||||
dt = time() - actions_time_dict['action_handled'][0]
|
||||
@ -1397,13 +1342,12 @@ def implement_corrective_action(signal):
|
||||
else:
|
||||
print('min_delay_after_sigterm IS EXCEEDED, it is time to action')
|
||||
|
||||
|
||||
|
||||
|
||||
"""
|
||||
|
||||
При заходе в фцию проверяем права на сигтерм. Права на сигкилл у нас всегда есть
|
||||
(потому что идем дальше только после полн освободж памяти после смерти жертвы)
|
||||
При заходе в фцию проверяем права на сигтерм. Права на сигкилл у нас
|
||||
всегда есть
|
||||
(потому что идем дальше только после полн освободж памяти после
|
||||
смерти жертвы)
|
||||
|
||||
actions_time_dict[action_handled] = time()
|
||||
actions_time_dict[veto] = True
|
||||
@ -1414,7 +1358,6 @@ def implement_corrective_action(signal):
|
||||
|
||||
"""
|
||||
|
||||
|
||||
log(mem_info)
|
||||
|
||||
pid, victim_badness, name = find_victim(print_proc_table)
|
||||
@ -1425,10 +1368,8 @@ def implement_corrective_action(signal):
|
||||
victim_info = find_victim_info(pid, victim_badness, name)
|
||||
log(victim_info)
|
||||
|
||||
|
||||
|
||||
|
||||
# пороги могли превысиься за время поиска жертвы (поиск может занимать сотни миллисекунд)
|
||||
# пороги могли превысиься за время поиска жертвы (поиск может занимать
|
||||
# сотни миллисекунд)
|
||||
mem_available, swap_total, swap_free = check_mem_and_swap()
|
||||
|
||||
ma_mib = int(mem_available) / 1024.0
|
||||
@ -1445,15 +1386,8 @@ def implement_corrective_action(signal):
|
||||
log('Hard threshold exceeded')
|
||||
signal = SIGKILL
|
||||
|
||||
|
||||
|
||||
victim_id = get_victim_id(pid)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# kill the victim if it doesn't respond to SIGTERM В ТЕЧЕНИЕ
|
||||
# ЗАДАННОГО ВРЕМЕНИ
|
||||
|
||||
@ -1465,19 +1399,10 @@ def implement_corrective_action(signal):
|
||||
dt = time() - victim_dict[victim_id]
|
||||
|
||||
if dt > max_post_sigterm_victim_lifetime:
|
||||
print('max_post_sigterm_victim_lifetime exceeded: the victim will get SIGKILL')
|
||||
print('max_post_sigterm_victim_lifetime exceeded: the '
|
||||
'victim will get SIGKILL')
|
||||
signal = SIGKILL
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# matching with re to customize corrective actions
|
||||
soft_match = False
|
||||
|
||||
@ -1529,8 +1454,10 @@ def implement_corrective_action(signal):
|
||||
|
||||
response_time = time() - time0
|
||||
|
||||
# тут надо, как и при дефолтном действии, проверять существование жертвы, ее реакцию на действие,
|
||||
# и время ее смерти в случае успеха, о обновление таймстемпов действия
|
||||
# тут надо, как и при дефолтном действии, проверять существование
|
||||
# жертвы, ее реакцию на действие,
|
||||
# и время ее смерти в случае успеха, о обновление таймстемпов
|
||||
# действия
|
||||
|
||||
etc_info = 'Implement a corrective act' \
|
||||
'ion:\n Run the command: {}' \
|
||||
@ -1552,71 +1479,49 @@ def implement_corrective_action(signal):
|
||||
command.replace('$PID', pid).replace(
|
||||
'$NAME', pid_to_name(pid)))
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
else:
|
||||
|
||||
# обычное действие через сигнал
|
||||
try:
|
||||
|
||||
|
||||
os.kill(int(pid), signal)
|
||||
kill_timestamp = time()
|
||||
response_time = kill_timestamp - time0
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
while True:
|
||||
exe_exists = os.path.exists('/proc/{}/exe'.format(pid))
|
||||
rss = pid_to_rss(pid)
|
||||
dt = time() - kill_timestamp
|
||||
log('Victim VmRSS: {} KiB'.format(rss))
|
||||
if not exe_exists or rss == 0 or dt > 0.01:
|
||||
#print(dt)
|
||||
# print(dt)
|
||||
break
|
||||
sleep(0.001)
|
||||
|
||||
if dt > 0.01:
|
||||
log('Timer (value = 0.01 sec) expired; seems' \
|
||||
log('Timer (value = 0.01 sec) expired; seems'
|
||||
' like the victim handles signal')
|
||||
|
||||
actions_time_dict['action_handled'] = [time(), get_victim_id(pid)]
|
||||
|
||||
actions_time_dict['action_handled'] = [
|
||||
time(), get_victim_id(pid)]
|
||||
|
||||
if victim_id not in victim_dict: # хз как надо.
|
||||
victim_dict.update({victim_id: time()})
|
||||
|
||||
|
||||
# log('actions_time_dict', actions_time_dict)
|
||||
# log('victim_dict', victim_dict)
|
||||
|
||||
|
||||
|
||||
|
||||
else:
|
||||
log('Process exited (VmRSS = 0) in {} sec'.format(
|
||||
round(dt, 5)))
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
if signal is SIGKILL or not exe_exists or rss == 0:
|
||||
|
||||
while True:
|
||||
sleep(0.001)
|
||||
rss = pid_to_rss(pid) # рсс не важен когда путь не существует. Проверяй просто существование пид.
|
||||
# рсс не важен когда путь не существует. Проверяй
|
||||
# просто существование пид.
|
||||
rss = pid_to_rss(pid)
|
||||
if rss is None:
|
||||
break
|
||||
t1 = time()
|
||||
@ -1624,7 +1529,6 @@ def implement_corrective_action(signal):
|
||||
log('The victim died in {} sec'.format(
|
||||
round(kill_duration, 3)))
|
||||
|
||||
|
||||
mem_available, swap_total, swap_free = check_mem_and_swap()
|
||||
|
||||
ma_mib = int(mem_available) / 1024.0
|
||||
@ -1636,12 +1540,6 @@ def implement_corrective_action(signal):
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
send_result = 'total response time: {} ms'.format(
|
||||
round(response_time * 1000))
|
||||
|
||||
@ -1700,11 +1598,12 @@ def implement_corrective_action(signal):
|
||||
update_stat_dict_and_print(key)
|
||||
|
||||
# тут надо поспать хорошенько. а может и счетчики поправить.
|
||||
# херню несу. во-первых, внезапно может кто-то появиться c блльшим бэднес.. Далее надо минимизировать аутпут спам.
|
||||
# херню несу. во-первых, внезапно может кто-то появиться c блльшим
|
||||
# бэднес.. Далее надо минимизировать аутпут спам.
|
||||
sleep(over_sleep)
|
||||
|
||||
|
||||
# обновлять время не на каждый кил, а только на килл той жертвы, которая не отвечала на софт экшн.
|
||||
# обновлять время не на каждый кил, а только на килл той жертвы,
|
||||
# которая не отвечала на софт экшн.
|
||||
# Вывод: ко времени действия прилагать также виктим айди.
|
||||
|
||||
print('##################################################################')
|
||||
@ -1739,17 +1638,23 @@ def sleep_after_check_mem():
|
||||
|
||||
t_mem = mem_point / rate_mem
|
||||
t_swap = swap_point / rate_swap
|
||||
t_zram = (mem_total * 0.8 - mem_used_zram) / rate_zram
|
||||
if t_zram < 0:
|
||||
t_zram = 0
|
||||
|
||||
if CHECK_ZRAM:
|
||||
t_zram = (mem_total * 0.8 - mem_used_zram) / rate_zram
|
||||
if t_zram < 0:
|
||||
t_zram = 0
|
||||
t_mem_zram = t_mem + t_zram
|
||||
|
||||
t_mem_swap = t_mem + t_swap
|
||||
t_mem_zram = t_mem + t_zram
|
||||
|
||||
if t_mem_swap <= t_mem_zram:
|
||||
t = t_mem_swap
|
||||
if CHECK_ZRAM:
|
||||
|
||||
if t_mem_swap <= t_mem_zram:
|
||||
t = t_mem_swap
|
||||
else:
|
||||
t = t_mem_zram
|
||||
else:
|
||||
t = t_mem_zram
|
||||
t = t_mem_swap
|
||||
|
||||
if t > max_sleep:
|
||||
t = max_sleep
|
||||
@ -1841,6 +1746,83 @@ def calculate_percent(arg_key):
|
||||
##########################################################################
|
||||
|
||||
|
||||
start_time = time()
|
||||
|
||||
|
||||
help_mess = """usage: nohang [-h] [-v] [-t] [-p] [-c CONFIG]
|
||||
|
||||
optional arguments:
|
||||
-h, --help show this help message and exit
|
||||
-v, --version print version
|
||||
-t, --test print some tests
|
||||
-p, --print-proc-table
|
||||
print table of processes with their badness values
|
||||
-c CONFIG, --config CONFIG
|
||||
path to the config file, default values:
|
||||
./nohang.conf, /etc/nohang/nohang.conf"""
|
||||
|
||||
|
||||
SC_CLK_TCK = os.sysconf(os.sysconf_names['SC_CLK_TCK'])
|
||||
|
||||
SC_PAGESIZE = os.sysconf(os.sysconf_names['SC_PAGESIZE'])
|
||||
|
||||
conf_err_mess = 'Invalid config. Exit.'
|
||||
|
||||
sig_list = [SIGTERM, SIGINT, SIGQUIT, SIGHUP]
|
||||
|
||||
sig_dict = {
|
||||
SIGKILL: 'SIGKILL',
|
||||
SIGINT: 'SIGINT',
|
||||
SIGQUIT: 'SIGQUIT',
|
||||
SIGHUP: 'SIGHUP',
|
||||
SIGTERM: 'SIGTERM'
|
||||
}
|
||||
|
||||
self_pid = str(os.getpid())
|
||||
|
||||
self_uid = os.geteuid()
|
||||
|
||||
if self_uid == 0:
|
||||
root = True
|
||||
else:
|
||||
root = False
|
||||
|
||||
|
||||
if os.path.exists('./nohang_notify_helper'):
|
||||
notify_helper_path = './nohang_notify_helper'
|
||||
else:
|
||||
notify_helper_path = '/usr/sbin/nohang_notify_helper'
|
||||
|
||||
|
||||
victim_dict = dict()
|
||||
|
||||
|
||||
victim_id = None
|
||||
actions_time_dict = dict()
|
||||
actions_time_dict['action_handled'] = [time(), victim_id]
|
||||
# print(actions_time_dict)
|
||||
|
||||
|
||||
# will store corrective actions stat
|
||||
stat_dict = dict()
|
||||
|
||||
|
||||
separate_log = False # will be overwritten after parse config
|
||||
|
||||
|
||||
cgroup_v1_index, cgroup_v2_index = find_cgroup_indexes()
|
||||
|
||||
|
||||
self_oom_score_adj_min = '-600'
|
||||
self_oom_score_adj_max = '-6'
|
||||
|
||||
|
||||
write_self_oom_score_adj(self_oom_score_adj_min)
|
||||
|
||||
|
||||
pid_list = get_pid_list()
|
||||
|
||||
|
||||
print_proc_table_flag = False
|
||||
|
||||
if len(argv) == 1:
|
||||
@ -1879,9 +1861,6 @@ else:
|
||||
exit(1)
|
||||
|
||||
|
||||
##########################################################################
|
||||
|
||||
|
||||
# find mem_total
|
||||
# find positions of SwapFree and SwapTotal in /proc/meminfo
|
||||
|
||||
@ -1928,8 +1907,6 @@ except ValueError:
|
||||
detailed_rss = False
|
||||
# print('It is not Linux 4.5+')
|
||||
|
||||
##########################################################################
|
||||
|
||||
|
||||
log('Config: ' + config)
|
||||
|
||||
@ -2167,6 +2144,8 @@ gui_low_memory_warnings = conf_parse_bool('gui_low_memory_warnings')
|
||||
gui_notifications = conf_parse_bool('gui_notifications')
|
||||
decrease_oom_score_adj = conf_parse_bool('decrease_oom_score_adj')
|
||||
ignore_psi = conf_parse_bool('ignore_psi')
|
||||
ignore_zram = conf_parse_bool('ignore_zram')
|
||||
|
||||
|
||||
(mem_min_sigterm_kb, mem_min_sigterm_mb, mem_min_sigterm_percent
|
||||
) = calculate_percent('mem_min_sigterm')
|
||||
@ -2559,43 +2538,6 @@ psi_support = os.path.exists(psi_path)
|
||||
# Get KiB levels if it's possible.
|
||||
|
||||
|
||||
def get_swap_threshold_tuple(string):
    """Convert a swap threshold string from the config into a tuple.

    Two suffixes are accepted:

    * ``'<number>%'`` gives ``(number, True)``; the number must lie in
      the range [0; 100].
    * ``'<number>M'`` gives ``(number * 1024, False)``; the number must
      not be negative. The original comment describes this value as KiB.

    The boolean tells the caller whether the first element is a
    percentage. On invalid input a message is passed to errprint() and
    exit(1) is called.
    """
    if string.endswith('%'):
        number_text = string[:-1]

        # The helper returns None when the text is not a valid float.
        if string_to_float_convert_test(number_text) is None:
            errprint('somewhere swap unit is not float_%')
            exit(1)

        percent_value = float(number_text.strip())
        out_of_range = percent_value < 0 or percent_value > 100
        if out_of_range:
            errprint('invalid value, must be from the range[0; 100] %')
            exit(1)

        return percent_value, True

    if string.endswith('M'):
        number_text = string[:-1]

        if string_to_float_convert_test(number_text) is None:
            errprint('somewhere swap unit is not float_M')
            exit(1)

        scaled_value = float(number_text.strip()) * 1024
        if scaled_value < 0:
            errprint('invalid unit in config (negative value)')
            exit(1)

        return scaled_value, False

    # Neither '%' nor 'M' suffix: the unit is unknown.
    errprint(
        'Invalid config file. There are invalid units somewhere\nExit')
    exit(1)
|
||||
|
||||
|
||||
swap_min_sigterm_tuple = get_swap_threshold_tuple(swap_min_sigterm)
|
||||
swap_min_sigkill_tuple = get_swap_threshold_tuple(swap_min_sigkill)
|
||||
swap_min_warnings_tuple = get_swap_threshold_tuple(swap_min_warnings)
|
||||
@ -2732,14 +2674,9 @@ mlockall()
|
||||
# print_self_rss()
|
||||
|
||||
|
||||
log('Monitoring has started!')
|
||||
|
||||
stdout.flush()
|
||||
|
||||
##########################################################################
|
||||
|
||||
psi_avg_string = '' # will be overwritten if PSI monitoring enabled
|
||||
|
||||
mem_used_zram = 0
|
||||
|
||||
if psi_support and not ignore_psi:
|
||||
psi_t0 = time()
|
||||
@ -2760,58 +2697,26 @@ for i in sig_list:
|
||||
signal(i, signal_handler)
|
||||
|
||||
|
||||
CHECK_PSI = False
|
||||
if psi_support and not ignore_psi:
|
||||
CHECK_PSI = True
|
||||
|
||||
|
||||
CHECK_ZRAM = not ignore_zram
|
||||
|
||||
log('Monitoring has started!')
|
||||
|
||||
stdout.flush()
|
||||
|
||||
|
||||
##########################################################################
|
||||
|
||||
|
||||
while True:
|
||||
|
||||
if psi_support and not ignore_psi:
|
||||
# Q = time()
|
||||
|
||||
psi_avg_value = find_psi_metrics_value(psi_path, psi_metrics)
|
||||
|
||||
if print_mem_check_results:
|
||||
psi_avg_string = 'PSI avg value: {} | '.format(
|
||||
str(psi_avg_value).rjust(6))
|
||||
|
||||
if psi_avg_value >= sigkill_psi_threshold:
|
||||
sigkill_psi_exceeded = True
|
||||
else:
|
||||
sigkill_psi_exceeded = False
|
||||
|
||||
if psi_avg_value >= sigterm_psi_threshold:
|
||||
sigterm_psi_exceeded = True
|
||||
else:
|
||||
sigterm_psi_exceeded = False
|
||||
|
||||
if time() - psi_t0 >= psi_post_action_delay:
|
||||
psi_post_action_delay_exceeded = True
|
||||
else:
|
||||
psi_post_action_delay_exceeded = False
|
||||
|
||||
if psi_debug:
|
||||
log('sigterm_psi_exceeded: {}, sigkill_psi_exceeded: {}, ps'
|
||||
'i_post_action_delay_exceeded: {}'.format(
|
||||
sigterm_psi_exceeded,
|
||||
sigkill_psi_exceeded,
|
||||
psi_post_action_delay_exceeded))
|
||||
|
||||
if sigkill_psi_exceeded and psi_post_action_delay_exceeded:
|
||||
time0 = time()
|
||||
mem_info = 'PSI avg value ({}) > sigkill_psi_thresh' \
|
||||
'old ({})'.format(
|
||||
psi_avg_value, sigkill_psi_threshold)
|
||||
|
||||
implement_corrective_action(SIGKILL)
|
||||
|
||||
psi_t0 = time()
|
||||
continue
|
||||
|
||||
if sigterm_psi_exceeded and psi_post_action_delay_exceeded:
|
||||
time0 = time()
|
||||
mem_info = 'PSI avg value ({}) > sigterm_psi_thre' \
|
||||
'shold ({})'.format(psi_avg_value, sigterm_psi_threshold)
|
||||
|
||||
implement_corrective_action(SIGTERM)
|
||||
|
||||
psi_t0 = time()
|
||||
continue
|
||||
# FIND VALUES: mem, swap, zram, psi
|
||||
|
||||
mem_available, swap_total, swap_free = check_mem_and_swap()
|
||||
|
||||
@ -2825,7 +2730,29 @@ while True:
|
||||
if swap_warn_is_percent:
|
||||
swap_min_warnings_kb = swap_total * swap_min_warnings_percent / 100.0
|
||||
|
||||
mem_used_zram = check_zram()
|
||||
if swap_total > swap_min_sigkill_kb:
|
||||
swap_sigkill_pc = percent(swap_min_sigkill_kb / (swap_total + 0.1))
|
||||
else:
|
||||
swap_sigkill_pc = '-'
|
||||
|
||||
if swap_total > swap_min_sigterm_kb:
|
||||
swap_sigterm_pc = percent(swap_min_sigterm_kb / (swap_total + 0.1))
|
||||
else:
|
||||
swap_sigterm_pc = '-'
|
||||
|
||||
if CHECK_ZRAM:
|
||||
mem_used_zram = check_zram()
|
||||
|
||||
if CHECK_PSI:
|
||||
psi_avg_value = find_psi_metrics_value(psi_path, psi_metrics)
|
||||
if time() - psi_t0 >= psi_post_action_delay:
|
||||
psi_post_action_delay_exceeded = True
|
||||
else:
|
||||
psi_post_action_delay_exceeded = False
|
||||
|
||||
if print_mem_check_results:
|
||||
psi_avg_string = 'PSI avg value: {} | '.format(
|
||||
str(psi_avg_value).rjust(6))
|
||||
|
||||
if print_mem_check_results:
|
||||
|
||||
@ -2894,20 +2821,12 @@ while True:
|
||||
)
|
||||
)
|
||||
|
||||
if swap_total > swap_min_sigkill_kb:
|
||||
swap_sigkill_pc = percent(swap_min_sigkill_kb / (swap_total + 0.1))
|
||||
else:
|
||||
swap_sigkill_pc = '-'
|
||||
###########################################################################
|
||||
|
||||
if swap_total > swap_min_sigterm_kb:
|
||||
swap_sigterm_pc = percent(swap_min_sigterm_kb / (swap_total + 0.1))
|
||||
else:
|
||||
swap_sigterm_pc = '-'
|
||||
# CHECK HARD THRESHOLDS (SIGKILL LEVEL)
|
||||
|
||||
# MEM SWAP KILL
|
||||
if (mem_available <= mem_min_sigkill_kb and
|
||||
swap_free <= swap_min_sigkill_kb):
|
||||
time0 = time()
|
||||
|
||||
mem_info = 'Hard threshold exceeded\nMemory status that requ' \
|
||||
'ires corrective actions:' \
|
||||
@ -2924,33 +2843,47 @@ while True:
|
||||
swap_sigkill_pc)
|
||||
|
||||
implement_corrective_action(SIGKILL)
|
||||
|
||||
psi_t0 = time()
|
||||
continue
|
||||
|
||||
# ZRAM KILL
|
||||
if mem_used_zram >= zram_max_sigkill_kb:
|
||||
time0 = time()
|
||||
if CHECK_ZRAM:
|
||||
if mem_used_zram >= zram_max_sigkill_kb:
|
||||
|
||||
mem_info = 'Hard threshold exceeded\nMemory status that requir' \
|
||||
'es corrective actions:' \
|
||||
'\n MemUsedZram [{} MiB, {} %] >= zram_max_sig' \
|
||||
'kill [{} MiB, {} %]'.format(
|
||||
kib_to_mib(mem_used_zram),
|
||||
percent(mem_used_zram / mem_total),
|
||||
kib_to_mib(zram_max_sigkill_kb),
|
||||
percent(zram_max_sigkill_kb / mem_total))
|
||||
mem_info = 'Hard threshold exceeded\nMemory status that requir' \
|
||||
'es corrective actions:' \
|
||||
'\n MemUsedZram [{} MiB, {} %] >= zram_max_sig' \
|
||||
'kill [{} MiB, {} %]'.format(
|
||||
kib_to_mib(mem_used_zram),
|
||||
percent(mem_used_zram / mem_total),
|
||||
kib_to_mib(zram_max_sigkill_kb),
|
||||
percent(zram_max_sigkill_kb / mem_total))
|
||||
|
||||
implement_corrective_action(SIGKILL)
|
||||
implement_corrective_action(SIGKILL)
|
||||
psi_t0 = time()
|
||||
continue
|
||||
|
||||
psi_t0 = time()
|
||||
continue
|
||||
if CHECK_PSI:
|
||||
if psi_avg_value >= sigkill_psi_threshold:
|
||||
sigkill_psi_exceeded = True
|
||||
else:
|
||||
sigkill_psi_exceeded = False
|
||||
|
||||
# MEM SWAP TERM
|
||||
if mem_available <= mem_min_sigterm_kb and \
|
||||
swap_free <= swap_min_sigterm_kb:
|
||||
if sigkill_psi_exceeded and psi_post_action_delay_exceeded:
|
||||
|
||||
time0 = time()
|
||||
mem_info = 'PSI avg value ({}) > sigkill_psi_thresh' \
|
||||
'old ({})'.format(
|
||||
psi_avg_value, sigkill_psi_threshold)
|
||||
|
||||
implement_corrective_action(SIGKILL)
|
||||
psi_t0 = time()
|
||||
continue
|
||||
|
||||
###########################################################################
|
||||
|
||||
# CHECK SOFT THRESHOLDS (SIGTERM LEVEL)
|
||||
|
||||
if (mem_available <= mem_min_sigterm_kb and
|
||||
swap_free <= swap_min_sigterm_kb):
|
||||
|
||||
mem_info = 'Soft threshold exceeded\nMemory status that requi' \
|
||||
'res corrective actions:' \
|
||||
@ -2967,34 +2900,54 @@ while True:
|
||||
swap_sigterm_pc)
|
||||
|
||||
implement_corrective_action(SIGTERM)
|
||||
|
||||
psi_t0 = time()
|
||||
continue
|
||||
|
||||
# ZRAM TERM
|
||||
if mem_used_zram >= zram_max_sigterm_kb:
|
||||
time0 = time()
|
||||
if CHECK_ZRAM:
|
||||
if mem_used_zram >= zram_max_sigterm_kb:
|
||||
|
||||
mem_info = 'Soft threshold exceeded\nMemory status that requ' \
|
||||
'ires corrective actions:' \
|
||||
'\n MemUsedZram [{} MiB, {} %] >= ' \
|
||||
'zram_max_sigterm [{} M, {} %]'.format(
|
||||
kib_to_mib(mem_used_zram),
|
||||
percent(mem_used_zram / mem_total),
|
||||
kib_to_mib(zram_max_sigterm_kb),
|
||||
percent(zram_max_sigterm_kb / mem_total))
|
||||
mem_info = 'Soft threshold exceeded\nMemory status that require' \
|
||||
's corrective actions:\n MemUsedZram [{} MiB, {} %] >= zra' \
|
||||
'm_max_sigterm [{} M, {} %]'.format(
|
||||
kib_to_mib(mem_used_zram),
|
||||
percent(mem_used_zram / mem_total),
|
||||
kib_to_mib(zram_max_sigterm_kb),
|
||||
percent(zram_max_sigterm_kb / mem_total))
|
||||
|
||||
implement_corrective_action(SIGTERM)
|
||||
implement_corrective_action(SIGTERM)
|
||||
psi_t0 = time()
|
||||
continue
|
||||
|
||||
psi_t0 = time()
|
||||
continue
|
||||
if CHECK_PSI:
|
||||
if psi_avg_value >= sigterm_psi_threshold:
|
||||
sigterm_psi_exceeded = True
|
||||
else:
|
||||
sigterm_psi_exceeded = False
|
||||
|
||||
if psi_debug:
|
||||
log('sigterm_psi_exceeded: {}, sigkill_psi_exceeded: {}, ps'
|
||||
'i_post_action_delay_exceeded: {}'.format(
|
||||
sigterm_psi_exceeded,
|
||||
sigkill_psi_exceeded,
|
||||
psi_post_action_delay_exceeded))
|
||||
|
||||
if sigterm_psi_exceeded and psi_post_action_delay_exceeded:
|
||||
|
||||
mem_info = 'PSI avg value ({}) > sigterm_psi_thre' \
|
||||
'shold ({})'.format(psi_avg_value, sigterm_psi_threshold)
|
||||
|
||||
implement_corrective_action(SIGTERM)
|
||||
psi_t0 = time()
|
||||
continue
|
||||
|
||||
###########################################################################
|
||||
|
||||
# LOW MEMORY WARNINGS
|
||||
if gui_low_memory_warnings:
|
||||
|
||||
if mem_available <= mem_min_warnings_kb and \
|
||||
swap_free <= swap_min_warnings_kb + 0.1 or \
|
||||
mem_used_zram >= zram_max_warnings_kb:
|
||||
if (mem_available <= mem_min_warnings_kb and
|
||||
swap_free <= swap_min_warnings_kb + 0.1 or
|
||||
mem_used_zram >= zram_max_warnings_kb):
|
||||
|
||||
warn_time_delta = time() - warn_time_now
|
||||
warn_time_now = time()
|
||||
warn_timer += warn_time_delta
|
||||
@ -3003,17 +2956,7 @@ while True:
|
||||
warn_timer = 0
|
||||
|
||||
|
||||
# x = time() - Q
|
||||
# print(x * 1000)
|
||||
|
||||
|
||||
# SLEEP BETWEEN MEM CHECKS
|
||||
sleep_after_check_mem()
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
76
nohang.conf
76
nohang.conf
@ -1,34 +1,38 @@
|
||||
This is nohang config file.
|
||||
Lines starting with #, tabs and spaces are comments.
|
||||
Lines starting with @ contain optional parameters.
|
||||
All values are case sensitive.
|
||||
Be careful: nohang doesn't forbid you to shoot yourself in the foot.
|
||||
|
||||
The configuration includes the following sections:
|
||||
|
||||
0. Common zram settings
|
||||
1. Memory levels to respond to as an OOM threat
|
||||
2. Response on PSI memory metrics
|
||||
3. The frequency of checking the level of available memory
|
||||
(and CPU usage)
|
||||
4. The prevention of killing innocent victims
|
||||
5. Impact on the badness of processes via matching their
|
||||
- names,
|
||||
- cgroups,
|
||||
- cmdlines and
|
||||
- UIDs
|
||||
with regular expressions
|
||||
5. Impact on the badness of processes via matching their names, cgroups and
|
||||
cmdlines with specified regular expressions
|
||||
6. Customize corrective actions: the execution of a specific command
|
||||
instead of sending the SIGTERM signal
|
||||
7. GUI notifications:
|
||||
- OOM prevention results and
|
||||
- low memory warnings
|
||||
- OOM prevention results
|
||||
8. Output verbosity
|
||||
9. Misc
|
||||
|
||||
Just read the description of the parameters and edit the values.
|
||||
Please restart the program after editing the config.
|
||||
|
||||
Bool values are case sensitive.
|
||||
###############################################################################
|
||||
|
||||
#####################################################################
|
||||
0. Common zram settings
|
||||
|
||||
See https://www.kernel.org/doc/Documentation/blockdev/zram.txt
|
||||
You may need to set `ignore_zram = False` if you have a large zram disksize.
|
||||
|
||||
ignore_zram = False
|
||||
|
||||
1. Thresholds below which a signal should be sent to the victim
|
||||
|
||||
@ -57,9 +61,9 @@ swap_min_sigkill = 5 %
|
||||
numbers from the range [0; 90] %.
|
||||
|
||||
zram_max_sigterm = 50 %
|
||||
zram_max_sigkill = 55 %
|
||||
zram_max_sigkill = 60 %
|
||||
|
||||
#####################################################################
|
||||
###############################################################################
|
||||
|
||||
2. Response on PSI memory metrics (it needs Linux 4.20 and up)
|
||||
|
||||
@ -102,7 +106,7 @@ sigkill_psi_threshold = 90
|
||||
|
||||
psi_post_action_delay = 60
|
||||
|
||||
#####################################################################
|
||||
###############################################################################
|
||||
|
||||
3. The frequency of checking the amount of available memory
|
||||
(and CPU usage)
|
||||
@ -124,7 +128,7 @@ psi_post_action_delay = 60
|
||||
|
||||
rate_mem = 4000
|
||||
rate_swap = 1500
|
||||
rate_zram = 500
|
||||
rate_zram = 6000
|
||||
|
||||
See also https://github.com/rfjakob/earlyoom/issues/61
|
||||
|
||||
@ -135,7 +139,7 @@ min_sleep = 0.1
|
||||
|
||||
over_sleep = 0.05
|
||||
|
||||
#####################################################################
|
||||
###############################################################################
|
||||
|
||||
4. The prevention of killing innocent victims
|
||||
|
||||
@ -144,7 +148,7 @@ over_sleep = 0.05
|
||||
min_badness = 20
|
||||
|
||||
Valid values are non-negative floating-point numbers.
|
||||
Min delay if a victim does not respond to SIGTERM in 10 ms.
|
||||
Min delay if a victim doesn't respond to SIGTERM in 10 ms.
|
||||
|
||||
min_delay_after_sigterm = 3
|
||||
|
||||
@ -157,7 +161,7 @@ decrease_oom_score_adj = False
|
||||
|
||||
oom_score_adj_max = 0
|
||||
|
||||
#####################################################################
|
||||
###############################################################################
|
||||
|
||||
5. Impact on the badness of processes via matching their names,
|
||||
cmdlines or UIDs with regular expressions using re.search().
|
||||
@ -194,21 +198,15 @@ oom_score_adj_max = 0
|
||||
|
||||
A good option that allows fine adjustment.
|
||||
|
||||
Prefer electron-based apps and chromium tabs
|
||||
Prefer chromium tabs and electron-based apps
|
||||
@CMDLINE_RE 200 /// --type=renderer
|
||||
|
||||
Prefer firefox tabs
|
||||
@CMDLINE_RE 100 /// -greomni|-childID
|
||||
|
||||
|
||||
@CMDLINE_RE -500 /// python
|
||||
|
||||
|
||||
|
||||
Prefer firefox tabs (Web Content and WebExtensions)
|
||||
@CMDLINE_RE 100 /// -appomni
|
||||
|
||||
@CMDLINE_RE -200 /// ^/usr/lib/virtualbox
|
||||
|
||||
5.3 Matching UIDs with RE patterns
|
||||
5.3 Matching eUIDs with RE patterns
|
||||
|
||||
The slowest option
|
||||
|
||||
@ -232,10 +230,11 @@ oom_score_adj_max = 0
|
||||
|
||||
@ENVIRON_RE 100 /// USER=user
|
||||
|
||||
Note that you can control badness also via systemd units via OOMScoreAdjust, see
|
||||
https://www.freedesktop.org/software/systemd/man/systemd.exec.html#OOMScoreAdjust=
|
||||
Note that you can also control badness through systemd units via
|
||||
OOMScoreAdjust, see
|
||||
www.freedesktop.org/software/systemd/man/systemd.exec.html#OOMScoreAdjust=
|
||||
|
||||
#####################################################################
|
||||
###############################################################################
|
||||
|
||||
6. Customize corrective actions.
|
||||
|
||||
@ -247,14 +246,15 @@ oom_score_adj_max = 0
|
||||
@SOFT_ACTION_RE_NAME ^foo$ /// kill -SEGV $PID
|
||||
@SOFT_ACTION_RE_NAME ^bash$ /// kill -9 $PID
|
||||
|
||||
@SOFT_ACTION_RE_CGROUP_V1 ^/system.slice/ /// systemctl restart $SERVICE
|
||||
@SOFT_ACTION_RE_CGROUP_V1 foo.service$ /// systemctl restart $SERVICE
|
||||
@SOFT_ACTION_RE_CGROUP_V1 ^/system.slice/ /// systemctl restart $SERVICE
|
||||
@SOFT_ACTION_RE_CGROUP_V1 foo.service$ /// systemctl restart $SERVICE
|
||||
|
||||
$PID will be replaced by process PID.
|
||||
$NAME will be replaced by process name.
|
||||
$SERVICE will be replaced by .service if it exists (otherwise it will be replaced by an empty line).
|
||||
$SERVICE will be replaced by .service if it exists (otherwise it will be
|
||||
replaced by an empty line)
|
||||
|
||||
#####################################################################
|
||||
###############################################################################
|
||||
|
||||
7. GUI notifications:
|
||||
- OOM prevention results and
|
||||
@ -289,7 +289,7 @@ min_time_between_warnings = 15
|
||||
Ampersands (&) will be replaced with asterisks (*) in process
|
||||
names and in commands.
|
||||
|
||||
#####################################################################
|
||||
###############################################################################
|
||||
|
||||
8. Verbosity
|
||||
|
||||
@ -303,7 +303,7 @@ print_config = False
|
||||
|
||||
print_mem_check_results = False
|
||||
|
||||
min_mem_report_interval = 60
|
||||
min_mem_report_interval = 300
|
||||
|
||||
Print sleep periods between memory checks.
|
||||
Valid values are True and False.
|
||||
@ -327,15 +327,13 @@ extra_table_info = cgroup_v1
|
||||
|
||||
print_victim_info = False
|
||||
|
||||
# print_victim_cmdline
|
||||
|
||||
max_ancestry_depth = 1
|
||||
max_ancestry_depth = 10
|
||||
|
||||
separate_log = False
|
||||
|
||||
psi_debug = False
|
||||
|
||||
#####################################################################
|
||||
###############################################################################
|
||||
|
||||
9. Misc
|
||||
|
||||
|
@ -122,7 +122,7 @@ send_signal = SIGTERM
|
||||
# os.kill(int(pid), SIGCONT)
|
||||
|
||||
|
||||
os.kill(int(pid), send_signal)
|
||||
# os.kill(int(pid), send_signal)
|
||||
t0 = time()
|
||||
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user