handle errors; add extra_table_info option; fix output
This commit is contained in:
parent
846c6640b9
commit
146a6324cf
285
nohang
285
nohang
@ -52,8 +52,6 @@ else:
|
|||||||
victim_dict = dict()
|
victim_dict = dict()
|
||||||
|
|
||||||
|
|
||||||
# extra_process_table_info = None/cmdline/realpath # (todo)
|
|
||||||
|
|
||||||
# will store corrective actions stat
|
# will store corrective actions stat
|
||||||
stat_dict = dict()
|
stat_dict = dict()
|
||||||
|
|
||||||
@ -62,7 +60,7 @@ separate_log = False # will be overwritten after parse config
|
|||||||
|
|
||||||
|
|
||||||
with open('/proc/self/cgroup') as f:
|
with open('/proc/self/cgroup') as f:
|
||||||
# Find cgroup-line position in /proc/*/cgroup file."""
|
# Find cgroup-line position in /proc/*/cgroup file.
|
||||||
for cgroup_index, line in enumerate(f):
|
for cgroup_index, line in enumerate(f):
|
||||||
if ':name=' in line:
|
if ':name=' in line:
|
||||||
break
|
break
|
||||||
@ -73,12 +71,35 @@ with open('/proc/self/cgroup') as f:
|
|||||||
# define functions
|
# define functions
|
||||||
|
|
||||||
|
|
||||||
|
def write(path, string):
|
||||||
|
"""
|
||||||
|
"""
|
||||||
|
with open(path, 'w') as f:
|
||||||
|
f.write(string)
|
||||||
|
|
||||||
|
|
||||||
|
def write_self_oom_score_adj(new_value):
|
||||||
|
"""
|
||||||
|
"""
|
||||||
|
if root:
|
||||||
|
write('/proc/self/oom_score_adj', new_value)
|
||||||
|
|
||||||
|
|
||||||
|
self_oom_score_adj_min = '-900'
|
||||||
|
self_oom_score_adj_max = '-9'
|
||||||
|
|
||||||
|
|
||||||
|
write_self_oom_score_adj(self_oom_score_adj_min)
|
||||||
|
|
||||||
|
|
||||||
def exe(cmd):
|
def exe(cmd):
|
||||||
"""
|
"""
|
||||||
"""
|
"""
|
||||||
log('Execute the command: {}'.format(cmd))
|
log('Execute the command: {}'.format(cmd))
|
||||||
t0 = time()
|
t0 = time()
|
||||||
|
write_self_oom_score_adj(self_oom_score_adj_max)
|
||||||
err = os.system(cmd)
|
err = os.system(cmd)
|
||||||
|
write_self_oom_score_adj(self_oom_score_adj_min)
|
||||||
dt = time() - t0
|
dt = time() - t0
|
||||||
log('Exit status: {}; exe duration: {} sec'.format(err, round(dt, 3)))
|
log('Exit status: {}; exe duration: {} sec'.format(err, round(dt, 3)))
|
||||||
return err
|
return err
|
||||||
@ -97,10 +118,13 @@ def valid_re(reg_exp):
|
|||||||
def pid_to_cgroup(pid):
|
def pid_to_cgroup(pid):
|
||||||
"""
|
"""
|
||||||
"""
|
"""
|
||||||
with open('/proc/' + pid + '/cgroup') as f:
|
try:
|
||||||
for n, line in enumerate(f):
|
with open('/proc/' + pid + '/cgroup') as f:
|
||||||
if n == cgroup_index:
|
for n, line in enumerate(f):
|
||||||
return '/' + line.partition('/')[2][:-1]
|
if n == cgroup_index:
|
||||||
|
return '/' + line.partition('/')[2][:-1]
|
||||||
|
except FileNotFoundError:
|
||||||
|
return ''
|
||||||
|
|
||||||
|
|
||||||
def func_print_proc_table():
|
def func_print_proc_table():
|
||||||
@ -114,11 +138,23 @@ def func_print_proc_table():
|
|||||||
def log(*msg):
|
def log(*msg):
|
||||||
"""
|
"""
|
||||||
"""
|
"""
|
||||||
print(*msg)
|
try:
|
||||||
|
print(*msg)
|
||||||
|
except OSError:
|
||||||
|
sleep(0.01)
|
||||||
|
pass
|
||||||
|
# print('OSError in print(*msg)')
|
||||||
|
|
||||||
if separate_log:
|
if separate_log:
|
||||||
# need fix: TypeError: not all arguments converted during string
|
# need fix: TypeError: not all arguments converted during string
|
||||||
# formatting
|
# formatting
|
||||||
info(*msg)
|
|
||||||
|
try:
|
||||||
|
info(*msg)
|
||||||
|
except OSError:
|
||||||
|
sleep(0.01)
|
||||||
|
pass
|
||||||
|
# print('OSError in info(*msg)')
|
||||||
|
|
||||||
|
|
||||||
def print_version():
|
def print_version():
|
||||||
@ -207,7 +243,7 @@ def uptime():
|
|||||||
|
|
||||||
|
|
||||||
def pid_to_starttime(pid):
|
def pid_to_starttime(pid):
|
||||||
"""
|
""" handle FNF error!
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
starttime = rline1('/proc/' + pid + '/stat').rpartition(')')[
|
starttime = rline1('/proc/' + pid + '/stat').rpartition(')')[
|
||||||
@ -224,8 +260,11 @@ def pid_to_starttime(pid):
|
|||||||
|
|
||||||
def get_victim_id(pid):
|
def get_victim_id(pid):
|
||||||
"""victim_id is starttime + pid"""
|
"""victim_id is starttime + pid"""
|
||||||
return rline1('/proc/' + pid + '/stat').rpartition(
|
try:
|
||||||
')')[2].split(' ')[20] + pid
|
return rline1('/proc/' + pid + '/stat').rpartition(
|
||||||
|
')')[2].split(' ')[20] + pid
|
||||||
|
except FileNotFoundError:
|
||||||
|
return ''
|
||||||
|
|
||||||
|
|
||||||
def errprint(*text):
|
def errprint(*text):
|
||||||
@ -259,7 +298,7 @@ def mlockall():
|
|||||||
|
|
||||||
|
|
||||||
def pid_to_state(pid):
|
def pid_to_state(pid):
|
||||||
"""
|
""" Handle FNF error! (BTW it already handled in find_victim_info())
|
||||||
"""
|
"""
|
||||||
return rline1('/proc/' + pid + '/stat').rpartition(')')[2][1]
|
return rline1('/proc/' + pid + '/stat').rpartition(')')[2][1]
|
||||||
|
|
||||||
@ -501,31 +540,6 @@ def zram_stat(zram_id):
|
|||||||
return disksize, mem_used_total # BYTES, str
|
return disksize, mem_used_total # BYTES, str
|
||||||
|
|
||||||
|
|
||||||
'''
|
|
||||||
def pid_to_name(pid):
|
|
||||||
"""
|
|
||||||
Get process name by pid.
|
|
||||||
|
|
||||||
pid: str pid of required process
|
|
||||||
returns string process_name
|
|
||||||
"""
|
|
||||||
try:
|
|
||||||
with open('/proc/' + pid + '/status') as f:
|
|
||||||
f.seek(6)
|
|
||||||
for line in f:
|
|
||||||
return line[:-1]
|
|
||||||
except FileNotFoundError:
|
|
||||||
return ''
|
|
||||||
except ProcessLookupError:
|
|
||||||
return ''
|
|
||||||
except UnicodeDecodeError:
|
|
||||||
with open('/proc/' + pid + '/status', 'rb') as f:
|
|
||||||
f.seek(6)
|
|
||||||
return f.read(15).decode(
|
|
||||||
'utf-8', 'ignore').partition('\n')[0]
|
|
||||||
'''
|
|
||||||
|
|
||||||
|
|
||||||
def pid_to_name(pid):
|
def pid_to_name(pid):
|
||||||
"""
|
"""
|
||||||
"""
|
"""
|
||||||
@ -538,9 +552,6 @@ def pid_to_name(pid):
|
|||||||
return ''
|
return ''
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def pid_to_ppid(pid):
|
def pid_to_ppid(pid):
|
||||||
"""
|
"""
|
||||||
"""
|
"""
|
||||||
@ -591,8 +602,11 @@ def pid_to_cmdline(pid):
|
|||||||
pid: str pid of required process
|
pid: str pid of required process
|
||||||
returns string cmdline
|
returns string cmdline
|
||||||
"""
|
"""
|
||||||
with open('/proc/' + pid + '/cmdline') as f:
|
try:
|
||||||
return f.read().replace('\x00', ' ').rstrip()
|
with open('/proc/' + pid + '/cmdline') as f:
|
||||||
|
return f.read().replace('\x00', ' ').rstrip()
|
||||||
|
except FileNotFoundError:
|
||||||
|
return ''
|
||||||
|
|
||||||
|
|
||||||
def pid_to_realpath(pid):
|
def pid_to_realpath(pid):
|
||||||
@ -613,6 +627,8 @@ def pid_to_uid(pid):
|
|||||||
with open('/proc/' + pid + '/status', 'rb') as f:
|
with open('/proc/' + pid + '/status', 'rb') as f:
|
||||||
f_list = f.read().decode('utf-8', 'ignore').split('\n')
|
f_list = f.read().decode('utf-8', 'ignore').split('\n')
|
||||||
return f_list[uid_index].split('\t')[2]
|
return f_list[uid_index].split('\t')[2]
|
||||||
|
except FileNotFoundError:
|
||||||
|
return ''
|
||||||
|
|
||||||
|
|
||||||
def send_notify_warn():
|
def send_notify_warn():
|
||||||
@ -666,13 +682,12 @@ def send_notify_warn():
|
|||||||
# send notification to user that runs this nohang
|
# send notification to user that runs this nohang
|
||||||
notify_send_wait(title, body)
|
notify_send_wait(title, body)
|
||||||
'''
|
'''
|
||||||
#os.system('echo --- \ - $(sleep 5) - ')
|
|
||||||
|
|
||||||
t0 = time()
|
# os.system('echo --- \ - $(sleep 5) - ')
|
||||||
|
|
||||||
|
print('Warning threshold exceeded')
|
||||||
|
|
||||||
if check_warning_exe:
|
if check_warning_exe:
|
||||||
|
|
||||||
print('Warning threshold exceeded')
|
|
||||||
exe(warning_exe)
|
exe(warning_exe)
|
||||||
|
|
||||||
else:
|
else:
|
||||||
@ -684,9 +699,6 @@ def send_notify_warn():
|
|||||||
)
|
)
|
||||||
send_notification(title, body)
|
send_notification(title, body)
|
||||||
|
|
||||||
t1 = time()
|
|
||||||
print('Warning duration:', t1 - t0)
|
|
||||||
|
|
||||||
|
|
||||||
def send_notify(signal, name, pid):
|
def send_notify(signal, name, pid):
|
||||||
"""
|
"""
|
||||||
@ -728,11 +740,9 @@ def send_notify_etc(pid, name, command):
|
|||||||
pid: str process pid
|
pid: str process pid
|
||||||
"""
|
"""
|
||||||
title = 'Freeze prevention'
|
title = 'Freeze prevention'
|
||||||
body = '<b>Victim is</b> [{}] <b>{}</b>\nExecute the command:\n<b>{}</b>'.format(
|
body = '<b>Victim is</b> [{}] <b>{}</b>\nExecute the co' \
|
||||||
pid,
|
'mmand:\n<b>{}</b>'.format(
|
||||||
name.replace('&', '*'),
|
pid, name.replace('&', '*'), command.replace('&', '*'))
|
||||||
command.replace('&', '*')
|
|
||||||
)
|
|
||||||
|
|
||||||
send_notification(title, body)
|
send_notification(title, body)
|
||||||
|
|
||||||
@ -759,9 +769,14 @@ def send_notification(title, body):
|
|||||||
|
|
||||||
text = '{}{}{}'.format(title, split_by, body)
|
text = '{}{}{}'.format(title, split_by, body)
|
||||||
|
|
||||||
with open(path_to_cache, 'w') as f:
|
try:
|
||||||
f.write(text)
|
with open(path_to_cache, 'w') as f:
|
||||||
os.chmod(path_to_cache, 0o600)
|
f.write(text)
|
||||||
|
os.chmod(path_to_cache, 0o600)
|
||||||
|
except OSError:
|
||||||
|
log('OSError while send notification '
|
||||||
|
'(No space left on device: /dev/shm)')
|
||||||
|
return None
|
||||||
|
|
||||||
cmd = '{} --uid {} --time {} &'.format(notify_helper_path, self_uid, t000)
|
cmd = '{} --uid {} --time {} &'.format(notify_helper_path, self_uid, t000)
|
||||||
|
|
||||||
@ -882,14 +897,34 @@ def find_victim(_print_proc_table):
|
|||||||
non_decimal_list = get_non_decimal_pids()
|
non_decimal_list = get_non_decimal_pids()
|
||||||
|
|
||||||
for i in non_decimal_list:
|
for i in non_decimal_list:
|
||||||
pid_list.remove(i)
|
if i in pid_list: # ????????????????????????????????????????????
|
||||||
|
pid_list.remove(i)
|
||||||
|
|
||||||
pid_badness_list = []
|
pid_badness_list = []
|
||||||
|
|
||||||
if _print_proc_table:
|
if _print_proc_table:
|
||||||
|
|
||||||
|
if extra_table_info == 'None':
|
||||||
|
extra_table_title = ''
|
||||||
|
|
||||||
|
elif extra_table_info == 'cgroup':
|
||||||
|
extra_table_title = 'CGroup'
|
||||||
|
|
||||||
|
elif extra_table_info == 'cmdline':
|
||||||
|
extra_table_title = 'cmdline'
|
||||||
|
|
||||||
|
elif extra_table_info == 'realpath':
|
||||||
|
extra_table_title = 'realpath'
|
||||||
|
|
||||||
|
elif extra_table_info == 'All':
|
||||||
|
extra_table_title = '[CGroup] [CmdLine] [RealPath]'
|
||||||
|
else:
|
||||||
|
extra_table_title = ''
|
||||||
|
|
||||||
log('=============================================================='
|
log('=============================================================='
|
||||||
'=================')
|
'=================')
|
||||||
log(' PID badness Name eUID CGroup')
|
log(' PID badness Name eUID {}'.format(
|
||||||
|
extra_table_title))
|
||||||
log('------- ------- --------------- ---------- -----------'
|
log('------- ------- --------------- ---------- -----------'
|
||||||
'----------------------')
|
'----------------------')
|
||||||
|
|
||||||
@ -900,18 +935,39 @@ def find_victim(_print_proc_table):
|
|||||||
continue
|
continue
|
||||||
|
|
||||||
if _print_proc_table:
|
if _print_proc_table:
|
||||||
|
|
||||||
|
if extra_table_info == 'None':
|
||||||
|
extra_table_line = ''
|
||||||
|
|
||||||
|
elif extra_table_info == 'cgroup':
|
||||||
|
extra_table_line = pid_to_cgroup(pid)
|
||||||
|
|
||||||
|
elif extra_table_info == 'cmdline':
|
||||||
|
extra_table_line = pid_to_cmdline(pid)
|
||||||
|
|
||||||
|
elif extra_table_info == 'realpath':
|
||||||
|
extra_table_line = pid_to_realpath(pid)
|
||||||
|
|
||||||
|
elif extra_table_info == 'All':
|
||||||
|
extra_table_line = '[CG: {}] [CL: {}] [RP: {}]'.format(
|
||||||
|
pid_to_cgroup(pid),
|
||||||
|
pid_to_cmdline(pid),
|
||||||
|
pid_to_realpath(pid)
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
extra_table_line = ''
|
||||||
|
|
||||||
log('{} {} {} {} {}'.format(
|
log('{} {} {} {} {}'.format(
|
||||||
pid.rjust(7),
|
pid.rjust(7),
|
||||||
str(badness).rjust(7),
|
str(badness).rjust(7),
|
||||||
pid_to_name(pid).ljust(15),
|
pid_to_name(pid).ljust(15),
|
||||||
|
# сейчас ищем уид, а надо всего побольше, и состояние памяти.
|
||||||
|
# Написать безопасную фцию для нахождения для каждого процесса:
|
||||||
pid_to_uid(pid).rjust(10),
|
pid_to_uid(pid).rjust(10),
|
||||||
# pid_to_cmdline(pid)
|
# Name, PPID, State, VmSize, VmRSS, VmSwap, Threads - на основе
|
||||||
pid_to_realpath(pid)
|
# find victim info.
|
||||||
# pid_to_cgroup(pid)
|
extra_table_line)
|
||||||
# pid_to_name(pid)
|
)
|
||||||
# ''
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
pid_badness_list.append((pid, badness))
|
pid_badness_list.append((pid, badness))
|
||||||
|
|
||||||
@ -946,10 +1002,15 @@ def find_victim(_print_proc_table):
|
|||||||
return pid, victim_badness, victim_name
|
return pid, victim_badness, victim_name
|
||||||
|
|
||||||
|
|
||||||
|
def find_status_for_proc_table(pid):
|
||||||
|
"""
|
||||||
|
"""
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
def find_victim_info(pid, victim_badness, name):
|
def find_victim_info(pid, victim_badness, name):
|
||||||
"""
|
"""
|
||||||
"""
|
"""
|
||||||
|
|
||||||
status0 = time()
|
status0 = time()
|
||||||
|
|
||||||
try:
|
try:
|
||||||
@ -1085,6 +1146,8 @@ def find_victim_info(pid, victim_badness, name):
|
|||||||
|
|
||||||
try:
|
try:
|
||||||
realpath = os.path.realpath('/proc/' + pid + '/exe')
|
realpath = os.path.realpath('/proc/' + pid + '/exe')
|
||||||
|
victim_lifetime = format_time(uptime() - pid_to_starttime(pid))
|
||||||
|
victim_cgroup = pid_to_cgroup(pid)
|
||||||
except FileNotFoundError:
|
except FileNotFoundError:
|
||||||
print('The victim died in the search process: FileNotFoundError')
|
print('The victim died in the search process: FileNotFoundError')
|
||||||
update_stat_dict_and_print(
|
update_stat_dict_and_print(
|
||||||
@ -1106,10 +1169,6 @@ def find_victim_info(pid, victim_badness, name):
|
|||||||
else:
|
else:
|
||||||
detailed_rss_info = ''
|
detailed_rss_info = ''
|
||||||
|
|
||||||
victim_lifetime = format_time(uptime() - pid_to_starttime(pid))
|
|
||||||
|
|
||||||
victim_cgroup = pid_to_cgroup(pid)
|
|
||||||
|
|
||||||
victim_info = 'Victim information (found in {} ms):' \
|
victim_info = 'Victim information (found in {} ms):' \
|
||||||
'\n Name: {}' \
|
'\n Name: {}' \
|
||||||
'\n State: {}' \
|
'\n State: {}' \
|
||||||
@ -1147,7 +1206,6 @@ def find_victim_info(pid, victim_badness, name):
|
|||||||
return victim_info
|
return victim_info
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# для дедупликации уведомлений
|
# для дедупликации уведомлений
|
||||||
dick = dict()
|
dick = dict()
|
||||||
dick['v'] = [1, 2, 3, time()]
|
dick['v'] = [1, 2, 3, time()]
|
||||||
@ -1196,28 +1254,13 @@ def implement_corrective_action(signal):
|
|||||||
'ion:\n MemAvailable'
|
'ion:\n MemAvailable'
|
||||||
': {} MiB, SwapFree: {} MiB'.format(ma, sf))
|
': {} MiB, SwapFree: {} MiB'.format(ma, sf))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
cmd = etc_dict[name].replace('$PID', pid).replace(
|
cmd = etc_dict[name].replace('$PID', pid).replace(
|
||||||
'$NAME', pid_to_name(pid))
|
'$NAME', pid_to_name(pid))
|
||||||
|
|
||||||
|
|
||||||
exit_status = exe(cmd)
|
exit_status = exe(cmd)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
exit_status = str(exit_status)
|
exit_status = str(exit_status)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
response_time = time() - time0
|
response_time = time() - time0
|
||||||
|
|
||||||
etc_info = 'Implement a corrective act' \
|
etc_info = 'Implement a corrective act' \
|
||||||
@ -1274,7 +1317,6 @@ def implement_corrective_action(signal):
|
|||||||
|
|
||||||
exe(cmd)
|
exe(cmd)
|
||||||
|
|
||||||
|
|
||||||
if gui_notifications:
|
if gui_notifications:
|
||||||
|
|
||||||
# min delay after same notification
|
# min delay after same notification
|
||||||
@ -1288,14 +1330,14 @@ def implement_corrective_action(signal):
|
|||||||
|
|
||||||
y = dick['v']
|
y = dick['v']
|
||||||
|
|
||||||
#print(y[3] - x[3])
|
# print(y[3] - x[3])
|
||||||
|
|
||||||
if x[0] == y[0] and x[1] == y[1] and x[2] == y[2]:
|
if x[0] == y[0] and x[1] == y[1] and x[2] == y[2]:
|
||||||
#print('совпадение имени, пид, сигнала')
|
# print('совпадение имени, пид, сигнала')
|
||||||
|
|
||||||
# сохр в словаре первре совпавшее время
|
# сохр в словаре первре совпавшее время
|
||||||
dt = y[3] - x[3]
|
dt = y[3] - x[3]
|
||||||
#print(dt, 'dt')
|
# print(dt, 'dt')
|
||||||
if dt < delay_after_same_notify:
|
if dt < delay_after_same_notify:
|
||||||
notif = False
|
notif = False
|
||||||
|
|
||||||
@ -1315,7 +1357,10 @@ def implement_corrective_action(signal):
|
|||||||
key = 'ProcessLookupError (the victim died in the se' \
|
key = 'ProcessLookupError (the victim died in the se' \
|
||||||
'arch process): '
|
'arch process): '
|
||||||
|
|
||||||
log(preventing_oom_message)
|
try:
|
||||||
|
log(preventing_oom_message)
|
||||||
|
except UnboundLocalError:
|
||||||
|
preventing_oom_message = key
|
||||||
|
|
||||||
update_stat_dict_and_print(key)
|
update_stat_dict_and_print(key)
|
||||||
|
|
||||||
@ -1388,7 +1433,10 @@ def sleep_after_check_mem():
|
|||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
stdout.flush()
|
try:
|
||||||
|
stdout.flush()
|
||||||
|
except OSError: # OSError: [Errno 105] No buffer space available
|
||||||
|
pass
|
||||||
|
|
||||||
try:
|
try:
|
||||||
sleep(t)
|
sleep(t)
|
||||||
@ -1568,18 +1616,12 @@ cgroup_re_list = []
|
|||||||
realpath_re_list = []
|
realpath_re_list = []
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# dictionary with names and commands for the parameter
|
# dictionary with names and commands for the parameter
|
||||||
# execute_the_command
|
# execute_the_command
|
||||||
# тут тоже список нужен, а не словарь
|
# тут тоже список нужен, а не словарь
|
||||||
etc_dict = dict()
|
etc_dict = dict()
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
with open(config) as f:
|
with open(config) as f:
|
||||||
|
|
||||||
@ -1595,7 +1637,6 @@ try:
|
|||||||
if not a and not b and not c and not d and not etc:
|
if not a and not b and not c and not d and not etc:
|
||||||
a = line.partition('=')
|
a = line.partition('=')
|
||||||
|
|
||||||
|
|
||||||
key = a[0].strip()
|
key = a[0].strip()
|
||||||
value = a[2].strip()
|
value = a[2].strip()
|
||||||
|
|
||||||
@ -1656,11 +1697,6 @@ try:
|
|||||||
realpath_re_list.append((badness_adj, reg_exp))
|
realpath_re_list.append((badness_adj, reg_exp))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
except PermissionError:
|
except PermissionError:
|
||||||
errprint('PermissionError', conf_err_mess)
|
errprint('PermissionError', conf_err_mess)
|
||||||
exit(1)
|
exit(1)
|
||||||
@ -1689,8 +1725,8 @@ except FileNotFoundError:
|
|||||||
# check for all necessary parameters
|
# check for all necessary parameters
|
||||||
# validation of all parameters
|
# validation of all parameters
|
||||||
psi_debug = conf_parse_bool('psi_debug')
|
psi_debug = conf_parse_bool('psi_debug')
|
||||||
|
print_total_stat = conf_parse_bool('print_total_stat')
|
||||||
|
print_proc_table = conf_parse_bool('print_proc_table')
|
||||||
forbid_negative_badness = conf_parse_bool('forbid_negative_badness')
|
forbid_negative_badness = conf_parse_bool('forbid_negative_badness')
|
||||||
print_victim_info = conf_parse_bool('print_victim_info')
|
print_victim_info = conf_parse_bool('print_victim_info')
|
||||||
print_config = conf_parse_bool('print_config')
|
print_config = conf_parse_bool('print_config')
|
||||||
@ -1966,8 +2002,17 @@ else:
|
|||||||
exit(1)
|
exit(1)
|
||||||
|
|
||||||
|
|
||||||
print_total_stat = conf_parse_bool('print_total_stat')
|
if 'extra_table_info' in config_dict:
|
||||||
print_proc_table = conf_parse_bool('print_proc_table')
|
extra_table_info = config_dict['extra_table_info']
|
||||||
|
if (extra_table_info != 'None' and extra_table_info != 'cgroup' and
|
||||||
|
extra_table_info != 'cmdline' and extra_table_info != 'realpath' and
|
||||||
|
extra_table_info != 'All'):
|
||||||
|
errprint('Invalid config: invalid extra_table_info value\nExit')
|
||||||
|
exit(1)
|
||||||
|
else:
|
||||||
|
errprint('Invalid config: extra_table_info is not in config\nExit')
|
||||||
|
exit(1)
|
||||||
|
|
||||||
|
|
||||||
separate_log = conf_parse_bool('separate_log')
|
separate_log = conf_parse_bool('separate_log')
|
||||||
|
|
||||||
@ -2308,8 +2353,9 @@ while True:
|
|||||||
|
|
||||||
if sigkill_psi_exceeded and psi_post_action_delay_exceeded:
|
if sigkill_psi_exceeded and psi_post_action_delay_exceeded:
|
||||||
time0 = time()
|
time0 = time()
|
||||||
mem_info = 'PSI avg value ({}) > sigkill_psi_threshold ({})'.format(
|
mem_info = 'PSI avg value ({}) > sigkill_psi_thresh' \
|
||||||
psi_avg_value, sigkill_psi_threshold)
|
'old ({})'.format(
|
||||||
|
psi_avg_value, sigkill_psi_threshold)
|
||||||
|
|
||||||
implement_corrective_action(SIGKILL)
|
implement_corrective_action(SIGKILL)
|
||||||
|
|
||||||
@ -2318,8 +2364,8 @@ while True:
|
|||||||
|
|
||||||
if sigterm_psi_exceeded and psi_post_action_delay_exceeded:
|
if sigterm_psi_exceeded and psi_post_action_delay_exceeded:
|
||||||
time0 = time()
|
time0 = time()
|
||||||
mem_info = 'PSI avg value ({}) > sigterm_psi_threshold ({})'.format(
|
mem_info = 'PSI avg value ({}) > sigterm_psi_thre' \
|
||||||
psi_avg_value, sigterm_psi_threshold)
|
'shold ({})'.format(psi_avg_value, sigterm_psi_threshold)
|
||||||
|
|
||||||
implement_corrective_action(SIGTERM)
|
implement_corrective_action(SIGTERM)
|
||||||
|
|
||||||
@ -2437,7 +2483,7 @@ while True:
|
|||||||
time0 = time()
|
time0 = time()
|
||||||
|
|
||||||
mem_info = 'Hard threshold exceeded\nMemory status that requ' \
|
mem_info = 'Hard threshold exceeded\nMemory status that requ' \
|
||||||
'ires corrective actions:' \
|
'ires corrective actions:' \
|
||||||
'\n MemAvailable [{} MiB, {} %] <= mem_min_sig' \
|
'\n MemAvailable [{} MiB, {} %] <= mem_min_sig' \
|
||||||
'kill [{} MiB, {} %]\n SwapFree [{} MiB, {} %] <= swa' \
|
'kill [{} MiB, {} %]\n SwapFree [{} MiB, {} %] <= swa' \
|
||||||
'p_min_sigkill [{} MiB, {} %]'.format(
|
'p_min_sigkill [{} MiB, {} %]'.format(
|
||||||
@ -2451,6 +2497,7 @@ while True:
|
|||||||
swap_sigkill_pc)
|
swap_sigkill_pc)
|
||||||
|
|
||||||
implement_corrective_action(SIGKILL)
|
implement_corrective_action(SIGKILL)
|
||||||
|
|
||||||
psi_t0 = time()
|
psi_t0 = time()
|
||||||
continue
|
continue
|
||||||
|
|
||||||
@ -2468,6 +2515,7 @@ while True:
|
|||||||
percent(zram_max_sigkill_kb / mem_total))
|
percent(zram_max_sigkill_kb / mem_total))
|
||||||
|
|
||||||
implement_corrective_action(SIGKILL)
|
implement_corrective_action(SIGKILL)
|
||||||
|
|
||||||
psi_t0 = time()
|
psi_t0 = time()
|
||||||
continue
|
continue
|
||||||
|
|
||||||
@ -2478,7 +2526,7 @@ while True:
|
|||||||
time0 = time()
|
time0 = time()
|
||||||
|
|
||||||
mem_info = 'Soft threshold exceeded\nMemory status that requi' \
|
mem_info = 'Soft threshold exceeded\nMemory status that requi' \
|
||||||
'res corrective actions:' \
|
'res corrective actions:' \
|
||||||
'\n MemAvailable [{} MiB, {} %] <= mem_min_sig' \
|
'\n MemAvailable [{} MiB, {} %] <= mem_min_sig' \
|
||||||
'term [{} MiB, {} %]\n SwapFree [{} MiB, {} %] <= swa' \
|
'term [{} MiB, {} %]\n SwapFree [{} MiB, {} %] <= swa' \
|
||||||
'p_min_sigterm [{} MiB, {} %]'.format(
|
'p_min_sigterm [{} MiB, {} %]'.format(
|
||||||
@ -2494,6 +2542,7 @@ while True:
|
|||||||
swap_sigterm_pc)
|
swap_sigterm_pc)
|
||||||
|
|
||||||
implement_corrective_action(SIGTERM)
|
implement_corrective_action(SIGTERM)
|
||||||
|
|
||||||
psi_t0 = time()
|
psi_t0 = time()
|
||||||
continue
|
continue
|
||||||
|
|
||||||
@ -2502,7 +2551,7 @@ while True:
|
|||||||
time0 = time()
|
time0 = time()
|
||||||
|
|
||||||
mem_info = 'Soft threshold exceeded\nMemory status that requ' \
|
mem_info = 'Soft threshold exceeded\nMemory status that requ' \
|
||||||
'ires corrective actions:' \
|
'ires corrective actions:' \
|
||||||
'\n MemUsedZram [{} MiB, {} %] >= ' \
|
'\n MemUsedZram [{} MiB, {} %] >= ' \
|
||||||
'zram_max_sigterm [{} M, {} %]'.format(
|
'zram_max_sigterm [{} M, {} %]'.format(
|
||||||
kib_to_mib(mem_used_zram),
|
kib_to_mib(mem_used_zram),
|
||||||
@ -2525,9 +2574,7 @@ while True:
|
|||||||
warn_time_now = time()
|
warn_time_now = time()
|
||||||
warn_timer += warn_time_delta
|
warn_timer += warn_time_delta
|
||||||
if warn_timer > min_time_between_warnings:
|
if warn_timer > min_time_between_warnings:
|
||||||
t0 = time()
|
|
||||||
send_notify_warn()
|
send_notify_warn()
|
||||||
log(str(time() - t0) + ' | send notify warning time')
|
|
||||||
warn_timer = 0
|
warn_timer = 0
|
||||||
|
|
||||||
# SLEEP BETWEEN MEM CHECKS
|
# SLEEP BETWEEN MEM CHECKS
|
||||||
|
23
nohang.conf
23
nohang.conf
@ -32,6 +32,8 @@
|
|||||||
Just read the description of the parameters and edit the values.
|
Just read the description of the parameters and edit the values.
|
||||||
Please restart the program after editing the config.
|
Please restart the program after editing the config.
|
||||||
|
|
||||||
|
Bool values are case sensitive.
|
||||||
|
|
||||||
#####################################################################
|
#####################################################################
|
||||||
|
|
||||||
1. Thresholds below which a signal should be sent to the victim
|
1. Thresholds below which a signal should be sent to the victim
|
||||||
@ -103,7 +105,7 @@ psi_metrics = some_avg10
|
|||||||
sigterm_psi_threshold = 80
|
sigterm_psi_threshold = 80
|
||||||
sigkill_psi_threshold = 90
|
sigkill_psi_threshold = 90
|
||||||
|
|
||||||
psi_post_action_delay = 40
|
psi_post_action_delay = 60
|
||||||
|
|
||||||
#####################################################################
|
#####################################################################
|
||||||
|
|
||||||
@ -148,7 +150,6 @@ min_badness = 20
|
|||||||
min_delay_after_sigterm = 0.2
|
min_delay_after_sigterm = 0.2
|
||||||
min_delay_after_sigkill = 1
|
min_delay_after_sigkill = 1
|
||||||
|
|
||||||
Enabling the option requires root privileges.
|
|
||||||
Valid values are True and False.
|
Valid values are True and False.
|
||||||
Values are case sensitive.
|
Values are case sensitive.
|
||||||
|
|
||||||
@ -221,7 +222,7 @@ re_match_cgroup = False
|
|||||||
|
|
||||||
@CGROUP_RE -50 /// system.slice
|
@CGROUP_RE -50 /// system.slice
|
||||||
|
|
||||||
@CGROUP_RE -50 /// foo.service
|
@CGROUP_RE 50 /// foo.service
|
||||||
|
|
||||||
@CGROUP_RE -50 /// user.slice
|
@CGROUP_RE -50 /// user.slice
|
||||||
|
|
||||||
@ -300,7 +301,6 @@ gui_low_memory_warnings = True
|
|||||||
Execute the command instead of sending GUI notifications if the value is
|
Execute the command instead of sending GUI notifications if the value is
|
||||||
not empty line. For example:
|
not empty line. For example:
|
||||||
warning_exe = cat /proc/meminfo &
|
warning_exe = cat /proc/meminfo &
|
||||||
warning_exe = cat /proc/pressure/memory & cat /sys/fs/cgroup/unified/system.slice/memory.pressure & cat /sys/fs/cgroup/unified/user.slice/memory.pressure &
|
|
||||||
|
|
||||||
warning_exe =
|
warning_exe =
|
||||||
|
|
||||||
@ -332,7 +332,7 @@ print_config = False
|
|||||||
Print memory check results.
|
Print memory check results.
|
||||||
Valid values are True and False.
|
Valid values are True and False.
|
||||||
|
|
||||||
print_mem_check_results = True
|
print_mem_check_results = False
|
||||||
|
|
||||||
min_mem_report_interval = 60
|
min_mem_report_interval = 60
|
||||||
|
|
||||||
@ -343,11 +343,20 @@ print_sleep_periods = False
|
|||||||
|
|
||||||
print_total_stat = True
|
print_total_stat = True
|
||||||
|
|
||||||
print_proc_table = True
|
print_proc_table = False
|
||||||
|
|
||||||
|
Valid values:
|
||||||
|
None
|
||||||
|
cgroup
|
||||||
|
cmdline
|
||||||
|
realpath
|
||||||
|
All
|
||||||
|
|
||||||
|
extra_table_info = cgroup
|
||||||
|
|
||||||
print_victim_info = True
|
print_victim_info = True
|
||||||
|
|
||||||
max_ancestry_depth = 5
|
max_ancestry_depth = 1
|
||||||
|
|
||||||
separate_log = False
|
separate_log = False
|
||||||
|
|
||||||
|
@ -7,9 +7,8 @@ Documentation=man:nohang(1) https://github.com/hakavlad/nohang
|
|||||||
ExecStart=/usr/sbin/nohang --config /etc/nohang/nohang.conf
|
ExecStart=/usr/sbin/nohang --config /etc/nohang/nohang.conf
|
||||||
Slice=nohang.slice
|
Slice=nohang.slice
|
||||||
Restart=always
|
Restart=always
|
||||||
MemoryMax=60M
|
MemoryMax=50M
|
||||||
TasksMax=20
|
TasksMax=50
|
||||||
OOMScoreAdjust=-5
|
|
||||||
Nice=-20
|
Nice=-20
|
||||||
IOSchedulingClass=1
|
IOSchedulingClass=1
|
||||||
IOSchedulingPriority=0
|
IOSchedulingPriority=0
|
||||||
|
@ -1,34 +1,38 @@
|
|||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
from os import listdir, path, remove
|
|
||||||
from subprocess import Popen, TimeoutExpired
|
|
||||||
from sys import argv
|
|
||||||
|
|
||||||
|
|
||||||
# print('Starting nohang_notify_helper')
|
# print('Starting nohang_notify_helper')
|
||||||
|
|
||||||
# print(argv)
|
|
||||||
|
|
||||||
# print(len(argv))
|
def write(path, string):
|
||||||
|
"""
|
||||||
|
"""
|
||||||
|
with open(path, 'w') as f:
|
||||||
|
f.write(string)
|
||||||
|
|
||||||
split_by = '#' * 16
|
|
||||||
|
|
||||||
uid = argv[2]
|
try:
|
||||||
|
write('/proc/self/oom_score_adj', '0')
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
t000 = argv[4]
|
|
||||||
|
|
||||||
wait_time = 10
|
try:
|
||||||
|
from os import listdir, path, remove
|
||||||
display_env = 'DISPLAY='
|
from subprocess import Popen, TimeoutExpired
|
||||||
dbus_env = 'DBUS_SESSION_BUS_ADDRESS='
|
from sys import argv
|
||||||
user_env = 'USER='
|
except OSError:
|
||||||
|
exit(1)
|
||||||
|
|
||||||
|
|
||||||
def rline1(path):
|
def rline1(path):
|
||||||
"""read 1st line from path."""
|
"""read 1st line from path."""
|
||||||
with open(path) as f:
|
try:
|
||||||
for line in f:
|
with open(path) as f:
|
||||||
return line
|
for line in f:
|
||||||
|
return line
|
||||||
|
except OSError:
|
||||||
|
exit(1)
|
||||||
|
|
||||||
|
|
||||||
def rfile(path):
|
def rfile(path):
|
||||||
@ -37,6 +41,39 @@ def rfile(path):
|
|||||||
return f.read()
|
return f.read()
|
||||||
|
|
||||||
|
|
||||||
|
with open('/proc/meminfo') as f:
|
||||||
|
for line in f:
|
||||||
|
if line.startswith('SwapTotal'):
|
||||||
|
swap_total = int(line.split(':')[1][:-4])
|
||||||
|
if swap_total > 0:
|
||||||
|
wait_time = 5
|
||||||
|
else:
|
||||||
|
wait_time = 0.5
|
||||||
|
|
||||||
|
|
||||||
|
print('nohang_notify_helper: wait_time:', wait_time)
|
||||||
|
|
||||||
|
|
||||||
|
# print(argv)
|
||||||
|
|
||||||
|
|
||||||
|
# print(len(argv))
|
||||||
|
|
||||||
|
|
||||||
|
split_by = '#' * 16
|
||||||
|
|
||||||
|
|
||||||
|
uid = argv[2]
|
||||||
|
|
||||||
|
|
||||||
|
t000 = argv[4]
|
||||||
|
|
||||||
|
|
||||||
|
display_env = 'DISPLAY='
|
||||||
|
dbus_env = 'DBUS_SESSION_BUS_ADDRESS='
|
||||||
|
user_env = 'USER='
|
||||||
|
|
||||||
|
|
||||||
path_to_cache = '/dev/shm/nohang_notify_cache_uid{}_time{}'.format(
|
path_to_cache = '/dev/shm/nohang_notify_cache_uid{}_time{}'.format(
|
||||||
uid, t000
|
uid, t000
|
||||||
)
|
)
|
||||||
@ -158,9 +195,13 @@ if list_len > 0:
|
|||||||
proc.wait(timeout=wait_time)
|
proc.wait(timeout=wait_time)
|
||||||
except TimeoutExpired:
|
except TimeoutExpired:
|
||||||
proc.kill()
|
proc.kill()
|
||||||
print('TimeoutExpired: notify user:' + username)
|
print('TimeoutExpired: notify user: ' + username)
|
||||||
except BlockingIOError:
|
except BlockingIOError:
|
||||||
print('nohang_notify_helper: BlockingIOError')
|
print('nohang_notify_helper: BlockingIOError')
|
||||||
|
except OSError:
|
||||||
|
print('nohang_notify_helper: OSError')
|
||||||
|
except Exception:
|
||||||
|
print('nohang_notify_helper: CANNOT SPAWN NOTIFY-SEND PROCESS')
|
||||||
else:
|
else:
|
||||||
print(
|
print(
|
||||||
'Not send GUI notification: [',
|
'Not send GUI notification: [',
|
||||||
|
Loading…
Reference in New Issue
Block a user