handle errors; add extra_table_info option; fix output

This commit is contained in:
Alexey Avramov 2019-04-02 17:45:56 +09:00
parent 846c6640b9
commit 146a6324cf
4 changed files with 243 additions and 147 deletions

235
nohang
View File

@ -52,8 +52,6 @@ else:
victim_dict = dict() victim_dict = dict()
# extra_process_table_info = None/cmdline/realpath # (todo)
# will store corrective actions stat # will store corrective actions stat
stat_dict = dict() stat_dict = dict()
@ -62,7 +60,7 @@ separate_log = False # will be overwritten after parse config
with open('/proc/self/cgroup') as f: with open('/proc/self/cgroup') as f:
# Find cgroup-line position in /proc/*/cgroup file.""" # Find cgroup-line position in /proc/*/cgroup file.
for cgroup_index, line in enumerate(f): for cgroup_index, line in enumerate(f):
if ':name=' in line: if ':name=' in line:
break break
@ -73,12 +71,35 @@ with open('/proc/self/cgroup') as f:
# define functions # define functions
def write(path, string):
    """Write a string to a file, replacing any previous content.

    path: str path of the file, opened in 'w' mode
    string: str text to write
    """
    with open(path, 'w') as target:
        target.write(string)
def write_self_oom_score_adj(new_value):
    """Write a new oom_score_adj for this process; no-op unless root is set.

    new_value: str value written to /proc/self/oom_score_adj
    """
    if not root:
        return
    write('/proc/self/oom_score_adj', new_value)
# oom_score_adj values (kept as strings) for this process: exe() switches to
# the max value while an external command runs and back to the min value
# afterwards.
self_oom_score_adj_min = '-900'
self_oom_score_adj_max = '-9'
# Apply the min value right away (written only when `root` is truthy).
write_self_oom_score_adj(self_oom_score_adj_min)
def exe(cmd):
    """Run a shell command with os.system() and log its status and duration.

    While the command runs, this process's oom_score_adj is switched to
    self_oom_score_adj_max (presumably so that the spawned command does not
    inherit the strongly protective value -- TODO confirm); it is switched
    back to self_oom_score_adj_min afterwards, even if os.system() raises.

    cmd: str command line passed to os.system()
    returns the value returned by os.system()
    """
    log('Execute the command: {}'.format(cmd))
    t0 = time()
    write_self_oom_score_adj(self_oom_score_adj_max)
    try:
        err = os.system(cmd)
    finally:
        # never leave the raised oom_score_adj value in place
        write_self_oom_score_adj(self_oom_score_adj_min)
    dt = time() - t0
    log('Exit status: {}; exe duration: {} sec'.format(err, round(dt, 3)))
    return err
@ -97,10 +118,13 @@ def valid_re(reg_exp):
def pid_to_cgroup(pid):
    """Get the cgroup of a process from /proc/<pid>/cgroup.

    pid: str pid of required process
    returns string: '/' plus the text after the first '/' of the line at
    position cgroup_index (its last character dropped), or '' if the file
    does not exist or has no line at that position
    """
    try:
        with open('/proc/' + pid + '/cgroup') as f:
            for n, line in enumerate(f):
                if n == cgroup_index:
                    return '/' + line.partition('/')[2][:-1]
    except FileNotFoundError:
        pass
    # The file is missing or shorter than cgroup_index: always return a
    # string instead of silently falling through to None.
    return ''
def func_print_proc_table(): def func_print_proc_table():
@ -114,11 +138,23 @@ def func_print_proc_table():
def log(*msg):
    """Print a message to stdout and, if separate_log is set, pass it to info().

    OSError raised by either output is swallowed after a short sleep, so a
    failing output does not stop the caller.

    msg: values to output; printed the way print(*msg) prints them
    """
    try:
        print(*msg)
    except OSError:
        sleep(0.01)

    if separate_log:
        # Pass one pre-joined string instead of info(*msg): with several
        # arguments the call produced "TypeError: not all arguments
        # converted during string formatting" (presumably info is
        # logging.info, which treats the first argument as a format
        # string -- TODO confirm).
        try:
            info(' '.join(str(part) for part in msg))
        except OSError:
            sleep(0.01)
def print_version(): def print_version():
@ -207,7 +243,7 @@ def uptime():
def pid_to_starttime(pid): def pid_to_starttime(pid):
""" """ handle FNF error!
""" """
try: try:
starttime = rline1('/proc/' + pid + '/stat').rpartition(')')[ starttime = rline1('/proc/' + pid + '/stat').rpartition(')')[
@ -224,8 +260,11 @@ def pid_to_starttime(pid):
def get_victim_id(pid):
    """victim_id is starttime + pid

    pid: str pid of required process
    returns string: the item at index 20 of the space-split text that
    follows the last ')' in the first line of /proc/<pid>/stat (the
    starttime field, see proc(5)) followed by pid, or '' if
    FileNotFoundError is raised
    """
    try:
        stat_tail = rline1('/proc/' + pid + '/stat').rpartition(')')[2]
        return stat_tail.split(' ')[20] + pid
    except FileNotFoundError:
        return ''
def errprint(*text): def errprint(*text):
@ -259,7 +298,7 @@ def mlockall():
def pid_to_state(pid):
    """Get the state character of a process from /proc/<pid>/stat.

    No exception is handled here. Original note: FileNotFoundError should
    be handled; it is already handled in find_victim_info().

    pid: str pid of required process
    returns the second character after the last ')' in the line returned
    by rline1() (the state field, see proc(5))
    """
    return rline1('/proc/' + pid + '/stat').rpartition(')')[2][1]
@ -501,31 +540,6 @@ def zram_stat(zram_id):
return disksize, mem_used_total # BYTES, str return disksize, mem_used_total # BYTES, str
'''
def pid_to_name(pid):
"""
Get process name by pid.
pid: str pid of required process
returns string process_name
"""
try:
with open('/proc/' + pid + '/status') as f:
f.seek(6)
for line in f:
return line[:-1]
except FileNotFoundError:
return ''
except ProcessLookupError:
return ''
except UnicodeDecodeError:
with open('/proc/' + pid + '/status', 'rb') as f:
f.seek(6)
return f.read(15).decode(
'utf-8', 'ignore').partition('\n')[0]
'''
def pid_to_name(pid): def pid_to_name(pid):
""" """
""" """
@ -538,9 +552,6 @@ def pid_to_name(pid):
return '' return ''
def pid_to_ppid(pid): def pid_to_ppid(pid):
""" """
""" """
@ -591,8 +602,11 @@ def pid_to_cmdline(pid):
pid: str pid of required process pid: str pid of required process
returns string cmdline returns string cmdline
""" """
try:
with open('/proc/' + pid + '/cmdline') as f: with open('/proc/' + pid + '/cmdline') as f:
return f.read().replace('\x00', ' ').rstrip() return f.read().replace('\x00', ' ').rstrip()
except FileNotFoundError:
return ''
def pid_to_realpath(pid): def pid_to_realpath(pid):
@ -613,6 +627,8 @@ def pid_to_uid(pid):
with open('/proc/' + pid + '/status', 'rb') as f: with open('/proc/' + pid + '/status', 'rb') as f:
f_list = f.read().decode('utf-8', 'ignore').split('\n') f_list = f.read().decode('utf-8', 'ignore').split('\n')
return f_list[uid_index].split('\t')[2] return f_list[uid_index].split('\t')[2]
except FileNotFoundError:
return ''
def send_notify_warn(): def send_notify_warn():
@ -666,13 +682,12 @@ def send_notify_warn():
# send notification to user that runs this nohang # send notification to user that runs this nohang
notify_send_wait(title, body) notify_send_wait(title, body)
''' '''
# os.system('echo --- \ - $(sleep 5) - ') # os.system('echo --- \ - $(sleep 5) - ')
t0 = time() print('Warning threshold exceeded')
if check_warning_exe: if check_warning_exe:
print('Warning threshold exceeded')
exe(warning_exe) exe(warning_exe)
else: else:
@ -684,9 +699,6 @@ def send_notify_warn():
) )
send_notification(title, body) send_notification(title, body)
t1 = time()
print('Warning duration:', t1 - t0)
def send_notify(signal, name, pid): def send_notify(signal, name, pid):
""" """
@ -728,11 +740,9 @@ def send_notify_etc(pid, name, command):
pid: str process pid pid: str process pid
""" """
title = 'Freeze prevention' title = 'Freeze prevention'
body = '<b>Victim is</b> [{}] <b>{}</b>\nExecute the command:\n<b>{}</b>'.format( body = '<b>Victim is</b> [{}] <b>{}</b>\nExecute the co' \
pid, 'mmand:\n<b>{}</b>'.format(
name.replace('&', '*'), pid, name.replace('&', '*'), command.replace('&', '*'))
command.replace('&', '*')
)
send_notification(title, body) send_notification(title, body)
@ -759,9 +769,14 @@ def send_notification(title, body):
text = '{}{}{}'.format(title, split_by, body) text = '{}{}{}'.format(title, split_by, body)
try:
with open(path_to_cache, 'w') as f: with open(path_to_cache, 'w') as f:
f.write(text) f.write(text)
os.chmod(path_to_cache, 0o600) os.chmod(path_to_cache, 0o600)
except OSError:
log('OSError while send notification '
'(No space left on device: /dev/shm)')
return None
cmd = '{} --uid {} --time {} &'.format(notify_helper_path, self_uid, t000) cmd = '{} --uid {} --time {} &'.format(notify_helper_path, self_uid, t000)
@ -882,14 +897,34 @@ def find_victim(_print_proc_table):
non_decimal_list = get_non_decimal_pids() non_decimal_list = get_non_decimal_pids()
for i in non_decimal_list: for i in non_decimal_list:
if i in pid_list: # ????????????????????????????????????????????
pid_list.remove(i) pid_list.remove(i)
pid_badness_list = [] pid_badness_list = []
if _print_proc_table: if _print_proc_table:
if extra_table_info == 'None':
extra_table_title = ''
elif extra_table_info == 'cgroup':
extra_table_title = 'CGroup'
elif extra_table_info == 'cmdline':
extra_table_title = 'cmdline'
elif extra_table_info == 'realpath':
extra_table_title = 'realpath'
elif extra_table_info == 'All':
extra_table_title = '[CGroup] [CmdLine] [RealPath]'
else:
extra_table_title = ''
log('==============================================================' log('=============================================================='
'=================') '=================')
log(' PID badness Name eUID CGroup') log(' PID badness Name eUID {}'.format(
extra_table_title))
log('------- ------- --------------- ---------- -----------' log('------- ------- --------------- ---------- -----------'
'----------------------') '----------------------')
@ -900,17 +935,38 @@ def find_victim(_print_proc_table):
continue continue
if _print_proc_table: if _print_proc_table:
if extra_table_info == 'None':
extra_table_line = ''
elif extra_table_info == 'cgroup':
extra_table_line = pid_to_cgroup(pid)
elif extra_table_info == 'cmdline':
extra_table_line = pid_to_cmdline(pid)
elif extra_table_info == 'realpath':
extra_table_line = pid_to_realpath(pid)
elif extra_table_info == 'All':
extra_table_line = '[CG: {}] [CL: {}] [RP: {}]'.format(
pid_to_cgroup(pid),
pid_to_cmdline(pid),
pid_to_realpath(pid)
)
else:
extra_table_line = ''
log('{} {} {} {} {}'.format( log('{} {} {} {} {}'.format(
pid.rjust(7), pid.rjust(7),
str(badness).rjust(7), str(badness).rjust(7),
pid_to_name(pid).ljust(15), pid_to_name(pid).ljust(15),
# currently only the UID is looked up, but more is needed, including memory state.
# Write a safe function that finds, for each process:
pid_to_uid(pid).rjust(10), pid_to_uid(pid).rjust(10),
# pid_to_cmdline(pid) # Name, PPID, State, VmSize, VmRSS, VmSwap, Threads - based on
pid_to_realpath(pid) # find victim info.
# pid_to_cgroup(pid) extra_table_line)
# pid_to_name(pid)
# ''
)
) )
pid_badness_list.append((pid, badness)) pid_badness_list.append((pid, badness))
@ -946,10 +1002,15 @@ def find_victim(_print_proc_table):
return pid, victim_badness, victim_name return pid, victim_badness, victim_name
def find_status_for_proc_table(pid):
    """Placeholder: not implemented yet; `pid` is ignored.

    returns None
    """
    return None
def find_victim_info(pid, victim_badness, name): def find_victim_info(pid, victim_badness, name):
""" """
""" """
status0 = time() status0 = time()
try: try:
@ -1085,6 +1146,8 @@ def find_victim_info(pid, victim_badness, name):
try: try:
realpath = os.path.realpath('/proc/' + pid + '/exe') realpath = os.path.realpath('/proc/' + pid + '/exe')
victim_lifetime = format_time(uptime() - pid_to_starttime(pid))
victim_cgroup = pid_to_cgroup(pid)
except FileNotFoundError: except FileNotFoundError:
print('The victim died in the search process: FileNotFoundError') print('The victim died in the search process: FileNotFoundError')
update_stat_dict_and_print( update_stat_dict_and_print(
@ -1106,10 +1169,6 @@ def find_victim_info(pid, victim_badness, name):
else: else:
detailed_rss_info = '' detailed_rss_info = ''
victim_lifetime = format_time(uptime() - pid_to_starttime(pid))
victim_cgroup = pid_to_cgroup(pid)
victim_info = 'Victim information (found in {} ms):' \ victim_info = 'Victim information (found in {} ms):' \
'\n Name: {}' \ '\n Name: {}' \
'\n State: {}' \ '\n State: {}' \
@ -1147,7 +1206,6 @@ def find_victim_info(pid, victim_badness, name):
return victim_info return victim_info
# for deduplication of notifications # for deduplication of notifications
dick = dict() dick = dict()
dick['v'] = [1, 2, 3, time()] dick['v'] = [1, 2, 3, time()]
@ -1196,28 +1254,13 @@ def implement_corrective_action(signal):
'ion:\n MemAvailable' 'ion:\n MemAvailable'
': {} MiB, SwapFree: {} MiB'.format(ma, sf)) ': {} MiB, SwapFree: {} MiB'.format(ma, sf))
cmd = etc_dict[name].replace('$PID', pid).replace( cmd = etc_dict[name].replace('$PID', pid).replace(
'$NAME', pid_to_name(pid)) '$NAME', pid_to_name(pid))
exit_status = exe(cmd) exit_status = exe(cmd)
exit_status = str(exit_status) exit_status = str(exit_status)
response_time = time() - time0 response_time = time() - time0
etc_info = 'Implement a corrective act' \ etc_info = 'Implement a corrective act' \
@ -1274,7 +1317,6 @@ def implement_corrective_action(signal):
exe(cmd) exe(cmd)
if gui_notifications: if gui_notifications:
# min delay after same notification # min delay after same notification
@ -1315,7 +1357,10 @@ def implement_corrective_action(signal):
key = 'ProcessLookupError (the victim died in the se' \ key = 'ProcessLookupError (the victim died in the se' \
'arch process): ' 'arch process): '
try:
log(preventing_oom_message) log(preventing_oom_message)
except UnboundLocalError:
preventing_oom_message = key
update_stat_dict_and_print(key) update_stat_dict_and_print(key)
@ -1388,7 +1433,10 @@ def sleep_after_check_mem():
) )
) )
try:
stdout.flush() stdout.flush()
except OSError: # OSError: [Errno 105] No buffer space available
pass
try: try:
sleep(t) sleep(t)
@ -1568,18 +1616,12 @@ cgroup_re_list = []
realpath_re_list = [] realpath_re_list = []
# dictionary with names and commands for the parameter # dictionary with names and commands for the parameter
# execute_the_command # execute_the_command
# a list is needed here too, not a dictionary # a list is needed here too, not a dictionary
etc_dict = dict() etc_dict = dict()
try: try:
with open(config) as f: with open(config) as f:
@ -1595,7 +1637,6 @@ try:
if not a and not b and not c and not d and not etc: if not a and not b and not c and not d and not etc:
a = line.partition('=') a = line.partition('=')
key = a[0].strip() key = a[0].strip()
value = a[2].strip() value = a[2].strip()
@ -1656,11 +1697,6 @@ try:
realpath_re_list.append((badness_adj, reg_exp)) realpath_re_list.append((badness_adj, reg_exp))
except PermissionError: except PermissionError:
errprint('PermissionError', conf_err_mess) errprint('PermissionError', conf_err_mess)
exit(1) exit(1)
@ -1689,8 +1725,8 @@ except FileNotFoundError:
# check for all necessary parameters # check for all necessary parameters
# validation of all parameters # validation of all parameters
psi_debug = conf_parse_bool('psi_debug') psi_debug = conf_parse_bool('psi_debug')
print_total_stat = conf_parse_bool('print_total_stat')
print_proc_table = conf_parse_bool('print_proc_table')
forbid_negative_badness = conf_parse_bool('forbid_negative_badness') forbid_negative_badness = conf_parse_bool('forbid_negative_badness')
print_victim_info = conf_parse_bool('print_victim_info') print_victim_info = conf_parse_bool('print_victim_info')
print_config = conf_parse_bool('print_config') print_config = conf_parse_bool('print_config')
@ -1966,8 +2002,17 @@ else:
exit(1) exit(1)
print_total_stat = conf_parse_bool('print_total_stat') if 'extra_table_info' in config_dict:
print_proc_table = conf_parse_bool('print_proc_table') extra_table_info = config_dict['extra_table_info']
if (extra_table_info != 'None' and extra_table_info != 'cgroup' and
extra_table_info != 'cmdline' and extra_table_info != 'realpath' and
extra_table_info != 'All'):
errprint('Invalid config: invalid extra_table_info value\nExit')
exit(1)
else:
errprint('Invalid config: extra_table_info is not in config\nExit')
exit(1)
separate_log = conf_parse_bool('separate_log') separate_log = conf_parse_bool('separate_log')
@ -2308,7 +2353,8 @@ while True:
if sigkill_psi_exceeded and psi_post_action_delay_exceeded: if sigkill_psi_exceeded and psi_post_action_delay_exceeded:
time0 = time() time0 = time()
mem_info = 'PSI avg value ({}) > sigkill_psi_threshold ({})'.format( mem_info = 'PSI avg value ({}) > sigkill_psi_thresh' \
'old ({})'.format(
psi_avg_value, sigkill_psi_threshold) psi_avg_value, sigkill_psi_threshold)
implement_corrective_action(SIGKILL) implement_corrective_action(SIGKILL)
@ -2318,8 +2364,8 @@ while True:
if sigterm_psi_exceeded and psi_post_action_delay_exceeded: if sigterm_psi_exceeded and psi_post_action_delay_exceeded:
time0 = time() time0 = time()
mem_info = 'PSI avg value ({}) > sigterm_psi_threshold ({})'.format( mem_info = 'PSI avg value ({}) > sigterm_psi_thre' \
psi_avg_value, sigterm_psi_threshold) 'shold ({})'.format(psi_avg_value, sigterm_psi_threshold)
implement_corrective_action(SIGTERM) implement_corrective_action(SIGTERM)
@ -2451,6 +2497,7 @@ while True:
swap_sigkill_pc) swap_sigkill_pc)
implement_corrective_action(SIGKILL) implement_corrective_action(SIGKILL)
psi_t0 = time() psi_t0 = time()
continue continue
@ -2468,6 +2515,7 @@ while True:
percent(zram_max_sigkill_kb / mem_total)) percent(zram_max_sigkill_kb / mem_total))
implement_corrective_action(SIGKILL) implement_corrective_action(SIGKILL)
psi_t0 = time() psi_t0 = time()
continue continue
@ -2494,6 +2542,7 @@ while True:
swap_sigterm_pc) swap_sigterm_pc)
implement_corrective_action(SIGTERM) implement_corrective_action(SIGTERM)
psi_t0 = time() psi_t0 = time()
continue continue
@ -2525,9 +2574,7 @@ while True:
warn_time_now = time() warn_time_now = time()
warn_timer += warn_time_delta warn_timer += warn_time_delta
if warn_timer > min_time_between_warnings: if warn_timer > min_time_between_warnings:
t0 = time()
send_notify_warn() send_notify_warn()
log(str(time() - t0) + ' | send notify warning time')
warn_timer = 0 warn_timer = 0
# SLEEP BETWEEN MEM CHECKS # SLEEP BETWEEN MEM CHECKS

View File

@ -32,6 +32,8 @@
Just read the description of the parameters and edit the values. Just read the description of the parameters and edit the values.
Please restart the program after editing the config. Please restart the program after editing the config.
Bool values are case sensitive.
##################################################################### #####################################################################
1. Thresholds below which a signal should be sent to the victim 1. Thresholds below which a signal should be sent to the victim
@ -103,7 +105,7 @@ psi_metrics = some_avg10
sigterm_psi_threshold = 80 sigterm_psi_threshold = 80
sigkill_psi_threshold = 90 sigkill_psi_threshold = 90
psi_post_action_delay = 40 psi_post_action_delay = 60
##################################################################### #####################################################################
@ -148,7 +150,6 @@ min_badness = 20
min_delay_after_sigterm = 0.2 min_delay_after_sigterm = 0.2
min_delay_after_sigkill = 1 min_delay_after_sigkill = 1
Enabling the option requires root privileges.
Valid values are True and False. Valid values are True and False.
Values are case sensitive. Values are case sensitive.
@ -221,7 +222,7 @@ re_match_cgroup = False
@CGROUP_RE -50 /// system.slice @CGROUP_RE -50 /// system.slice
@CGROUP_RE -50 /// foo.service @CGROUP_RE 50 /// foo.service
@CGROUP_RE -50 /// user.slice @CGROUP_RE -50 /// user.slice
@ -300,7 +301,6 @@ gui_low_memory_warnings = True
Execute the command instead of sending GUI notifications if the value is Execute the command instead of sending GUI notifications if the value is
not an empty line. For example: not an empty line. For example:
warning_exe = cat /proc/meminfo & warning_exe = cat /proc/meminfo &
warning_exe = cat /proc/pressure/memory & cat /sys/fs/cgroup/unified/system.slice/memory.pressure & cat /sys/fs/cgroup/unified/user.slice/memory.pressure &
warning_exe = warning_exe =
@ -332,7 +332,7 @@ print_config = False
Print memory check results. Print memory check results.
Valid values are True and False. Valid values are True and False.
print_mem_check_results = True print_mem_check_results = False
min_mem_report_interval = 60 min_mem_report_interval = 60
@ -343,11 +343,20 @@ print_sleep_periods = False
print_total_stat = True print_total_stat = True
print_proc_table = True print_proc_table = False
Valid values:
None
cgroup
cmdline
realpath
All
extra_table_info = cgroup
print_victim_info = True print_victim_info = True
max_ancestry_depth = 5 max_ancestry_depth = 1
separate_log = False separate_log = False

View File

@ -7,9 +7,8 @@ Documentation=man:nohang(1) https://github.com/hakavlad/nohang
ExecStart=/usr/sbin/nohang --config /etc/nohang/nohang.conf ExecStart=/usr/sbin/nohang --config /etc/nohang/nohang.conf
Slice=nohang.slice Slice=nohang.slice
Restart=always Restart=always
MemoryMax=60M MemoryMax=50M
TasksMax=20 TasksMax=50
OOMScoreAdjust=-5
Nice=-20 Nice=-20
IOSchedulingClass=1 IOSchedulingClass=1
IOSchedulingPriority=0 IOSchedulingPriority=0

View File

@ -1,34 +1,38 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
from os import listdir, path, remove
from subprocess import Popen, TimeoutExpired
from sys import argv
# print('Starting nohang_notify_helper') # print('Starting nohang_notify_helper')
# print(argv)
def write(path, string):
    """Write a string to a file, replacing any previous content.

    path: str path of the file, opened in 'w' mode
    string: str text to write
    """
    with open(path, 'w') as f:
        f.write(string)
split_by = '#' * 16
uid = argv[2] try:
write('/proc/self/oom_score_adj', '0')
except Exception:
pass
t000 = argv[4]
wait_time = 10 try:
from os import listdir, path, remove
display_env = 'DISPLAY=' from subprocess import Popen, TimeoutExpired
dbus_env = 'DBUS_SESSION_BUS_ADDRESS=' from sys import argv
user_env = 'USER=' except OSError:
exit(1)
def rline1(path):
    """read 1st line from path.

    path: str path of the file to read
    returns the first line (including its line ending), or None if the
    file is empty; terminates the program with exit status 1 if an
    OSError is raised while opening or reading the file
    """
    try:
        with open(path) as f:
            return next(f, None)
    except OSError:
        # Raise SystemExit directly: the exit() builtin is injected by the
        # site module and is not guaranteed to be available.
        raise SystemExit(1)
def rfile(path): def rfile(path):
@ -37,6 +41,39 @@ def rfile(path):
return f.read() return f.read()
# Choose wait_time from the SwapTotal value in /proc/meminfo:
# 5 when swap is present, 0.5 otherwise.
# Default to 0 so a missing SwapTotal line cannot leave the name undefined.
swap_total = 0
with open('/proc/meminfo') as f:
    for line in f:
        if line.startswith('SwapTotal'):
            # drop the trailing ' kB\n' before converting
            swap_total = int(line.split(':')[1][:-4])
            break

if swap_total > 0:
    wait_time = 5
else:
    wait_time = 0.5

print('nohang_notify_helper: wait_time:', wait_time)
# print(argv)
# print(len(argv))
# Separator string; the main program writes the cache text as
# title + split_by + body (presumably with this same separator -- its value
# on that side is not visible here).
split_by = '#' * 16
# The main program starts this helper as
# '<helper> --uid <uid> --time <t000> &', so the values are taken by
# position: argv[2] is the uid and argv[4] is the time stamp.
uid = argv[2]
t000 = argv[4]
# 'NAME=' prefixes of environment entries (their use is outside this view)
display_env = 'DISPLAY='
dbus_env = 'DBUS_SESSION_BUS_ADDRESS='
user_env = 'USER='
path_to_cache = '/dev/shm/nohang_notify_cache_uid{}_time{}'.format( path_to_cache = '/dev/shm/nohang_notify_cache_uid{}_time{}'.format(
uid, t000 uid, t000
) )
@ -161,6 +198,10 @@ if list_len > 0:
print('TimeoutExpired: notify user: ' + username) print('TimeoutExpired: notify user: ' + username)
except BlockingIOError: except BlockingIOError:
print('nohang_notify_helper: BlockingIOError') print('nohang_notify_helper: BlockingIOError')
except OSError:
print('nohang_notify_helper: OSError')
except Exception:
print('nohang_notify_helper: CANNOT SPAWN NOTIFY-SEND PROCESS')
else: else:
print( print(
'Not send GUI notification: [', 'Not send GUI notification: [',