Various improvements and fixes
Fix Something went wrong #56 Improve implement_corrective_action(): - Check VmRSS instead od /proc/[pid]/exe realpath to check process existene - Improve displaying total stat Improve check_permissions() Other improvements
This commit is contained in:
parent
0225fc3bff
commit
8e931f8f00
354
nohang/nohang
354
nohang/nohang
@ -17,8 +17,15 @@ def check_permissions():
|
||||
try:
|
||||
os.path.realpath('/proc/1/exe')
|
||||
except Exception as e:
|
||||
print('ERROR: missing CAP_SYS_PTRACE: {}'.format(e))
|
||||
exit(1)
|
||||
print('WARNING: missing CAP_SYS_PTRACE: {}'.format(e))
|
||||
try:
|
||||
os.kill(2, SIGTERM)
|
||||
except Exception as e:
|
||||
print('WARNING: cannot send a signal: {}'.format(e))
|
||||
try:
|
||||
rline1('/proc/1/oom_score')
|
||||
except Exception as e:
|
||||
print('WARNING: {}'.format(e))
|
||||
|
||||
|
||||
def memload():
|
||||
@ -55,8 +62,11 @@ def memload():
|
||||
' '.format(
|
||||
round(mem_available / 1024),
|
||||
round(swap_free / 1024)))
|
||||
|
||||
luid_init = rline1('/proc/1/loginuid')
|
||||
try:
|
||||
luid_init = rline1('/proc/1/loginuid')
|
||||
except Exception as e:
|
||||
print(e)
|
||||
exit(1)
|
||||
luid_self = rline1('/proc/self/loginuid')
|
||||
if luid_init == luid_self:
|
||||
print('The option is available only for logged in users.')
|
||||
@ -119,24 +129,28 @@ def exe(cmd):
|
||||
|
||||
cmd_num_dict['cmd_num'] += 1
|
||||
cmd_num = cmd_num_dict['cmd_num']
|
||||
th_name = threading.current_thread().getName()
|
||||
|
||||
log('Execute the command ({}) in {}: {}'.format(
|
||||
log('Executing the command ({}) in {}: {}'.format(
|
||||
cmd_num,
|
||||
threading.current_thread().getName(),
|
||||
th_name,
|
||||
cmd_list))
|
||||
t3 = monotonic()
|
||||
with Popen(cmd_list) as proc:
|
||||
try:
|
||||
proc.wait(timeout=exe_timeout)
|
||||
exit_status = proc.poll()
|
||||
t4 = monotonic()
|
||||
log('Command ({}) execution completed in {} sec; exit status'
|
||||
': {}'.format(cmd_num, round(t4 - t3, 3), exit_status))
|
||||
except TimeoutExpired:
|
||||
proc.kill()
|
||||
t4 = monotonic()
|
||||
log('TimeoutExpired for the command ({}) in {} sec'.format(
|
||||
cmd_num, round(t4 - t3, 3)))
|
||||
try:
|
||||
with Popen(cmd_list) as proc:
|
||||
try:
|
||||
proc.wait(timeout=exe_timeout)
|
||||
exit_status = proc.poll()
|
||||
t4 = monotonic()
|
||||
log('Command ({}) execution completed in {} sec; exit status'
|
||||
': {}'.format(cmd_num, round(t4 - t3, 3), exit_status))
|
||||
except TimeoutExpired:
|
||||
proc.kill()
|
||||
t4 = monotonic()
|
||||
log('TimeoutExpired for the command ({}) in {} sec'.format(
|
||||
cmd_num, round(t4 - t3, 3)))
|
||||
except Exception as e:
|
||||
log('Exception in {}: {}'.format(th_name, e))
|
||||
|
||||
|
||||
def start_thread(func, *a, **k):
|
||||
@ -146,7 +160,6 @@ def start_thread(func, *a, **k):
|
||||
th_name = th.getName()
|
||||
|
||||
if debug_threading:
|
||||
|
||||
log('Starting {} from {}'.format(
|
||||
th_name, threading.current_thread().getName()
|
||||
))
|
||||
@ -221,7 +234,7 @@ def root_notify_env():
|
||||
# iterates over processes, find processes with suitable env
|
||||
for pid in os.listdir('/proc'):
|
||||
|
||||
if os.path.exists('/proc/' + pid + '/exe') is True:
|
||||
if is_alive(pid):
|
||||
one_env = re_pid_environ(pid)
|
||||
unsorted_envs_list.append(one_env)
|
||||
|
||||
@ -707,7 +720,7 @@ def signal_handler(signum, frame):
|
||||
log('Got the {} signal '.format(
|
||||
sig_dict[signum]))
|
||||
|
||||
update_stat_dict_and_print(None)
|
||||
print_stat_dict()
|
||||
|
||||
m1 = monotonic()
|
||||
pt1 = process_time()
|
||||
@ -965,22 +978,26 @@ def pid_to_realpath(pid):
|
||||
"""
|
||||
"""
|
||||
try:
|
||||
return os.path.realpath('/proc/' + pid + '/exe')
|
||||
return os.path.realpath('/proc/{}/exe'.format(pid))
|
||||
except FileNotFoundError:
|
||||
return ''
|
||||
except ProcessLookupError:
|
||||
return ''
|
||||
except PermissionError:
|
||||
return ''
|
||||
|
||||
|
||||
def pid_to_cwd(pid):
|
||||
"""
|
||||
"""
|
||||
try:
|
||||
return os.path.realpath('/proc/' + pid + '/cwd')
|
||||
return os.path.realpath('/proc/{}/cwd'.format(pid))
|
||||
except FileNotFoundError:
|
||||
return ''
|
||||
except ProcessLookupError:
|
||||
return ''
|
||||
except PermissionError:
|
||||
return ''
|
||||
|
||||
|
||||
def pid_to_uid(pid):
|
||||
@ -1275,38 +1292,30 @@ def mlockall():
|
||||
# log('All memory locked with MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT')
|
||||
|
||||
|
||||
def update_stat_dict_and_print(key):
|
||||
def update_stat_dict(key):
|
||||
"""
|
||||
"""
|
||||
if key is not None:
|
||||
|
||||
if key not in stat_dict:
|
||||
|
||||
stat_dict.update({key: 1})
|
||||
|
||||
else:
|
||||
|
||||
new_value = stat_dict[key] + 1
|
||||
stat_dict.update({key: new_value})
|
||||
|
||||
|
||||
def print_stat_dict():
|
||||
"""
|
||||
"""
|
||||
if print_statistics:
|
||||
|
||||
lsd = len(stat_dict)
|
||||
|
||||
if lsd == 0:
|
||||
|
||||
log('No corrective actions applied in the last {}'.format(
|
||||
format_time(monotonic() - start_time)))
|
||||
|
||||
else:
|
||||
|
||||
stats_msg = 'What happened in the last {}:'.format(
|
||||
format_time(monotonic() - start_time))
|
||||
|
||||
for i in stat_dict:
|
||||
|
||||
stats_msg += '\n {}: {}'.format(i, stat_dict[i])
|
||||
|
||||
log(stats_msg)
|
||||
|
||||
|
||||
@ -1339,7 +1348,7 @@ def find_psi_metrics_value(psi_path, psi_metrics):
|
||||
def check_mem_and_swap():
|
||||
"""
|
||||
"""
|
||||
with open('/proc/meminfo', 'rb', buffering=0) as f:
|
||||
with open('/proc/meminfo', 'rb') as f:
|
||||
m_list = f.read().decode().split(' kB\n')
|
||||
return (int(m_list[2].split(':')[1]),
|
||||
int(m_list[swap_total_index].split(':')[1]),
|
||||
@ -1512,25 +1521,40 @@ def zram_stat(zram_id):
|
||||
return disksize, mem_used_total # BYTES, str
|
||||
|
||||
|
||||
def get_pid_list():
|
||||
def is_alive(pid):
|
||||
"""
|
||||
"""
|
||||
try:
|
||||
with open('/proc/{}/statm'.format(pid), 'rb', buffering=0) as f:
|
||||
rss = f.read().decode().split(' ')[1]
|
||||
if rss != '0':
|
||||
return True
|
||||
except FileNotFoundError:
|
||||
return False
|
||||
except ProcessLookupError:
|
||||
return False
|
||||
except NotADirectoryError:
|
||||
return False
|
||||
|
||||
|
||||
def alive_pid_list():
|
||||
"""
|
||||
Find pid list expect kthreads and zombies
|
||||
"""
|
||||
pid_list = []
|
||||
for pid in os.listdir('/proc'):
|
||||
if os.path.exists('/proc/' + pid + '/exe'):
|
||||
pid_list.append(pid)
|
||||
return pid_list
|
||||
|
||||
|
||||
def get_non_decimal_pids():
|
||||
"""
|
||||
"""
|
||||
non_decimal_list = []
|
||||
for pid in pid_list:
|
||||
if pid[0].isdecimal() is False:
|
||||
non_decimal_list.append(pid)
|
||||
return non_decimal_list
|
||||
continue
|
||||
|
||||
if is_alive(pid):
|
||||
pid_list.append(pid)
|
||||
|
||||
pid_list.remove(self_pid)
|
||||
|
||||
if '1' in pid_list:
|
||||
pid_list.remove('1')
|
||||
|
||||
return pid_list
|
||||
|
||||
|
||||
def find_victim(_print_proc_table):
|
||||
@ -1541,18 +1565,7 @@ def find_victim(_print_proc_table):
|
||||
|
||||
ft1 = monotonic()
|
||||
|
||||
pid_list = get_pid_list()
|
||||
|
||||
pid_list.remove(self_pid)
|
||||
|
||||
if '1' in pid_list:
|
||||
pid_list.remove('1')
|
||||
|
||||
non_decimal_list = get_non_decimal_pids()
|
||||
|
||||
for i in non_decimal_list:
|
||||
if i in pid_list:
|
||||
pid_list.remove(i)
|
||||
pid_list = alive_pid_list()
|
||||
|
||||
pid_badness_list = []
|
||||
|
||||
@ -1584,6 +1597,7 @@ def find_victim(_print_proc_table):
|
||||
|
||||
hr = '#' * 107
|
||||
|
||||
log('Tasks state (memory values in mebibytes):')
|
||||
log(hr)
|
||||
log('# PID PPID badness oom_score oom_score_adj e'
|
||||
'UID S VmSize VmRSS VmSwap Name {}'.format(
|
||||
@ -1674,7 +1688,7 @@ def find_victim(_print_proc_table):
|
||||
if _print_proc_table:
|
||||
log(hr)
|
||||
|
||||
log('Found {} processes with existing /proc/[pid]/exe realpath'.format(
|
||||
log('Found {} tasks with non-zero VmRSS (except init and self)'.format(
|
||||
real_proc_num))
|
||||
|
||||
log(
|
||||
@ -1705,12 +1719,6 @@ def find_victim_info(pid, victim_badness, name):
|
||||
state = line.split('\t')[1].rstrip()
|
||||
continue
|
||||
|
||||
"""
|
||||
if n is ppid_index:
|
||||
# ppid = line.split('\t')[1]
|
||||
continue
|
||||
"""
|
||||
|
||||
if n is uid_index:
|
||||
uid = line.split('\t')[2]
|
||||
continue
|
||||
@ -1750,14 +1758,16 @@ def find_victim_info(pid, victim_badness, name):
|
||||
oom_score_adj = rline1('/proc/' + pid + '/oom_score_adj')
|
||||
|
||||
except FileNotFoundError:
|
||||
log('The victim died in the search process: FileNotFoundError')
|
||||
update_stat_dict_and_print(
|
||||
'The victim died in the search process: FileNotFoundError')
|
||||
x = 'The victim died in the search process: FileNotFoundError'
|
||||
log(x)
|
||||
update_stat_dict(x)
|
||||
print_stat_dict()
|
||||
return None
|
||||
except ProcessLookupError:
|
||||
log('The victim died in the search process: ProcessLookupError')
|
||||
update_stat_dict_and_print(
|
||||
'The victim died in the search process: ProcessLookupError')
|
||||
x = 'The victim died in the search process: ProcessLookupError'
|
||||
log(x)
|
||||
update_stat_dict(x)
|
||||
print_stat_dict()
|
||||
return None
|
||||
except UnicodeDecodeError:
|
||||
|
||||
@ -1803,43 +1813,46 @@ def find_victim_info(pid, victim_badness, name):
|
||||
oom_score_adj = rline1('/proc/' + pid + '/oom_score_adj')
|
||||
|
||||
except IndexError:
|
||||
log('The victim died in the search process: IndexError')
|
||||
update_stat_dict_and_print(
|
||||
'The victim died in the search process: IndexError')
|
||||
x = 'The victim died in the search process: IndexError'
|
||||
log(x)
|
||||
update_stat_dict(x)
|
||||
print_stat_dict()
|
||||
return None
|
||||
except ValueError:
|
||||
log('The victim died in the search process: ValueError')
|
||||
update_stat_dict_and_print(
|
||||
'The victim died in the search process: ValueError')
|
||||
x = 'The victim died in the search process: ValueError'
|
||||
log(x)
|
||||
update_stat_dict(x)
|
||||
print_stat_dict()
|
||||
return None
|
||||
|
||||
except FileNotFoundError:
|
||||
log('The victim died in the search process: FileNotFoundError')
|
||||
update_stat_dict_and_print(
|
||||
'The victim died in the search process: FileNotFoundError')
|
||||
x = 'The victim died in the search process: FileNotFoundError'
|
||||
log(x)
|
||||
update_stat_dict(x)
|
||||
print_stat_dict()
|
||||
return None
|
||||
except ProcessLookupError:
|
||||
log('The victim died in the search process: ProcessLookupError')
|
||||
update_stat_dict_and_print(
|
||||
'The victim died in the search process: ProcessLookupError')
|
||||
x = 'The victim died in the search process: ProcessLookupError'
|
||||
log(x)
|
||||
update_stat_dict(x)
|
||||
print_stat_dict()
|
||||
return None
|
||||
|
||||
len_vm = len(str(vm_size))
|
||||
|
||||
try:
|
||||
realpath = os.path.realpath('/proc/' + pid + '/exe')
|
||||
|
||||
cwd = os.path.realpath('/proc/' + pid + '/cwd')
|
||||
|
||||
realpath = pid_to_realpath(pid)
|
||||
cwd = pid_to_cwd(pid)
|
||||
nssid = pid_to_nssid(pid)
|
||||
|
||||
victim_lifetime = format_time(uptime() - pid_to_starttime(pid))
|
||||
victim_cgroup_v1 = pid_to_cgroup_v1(pid)
|
||||
victim_cgroup_v2 = pid_to_cgroup_v2(pid)
|
||||
|
||||
except FileNotFoundError:
|
||||
log('The victim died in the search process: FileNotFoundError')
|
||||
update_stat_dict_and_print(
|
||||
'The victim died in the search process: FileNotFoundError')
|
||||
x = 'The victim died in the search process: FileNotFoundError'
|
||||
log(x)
|
||||
update_stat_dict(x)
|
||||
print_stat_dict()
|
||||
return None
|
||||
|
||||
ancestry = pid_to_ancestry(pid, max_victim_ancestry_depth)
|
||||
@ -2156,8 +2169,7 @@ def is_victim_alive(victim_id):
|
||||
if victim_id != new_victim_id:
|
||||
return 0
|
||||
|
||||
exe_exists = os.path.exists('/proc/{}/exe'.format(pid))
|
||||
if exe_exists:
|
||||
if is_alive(pid):
|
||||
return 1
|
||||
|
||||
state = pid_to_state(pid)
|
||||
@ -2295,29 +2307,29 @@ def implement_corrective_action(
|
||||
|
||||
vwd = None # Victim Will Die
|
||||
|
||||
if threshold is SIGTERM:
|
||||
if victim_id in v_dict:
|
||||
dt = monotonic() - v_dict[victim_id]['time']
|
||||
if dt > max_soft_exit_time:
|
||||
log('max_soft_exit_time is exceeded: the '
|
||||
'victim will get SIGKILL')
|
||||
threshold = SIGKILL
|
||||
else:
|
||||
log('max_soft_exit_time is not exceeded ('
|
||||
'{} < {}) for the victim'.format(round(
|
||||
dt, 1), max_soft_exit_time))
|
||||
|
||||
if debug_sleep:
|
||||
log('Sleep {} sec (over_sleep)'.format(over_sleep))
|
||||
sleep(over_sleep)
|
||||
|
||||
log('<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<'
|
||||
'<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<')
|
||||
|
||||
return psi_t0
|
||||
|
||||
if victim_badness >= min_badness:
|
||||
|
||||
if threshold is SIGTERM:
|
||||
if victim_id in v_dict:
|
||||
dt = monotonic() - v_dict[victim_id]['time']
|
||||
if dt > max_soft_exit_time:
|
||||
log('max_soft_exit_time is exceeded: the '
|
||||
'victim will get SIGKILL')
|
||||
threshold = SIGKILL
|
||||
else:
|
||||
log('max_soft_exit_time is not exceeded ('
|
||||
'{} < {}) for the victim'.format(round(
|
||||
dt, 1), max_soft_exit_time))
|
||||
|
||||
if debug_sleep:
|
||||
log('Sleep {} sec (over_sleep)'.format(over_sleep))
|
||||
sleep(over_sleep)
|
||||
|
||||
log('<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<'
|
||||
'<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<')
|
||||
|
||||
return psi_t0
|
||||
|
||||
if print_victim_status:
|
||||
victim_info = find_victim_info(pid, victim_badness, name)
|
||||
log(victim_info)
|
||||
@ -2345,54 +2357,89 @@ def implement_corrective_action(
|
||||
soft_match = True
|
||||
break
|
||||
|
||||
start_action = monotonic()
|
||||
|
||||
if soft_match:
|
||||
|
||||
cmd = command.replace('$PID', pid).replace('$NAME', pid_to_name(
|
||||
pid)).replace('$SERVICE', service)
|
||||
start_thread(exe, cmd)
|
||||
|
||||
preventing_oom_message = 'Implementing a corrective action:\n ' \
|
||||
'Executing the command: {}'.format(cmd)
|
||||
|
||||
log(preventing_oom_message)
|
||||
|
||||
err = start_thread(exe, cmd)
|
||||
|
||||
if err == 1:
|
||||
key = 'Cannot execute the command in the new thread'
|
||||
update_stat_dict(key)
|
||||
log(key)
|
||||
|
||||
else:
|
||||
|
||||
update_stat_dict('Executing the command "{}"'.format(command))
|
||||
|
||||
response_time = monotonic() - time0
|
||||
|
||||
exit_status = None
|
||||
log('Total response time: {} ms'.format(round(
|
||||
response_time * 1000)))
|
||||
|
||||
preventing_oom_message = 'Implement a corrective act' \
|
||||
'ion:\n Run the command: {}' \
|
||||
'\n Exit status: {}; total response ' \
|
||||
'time: {} ms'.format(
|
||||
cmd,
|
||||
exit_status,
|
||||
round(response_time * 1000))
|
||||
print_stat_dict()
|
||||
|
||||
else:
|
||||
|
||||
preventing_oom_message = 'Implementing a corrective action:\n ' \
|
||||
'Sending {} to the victim'.format(
|
||||
sig_dict[threshold])
|
||||
|
||||
log(preventing_oom_message)
|
||||
|
||||
try:
|
||||
|
||||
os.kill(int(pid), threshold)
|
||||
|
||||
update_stat_dict(
|
||||
'[ OK ] Sending {} to {}'.format(sig_dict[threshold], name)
|
||||
)
|
||||
|
||||
response_time = monotonic() - time0
|
||||
|
||||
send_result = 'total response time: {} ms'.format(
|
||||
send_result = 'OK; total response time: {} ms'.format(
|
||||
round(response_time * 1000))
|
||||
|
||||
preventing_oom_message = 'Implementing a corrective action:' \
|
||||
'\n Sending {} to the victim; {}'.format(
|
||||
sig_dict[threshold], send_result)
|
||||
log(send_result)
|
||||
|
||||
if threshold is SIGKILL:
|
||||
vwd = True
|
||||
|
||||
except FileNotFoundError:
|
||||
vwd = True
|
||||
key = 'The victim died in the search process: ' \
|
||||
'FileNotFoundError'
|
||||
except ProcessLookupError:
|
||||
vwd = True
|
||||
key = 'The victim died in the search process: ' \
|
||||
'ProcessLookupError'
|
||||
print_stat_dict()
|
||||
|
||||
try:
|
||||
log(preventing_oom_message)
|
||||
except UnboundLocalError:
|
||||
pass
|
||||
except FileNotFoundError:
|
||||
|
||||
vwd = True
|
||||
key = 'Cannot send a signal: FileNotFoundError'
|
||||
update_stat_dict(key)
|
||||
print_stat_dict()
|
||||
log(key)
|
||||
|
||||
except ProcessLookupError:
|
||||
|
||||
vwd = True
|
||||
key = 'Cannot send a signal: ProcessLookupError'
|
||||
update_stat_dict(key)
|
||||
print_stat_dict()
|
||||
log(key)
|
||||
|
||||
except PermissionError:
|
||||
vwd = False
|
||||
key = 'Cannot send a signal: PermissionError'
|
||||
log(key)
|
||||
update_stat_dict(key)
|
||||
print_stat_dict()
|
||||
log('Sleep {}s'.format(post_soft_action_delay))
|
||||
sleep(post_soft_action_delay)
|
||||
# do not send signal twice!
|
||||
|
||||
if not vwd:
|
||||
if victim_id not in v_dict:
|
||||
@ -2404,6 +2451,8 @@ def implement_corrective_action(
|
||||
|
||||
last_action_dict['t'] = kill_timestamp = monotonic()
|
||||
|
||||
kill_timestamp = start_action
|
||||
|
||||
while True:
|
||||
sleep(0.01)
|
||||
d = monotonic() - kill_timestamp
|
||||
@ -2449,13 +2498,6 @@ def implement_corrective_action(
|
||||
': {} MiB, SwapFree: {} MiB'.format(
|
||||
round(ma_mib, 1), round(sf_mib, 1)))
|
||||
|
||||
if soft_match is False:
|
||||
key = 'Sending {} to {}'.format(sig_dict[threshold], name)
|
||||
update_stat_dict_and_print(key)
|
||||
else:
|
||||
key = "Run the command '{}'".format(command)
|
||||
update_stat_dict_and_print(key)
|
||||
|
||||
if threshold is SIGKILL and post_kill_exe != '':
|
||||
|
||||
cmd = post_kill_exe.replace('$PID', pid).replace(
|
||||
@ -2474,8 +2516,11 @@ def implement_corrective_action(
|
||||
else:
|
||||
|
||||
response_time = monotonic() - time0
|
||||
victim_badness_is_too_small = 'victim badness ({}) < min_b' \
|
||||
'adness ({}); nothing to do; response time: {} ms'.format(
|
||||
|
||||
victim_badness_is_too_small = 'victim (PID: {}, Name: {}) badness ' \
|
||||
'({}) < min_badness ({}); nothing to do; response tim' \
|
||||
'e: {} ms'.format(
|
||||
pid, name,
|
||||
victim_badness,
|
||||
min_badness,
|
||||
round(response_time * 1000))
|
||||
@ -2484,7 +2529,8 @@ def implement_corrective_action(
|
||||
|
||||
# update stat_dict
|
||||
key = 'victim badness < min_badness'
|
||||
update_stat_dict_and_print(key)
|
||||
update_stat_dict(key)
|
||||
print_stat_dict()
|
||||
|
||||
if vwd is None:
|
||||
|
||||
@ -2696,7 +2742,7 @@ separate_log = False # will be overwritten after parse config
|
||||
cgroup_v1_index, cgroup_v2_index = find_cgroup_indexes()
|
||||
|
||||
|
||||
pid_list = get_pid_list()
|
||||
pid_list = alive_pid_list()
|
||||
|
||||
|
||||
print_proc_table_flag = False
|
||||
|
Loading…
Reference in New Issue
Block a user