Various improvements and fixes

Fix "Something went wrong" (#56)

Improve implement_corrective_action():
- Check VmRSS instead of the /proc/[pid]/exe realpath to determine process existence
- Improve the display of total statistics

Improve check_permissions()

Other improvements
This commit is contained in:
Alexey Avramov 2020-03-15 21:10:01 +09:00
parent 0225fc3bff
commit 8e931f8f00

View File

@ -17,8 +17,15 @@ def check_permissions():
try:
os.path.realpath('/proc/1/exe')
except Exception as e:
print('ERROR: missing CAP_SYS_PTRACE: {}'.format(e))
exit(1)
print('WARNING: missing CAP_SYS_PTRACE: {}'.format(e))
try:
os.kill(2, SIGTERM)
except Exception as e:
print('WARNING: cannot send a signal: {}'.format(e))
try:
rline1('/proc/1/oom_score')
except Exception as e:
print('WARNING: {}'.format(e))
def memload():
@ -55,8 +62,11 @@ def memload():
' '.format(
round(mem_available / 1024),
round(swap_free / 1024)))
try:
luid_init = rline1('/proc/1/loginuid')
except Exception as e:
print(e)
exit(1)
luid_self = rline1('/proc/self/loginuid')
if luid_init == luid_self:
print('The option is available only for logged in users.')
@ -119,12 +129,14 @@ def exe(cmd):
cmd_num_dict['cmd_num'] += 1
cmd_num = cmd_num_dict['cmd_num']
th_name = threading.current_thread().getName()
log('Execute the command ({}) in {}: {}'.format(
log('Executing the command ({}) in {}: {}'.format(
cmd_num,
threading.current_thread().getName(),
th_name,
cmd_list))
t3 = monotonic()
try:
with Popen(cmd_list) as proc:
try:
proc.wait(timeout=exe_timeout)
@ -137,6 +149,8 @@ def exe(cmd):
t4 = monotonic()
log('TimeoutExpired for the command ({}) in {} sec'.format(
cmd_num, round(t4 - t3, 3)))
except Exception as e:
log('Exception in {}: {}'.format(th_name, e))
def start_thread(func, *a, **k):
@ -146,7 +160,6 @@ def start_thread(func, *a, **k):
th_name = th.getName()
if debug_threading:
log('Starting {} from {}'.format(
th_name, threading.current_thread().getName()
))
@ -221,7 +234,7 @@ def root_notify_env():
# iterates over processes, find processes with suitable env
for pid in os.listdir('/proc'):
if os.path.exists('/proc/' + pid + '/exe') is True:
if is_alive(pid):
one_env = re_pid_environ(pid)
unsorted_envs_list.append(one_env)
@ -707,7 +720,7 @@ def signal_handler(signum, frame):
log('Got the {} signal '.format(
sig_dict[signum]))
update_stat_dict_and_print(None)
print_stat_dict()
m1 = monotonic()
pt1 = process_time()
@ -965,22 +978,26 @@ def pid_to_realpath(pid):
"""
"""
try:
return os.path.realpath('/proc/' + pid + '/exe')
return os.path.realpath('/proc/{}/exe'.format(pid))
except FileNotFoundError:
return ''
except ProcessLookupError:
return ''
except PermissionError:
return ''
def pid_to_cwd(pid):
"""
"""
try:
return os.path.realpath('/proc/' + pid + '/cwd')
return os.path.realpath('/proc/{}/cwd'.format(pid))
except FileNotFoundError:
return ''
except ProcessLookupError:
return ''
except PermissionError:
return ''
def pid_to_uid(pid):
@ -1275,38 +1292,30 @@ def mlockall():
# log('All memory locked with MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT')
def update_stat_dict_and_print(key):
def update_stat_dict(key):
"""
"""
if key is not None:
if key not in stat_dict:
stat_dict.update({key: 1})
else:
new_value = stat_dict[key] + 1
stat_dict.update({key: new_value})
def print_stat_dict():
"""
"""
if print_statistics:
lsd = len(stat_dict)
if lsd == 0:
log('No corrective actions applied in the last {}'.format(
format_time(monotonic() - start_time)))
else:
stats_msg = 'What happened in the last {}:'.format(
format_time(monotonic() - start_time))
for i in stat_dict:
stats_msg += '\n {}: {}'.format(i, stat_dict[i])
log(stats_msg)
@ -1339,7 +1348,7 @@ def find_psi_metrics_value(psi_path, psi_metrics):
def check_mem_and_swap():
"""
"""
with open('/proc/meminfo', 'rb', buffering=0) as f:
with open('/proc/meminfo', 'rb') as f:
m_list = f.read().decode().split(' kB\n')
return (int(m_list[2].split(':')[1]),
int(m_list[swap_total_index].split(':')[1]),
@ -1512,25 +1521,40 @@ def zram_stat(zram_id):
return disksize, mem_used_total # BYTES, str
def get_pid_list():
def is_alive(pid):
"""
"""
try:
with open('/proc/{}/statm'.format(pid), 'rb', buffering=0) as f:
rss = f.read().decode().split(' ')[1]
if rss != '0':
return True
except FileNotFoundError:
return False
except ProcessLookupError:
return False
except NotADirectoryError:
return False
def alive_pid_list():
"""
Find pid list expect kthreads and zombies
"""
pid_list = []
for pid in os.listdir('/proc'):
if os.path.exists('/proc/' + pid + '/exe'):
pid_list.append(pid)
return pid_list
def get_non_decimal_pids():
"""
"""
non_decimal_list = []
for pid in pid_list:
if pid[0].isdecimal() is False:
non_decimal_list.append(pid)
return non_decimal_list
continue
if is_alive(pid):
pid_list.append(pid)
pid_list.remove(self_pid)
if '1' in pid_list:
pid_list.remove('1')
return pid_list
def find_victim(_print_proc_table):
@ -1541,18 +1565,7 @@ def find_victim(_print_proc_table):
ft1 = monotonic()
pid_list = get_pid_list()
pid_list.remove(self_pid)
if '1' in pid_list:
pid_list.remove('1')
non_decimal_list = get_non_decimal_pids()
for i in non_decimal_list:
if i in pid_list:
pid_list.remove(i)
pid_list = alive_pid_list()
pid_badness_list = []
@ -1584,6 +1597,7 @@ def find_victim(_print_proc_table):
hr = '#' * 107
log('Tasks state (memory values in mebibytes):')
log(hr)
log('# PID PPID badness oom_score oom_score_adj e'
'UID S VmSize VmRSS VmSwap Name {}'.format(
@ -1674,7 +1688,7 @@ def find_victim(_print_proc_table):
if _print_proc_table:
log(hr)
log('Found {} processes with existing /proc/[pid]/exe realpath'.format(
log('Found {} tasks with non-zero VmRSS (except init and self)'.format(
real_proc_num))
log(
@ -1705,12 +1719,6 @@ def find_victim_info(pid, victim_badness, name):
state = line.split('\t')[1].rstrip()
continue
"""
if n is ppid_index:
# ppid = line.split('\t')[1]
continue
"""
if n is uid_index:
uid = line.split('\t')[2]
continue
@ -1750,14 +1758,16 @@ def find_victim_info(pid, victim_badness, name):
oom_score_adj = rline1('/proc/' + pid + '/oom_score_adj')
except FileNotFoundError:
log('The victim died in the search process: FileNotFoundError')
update_stat_dict_and_print(
'The victim died in the search process: FileNotFoundError')
x = 'The victim died in the search process: FileNotFoundError'
log(x)
update_stat_dict(x)
print_stat_dict()
return None
except ProcessLookupError:
log('The victim died in the search process: ProcessLookupError')
update_stat_dict_and_print(
'The victim died in the search process: ProcessLookupError')
x = 'The victim died in the search process: ProcessLookupError'
log(x)
update_stat_dict(x)
print_stat_dict()
return None
except UnicodeDecodeError:
@ -1803,43 +1813,46 @@ def find_victim_info(pid, victim_badness, name):
oom_score_adj = rline1('/proc/' + pid + '/oom_score_adj')
except IndexError:
log('The victim died in the search process: IndexError')
update_stat_dict_and_print(
'The victim died in the search process: IndexError')
x = 'The victim died in the search process: IndexError'
log(x)
update_stat_dict(x)
print_stat_dict()
return None
except ValueError:
log('The victim died in the search process: ValueError')
update_stat_dict_and_print(
'The victim died in the search process: ValueError')
x = 'The victim died in the search process: ValueError'
log(x)
update_stat_dict(x)
print_stat_dict()
return None
except FileNotFoundError:
log('The victim died in the search process: FileNotFoundError')
update_stat_dict_and_print(
'The victim died in the search process: FileNotFoundError')
x = 'The victim died in the search process: FileNotFoundError'
log(x)
update_stat_dict(x)
print_stat_dict()
return None
except ProcessLookupError:
log('The victim died in the search process: ProcessLookupError')
update_stat_dict_and_print(
'The victim died in the search process: ProcessLookupError')
x = 'The victim died in the search process: ProcessLookupError'
log(x)
update_stat_dict(x)
print_stat_dict()
return None
len_vm = len(str(vm_size))
try:
realpath = os.path.realpath('/proc/' + pid + '/exe')
cwd = os.path.realpath('/proc/' + pid + '/cwd')
realpath = pid_to_realpath(pid)
cwd = pid_to_cwd(pid)
nssid = pid_to_nssid(pid)
victim_lifetime = format_time(uptime() - pid_to_starttime(pid))
victim_cgroup_v1 = pid_to_cgroup_v1(pid)
victim_cgroup_v2 = pid_to_cgroup_v2(pid)
except FileNotFoundError:
log('The victim died in the search process: FileNotFoundError')
update_stat_dict_and_print(
'The victim died in the search process: FileNotFoundError')
x = 'The victim died in the search process: FileNotFoundError'
log(x)
update_stat_dict(x)
print_stat_dict()
return None
ancestry = pid_to_ancestry(pid, max_victim_ancestry_depth)
@ -2156,8 +2169,7 @@ def is_victim_alive(victim_id):
if victim_id != new_victim_id:
return 0
exe_exists = os.path.exists('/proc/{}/exe'.format(pid))
if exe_exists:
if is_alive(pid):
return 1
state = pid_to_state(pid)
@ -2295,8 +2307,6 @@ def implement_corrective_action(
vwd = None # Victim Will Die
if victim_badness >= min_badness:
if threshold is SIGTERM:
if victim_id in v_dict:
dt = monotonic() - v_dict[victim_id]['time']
@ -2318,6 +2328,8 @@ def implement_corrective_action(
return psi_t0
if victim_badness >= min_badness:
if print_victim_status:
victim_info = find_victim_info(pid, victim_badness, name)
log(victim_info)
@ -2345,54 +2357,89 @@ def implement_corrective_action(
soft_match = True
break
start_action = monotonic()
if soft_match:
cmd = command.replace('$PID', pid).replace('$NAME', pid_to_name(
pid)).replace('$SERVICE', service)
start_thread(exe, cmd)
response_time = monotonic() - time0
preventing_oom_message = 'Implementing a corrective action:\n ' \
'Executing the command: {}'.format(cmd)
exit_status = None
log(preventing_oom_message)
preventing_oom_message = 'Implement a corrective act' \
'ion:\n Run the command: {}' \
'\n Exit status: {}; total response ' \
'time: {} ms'.format(
cmd,
exit_status,
round(response_time * 1000))
err = start_thread(exe, cmd)
if err == 1:
key = 'Cannot execute the command in the new thread'
update_stat_dict(key)
log(key)
else:
try:
os.kill(int(pid), threshold)
update_stat_dict('Executing the command "{}"'.format(command))
response_time = monotonic() - time0
send_result = 'total response time: {} ms'.format(
log('Total response time: {} ms'.format(round(
response_time * 1000)))
print_stat_dict()
else:
preventing_oom_message = 'Implementing a corrective action:\n ' \
'Sending {} to the victim'.format(
sig_dict[threshold])
log(preventing_oom_message)
try:
os.kill(int(pid), threshold)
update_stat_dict(
'[ OK ] Sending {} to {}'.format(sig_dict[threshold], name)
)
response_time = monotonic() - time0
send_result = 'OK; total response time: {} ms'.format(
round(response_time * 1000))
preventing_oom_message = 'Implementing a corrective action:' \
'\n Sending {} to the victim; {}'.format(
sig_dict[threshold], send_result)
log(send_result)
if threshold is SIGKILL:
vwd = True
except FileNotFoundError:
vwd = True
key = 'The victim died in the search process: ' \
'FileNotFoundError'
except ProcessLookupError:
vwd = True
key = 'The victim died in the search process: ' \
'ProcessLookupError'
print_stat_dict()
try:
log(preventing_oom_message)
except UnboundLocalError:
pass
except FileNotFoundError:
vwd = True
key = 'Cannot send a signal: FileNotFoundError'
update_stat_dict(key)
print_stat_dict()
log(key)
except ProcessLookupError:
vwd = True
key = 'Cannot send a signal: ProcessLookupError'
update_stat_dict(key)
print_stat_dict()
log(key)
except PermissionError:
vwd = False
key = 'Cannot send a signal: PermissionError'
log(key)
update_stat_dict(key)
print_stat_dict()
log('Sleep {}s'.format(post_soft_action_delay))
sleep(post_soft_action_delay)
# do not send signal twice!
if not vwd:
if victim_id not in v_dict:
@ -2404,6 +2451,8 @@ def implement_corrective_action(
last_action_dict['t'] = kill_timestamp = monotonic()
kill_timestamp = start_action
while True:
sleep(0.01)
d = monotonic() - kill_timestamp
@ -2449,13 +2498,6 @@ def implement_corrective_action(
': {} MiB, SwapFree: {} MiB'.format(
round(ma_mib, 1), round(sf_mib, 1)))
if soft_match is False:
key = 'Sending {} to {}'.format(sig_dict[threshold], name)
update_stat_dict_and_print(key)
else:
key = "Run the command '{}'".format(command)
update_stat_dict_and_print(key)
if threshold is SIGKILL and post_kill_exe != '':
cmd = post_kill_exe.replace('$PID', pid).replace(
@ -2474,8 +2516,11 @@ def implement_corrective_action(
else:
response_time = monotonic() - time0
victim_badness_is_too_small = 'victim badness ({}) < min_b' \
'adness ({}); nothing to do; response time: {} ms'.format(
victim_badness_is_too_small = 'victim (PID: {}, Name: {}) badness ' \
'({}) < min_badness ({}); nothing to do; response tim' \
'e: {} ms'.format(
pid, name,
victim_badness,
min_badness,
round(response_time * 1000))
@ -2484,7 +2529,8 @@ def implement_corrective_action(
# update stat_dict
key = 'victim badness < min_badness'
update_stat_dict_and_print(key)
update_stat_dict(key)
print_stat_dict()
if vwd is None:
@ -2696,7 +2742,7 @@ separate_log = False # will be overwritten after parse config
cgroup_v1_index, cgroup_v2_index = find_cgroup_indexes()
pid_list = get_pid_list()
pid_list = alive_pid_list()
print_proc_table_flag = False