From 8e931f8f00ba2486cdf4d2ad4b04f38e6db3199d Mon Sep 17 00:00:00 2001 From: Alexey Avramov Date: Sun, 15 Mar 2020 21:10:01 +0900 Subject: [PATCH] Various improvements and fixes Fix Something went wrong #56 Improve implement_corrective_action(): - Check VmRSS instead od /proc/[pid]/exe realpath to check process existene - Improve displaying total stat Improve check_permissions() Other improvements --- nohang/nohang | 354 ++++++++++++++++++++++++++++---------------------- 1 file changed, 200 insertions(+), 154 deletions(-) diff --git a/nohang/nohang b/nohang/nohang index a330fff..d22e021 100755 --- a/nohang/nohang +++ b/nohang/nohang @@ -17,8 +17,15 @@ def check_permissions(): try: os.path.realpath('/proc/1/exe') except Exception as e: - print('ERROR: missing CAP_SYS_PTRACE: {}'.format(e)) - exit(1) + print('WARNING: missing CAP_SYS_PTRACE: {}'.format(e)) + try: + os.kill(2, SIGTERM) + except Exception as e: + print('WARNING: cannot send a signal: {}'.format(e)) + try: + rline1('/proc/1/oom_score') + except Exception as e: + print('WARNING: {}'.format(e)) def memload(): @@ -55,8 +62,11 @@ def memload(): ' '.format( round(mem_available / 1024), round(swap_free / 1024))) - - luid_init = rline1('/proc/1/loginuid') + try: + luid_init = rline1('/proc/1/loginuid') + except Exception as e: + print(e) + exit(1) luid_self = rline1('/proc/self/loginuid') if luid_init == luid_self: print('The option is available only for logged in users.') @@ -119,24 +129,28 @@ def exe(cmd): cmd_num_dict['cmd_num'] += 1 cmd_num = cmd_num_dict['cmd_num'] + th_name = threading.current_thread().getName() - log('Execute the command ({}) in {}: {}'.format( + log('Executing the command ({}) in {}: {}'.format( cmd_num, - threading.current_thread().getName(), + th_name, cmd_list)) t3 = monotonic() - with Popen(cmd_list) as proc: - try: - proc.wait(timeout=exe_timeout) - exit_status = proc.poll() - t4 = monotonic() - log('Command ({}) execution completed in {} sec; exit status' - ': {}'.format(cmd_num, round(t4 - t3, 3), exit_status)) - except TimeoutExpired: - proc.kill() - t4 = monotonic() - log('TimeoutExpired for the command ({}) in {} sec'.format( - cmd_num, round(t4 - t3, 3))) + try: + with Popen(cmd_list) as proc: + try: + proc.wait(timeout=exe_timeout) + exit_status = proc.poll() + t4 = monotonic() + log('Command ({}) execution completed in {} sec; exit status' + ': {}'.format(cmd_num, round(t4 - t3, 3), exit_status)) + except TimeoutExpired: + proc.kill() + t4 = monotonic() + log('TimeoutExpired for the command ({}) in {} sec'.format( + cmd_num, round(t4 - t3, 3))) + except Exception as e: + log('Exception in {}: {}'.format(th_name, e)) def start_thread(func, *a, **k): @@ -146,7 +160,6 @@ def start_thread(func, *a, **k): th_name = th.getName() if debug_threading: - log('Starting {} from {}'.format( th_name, threading.current_thread().getName() )) @@ -221,7 +234,7 @@ def root_notify_env(): # iterates over processes, find processes with suitable env for pid in os.listdir('/proc'): - if os.path.exists('/proc/' + pid + '/exe') is True: + if is_alive(pid): one_env = re_pid_environ(pid) unsorted_envs_list.append(one_env) @@ -707,7 +720,7 @@ def signal_handler(signum, frame): log('Got the {} signal '.format( sig_dict[signum])) - update_stat_dict_and_print(None) + print_stat_dict() m1 = monotonic() pt1 = process_time() @@ -965,22 +978,26 @@ def pid_to_realpath(pid): """ """ try: - return os.path.realpath('/proc/' + pid + '/exe') + return os.path.realpath('/proc/{}/exe'.format(pid)) except FileNotFoundError: return '' except ProcessLookupError: return '' + except PermissionError: + return '' def pid_to_cwd(pid): """ """ try: - return os.path.realpath('/proc/' + pid + '/cwd') + return os.path.realpath('/proc/{}/cwd'.format(pid)) except FileNotFoundError: return '' except ProcessLookupError: return '' + except PermissionError: + return '' def pid_to_uid(pid): @@ -1275,38 +1292,30 @@ def mlockall(): # log('All memory locked with MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT') -def update_stat_dict_and_print(key): +def update_stat_dict(key): """ """ if key is not None: - if key not in stat_dict: - stat_dict.update({key: 1}) - else: - new_value = stat_dict[key] + 1 stat_dict.update({key: new_value}) + +def print_stat_dict(): + """ + """ if print_statistics: - lsd = len(stat_dict) - if lsd == 0: - log('No corrective actions applied in the last {}'.format( format_time(monotonic() - start_time))) - else: - stats_msg = 'What happened in the last {}:'.format( format_time(monotonic() - start_time)) - for i in stat_dict: - stats_msg += '\n {}: {}'.format(i, stat_dict[i]) - log(stats_msg) @@ -1339,7 +1348,7 @@ def find_psi_metrics_value(psi_path, psi_metrics): def check_mem_and_swap(): """ """ - with open('/proc/meminfo', 'rb', buffering=0) as f: + with open('/proc/meminfo', 'rb') as f: m_list = f.read().decode().split(' kB\n') return (int(m_list[2].split(':')[1]), int(m_list[swap_total_index].split(':')[1]), @@ -1512,25 +1521,40 @@ def zram_stat(zram_id): return disksize, mem_used_total # BYTES, str -def get_pid_list(): +def is_alive(pid): + """ + """ + try: + with open('/proc/{}/statm'.format(pid), 'rb', buffering=0) as f: + rss = f.read().decode().split(' ')[1] + if rss != '0': + return True + except FileNotFoundError: + return False + except ProcessLookupError: + return False + except NotADirectoryError: + return False + + +def alive_pid_list(): """ - Find pid list expect kthreads and zombies """ pid_list = [] for pid in os.listdir('/proc'): - if os.path.exists('/proc/' + pid + '/exe'): - pid_list.append(pid) - return pid_list - -def get_non_decimal_pids(): - """ - """ - non_decimal_list = [] - for pid in pid_list: if pid[0].isdecimal() is False: - non_decimal_list.append(pid) - return non_decimal_list + continue + + if is_alive(pid): + pid_list.append(pid) + + pid_list.remove(self_pid) + + if '1' in pid_list: + pid_list.remove('1') + + return pid_list def find_victim(_print_proc_table): @@ -1541,18 +1565,7 @@ def find_victim(_print_proc_table): ft1 = monotonic() - pid_list = get_pid_list() - - pid_list.remove(self_pid) - - if '1' in pid_list: - pid_list.remove('1') - - non_decimal_list = get_non_decimal_pids() - - for i in non_decimal_list: - if i in pid_list: - pid_list.remove(i) + pid_list = alive_pid_list() pid_badness_list = [] @@ -1584,6 +1597,7 @@ def find_victim(_print_proc_table): hr = '#' * 107 + log('Tasks state (memory values in mebibytes):') log(hr) log('# PID PPID badness oom_score oom_score_adj e' 'UID S VmSize VmRSS VmSwap Name {}'.format( @@ -1674,7 +1688,7 @@ def find_victim(_print_proc_table): if _print_proc_table: log(hr) - log('Found {} processes with existing /proc/[pid]/exe realpath'.format( + log('Found {} tasks with non-zero VmRSS (except init and self)'.format( real_proc_num)) log( @@ -1705,12 +1719,6 @@ def find_victim_info(pid, victim_badness, name): state = line.split('\t')[1].rstrip() continue - """ - if n is ppid_index: - # ppid = line.split('\t')[1] - continue - """ - if n is uid_index: uid = line.split('\t')[2] continue @@ -1750,14 +1758,16 @@ def find_victim_info(pid, victim_badness, name): oom_score_adj = rline1('/proc/' + pid + '/oom_score_adj') except FileNotFoundError: - log('The victim died in the search process: FileNotFoundError') - update_stat_dict_and_print( - 'The victim died in the search process: FileNotFoundError') + x = 'The victim died in the search process: FileNotFoundError' + log(x) + update_stat_dict(x) + print_stat_dict() return None except ProcessLookupError: - log('The victim died in the search process: ProcessLookupError') - update_stat_dict_and_print( - 'The victim died in the search process: ProcessLookupError') + x = 'The victim died in the search process: ProcessLookupError' + log(x) + update_stat_dict(x) + print_stat_dict() return None except UnicodeDecodeError: @@ -1803,43 +1813,46 @@ def find_victim_info(pid, victim_badness, name): oom_score_adj = rline1('/proc/' + pid + '/oom_score_adj') except IndexError: - log('The victim died in the search process: IndexError') - update_stat_dict_and_print( - 'The victim died in the search process: IndexError') + x = 'The victim died in the search process: IndexError' + log(x) + update_stat_dict(x) + print_stat_dict() return None except ValueError: - log('The victim died in the search process: ValueError') - update_stat_dict_and_print( - 'The victim died in the search process: ValueError') + x = 'The victim died in the search process: ValueError' + log(x) + update_stat_dict(x) + print_stat_dict() return None + except FileNotFoundError: - log('The victim died in the search process: FileNotFoundError') - update_stat_dict_and_print( - 'The victim died in the search process: FileNotFoundError') + x = 'The victim died in the search process: FileNotFoundError' + log(x) + update_stat_dict(x) + print_stat_dict() return None except ProcessLookupError: - log('The victim died in the search process: ProcessLookupError') - update_stat_dict_and_print( - 'The victim died in the search process: ProcessLookupError') + x = 'The victim died in the search process: ProcessLookupError' + log(x) + update_stat_dict(x) + print_stat_dict() return None len_vm = len(str(vm_size)) try: - realpath = os.path.realpath('/proc/' + pid + '/exe') - - cwd = os.path.realpath('/proc/' + pid + '/cwd') - + realpath = pid_to_realpath(pid) + cwd = pid_to_cwd(pid) nssid = pid_to_nssid(pid) - victim_lifetime = format_time(uptime() - pid_to_starttime(pid)) victim_cgroup_v1 = pid_to_cgroup_v1(pid) victim_cgroup_v2 = pid_to_cgroup_v2(pid) except FileNotFoundError: - log('The victim died in the search process: FileNotFoundError') - update_stat_dict_and_print( - 'The victim died in the search process: FileNotFoundError') + x = 'The victim died in the search process: FileNotFoundError' + log(x) + update_stat_dict(x) + print_stat_dict() return None ancestry = pid_to_ancestry(pid, max_victim_ancestry_depth) @@ -2156,8 +2169,7 @@ def is_victim_alive(victim_id): if victim_id != new_victim_id: return 0 - exe_exists = os.path.exists('/proc/{}/exe'.format(pid)) - if exe_exists: + if is_alive(pid): return 1 state = pid_to_state(pid) @@ -2295,29 +2307,29 @@ def implement_corrective_action( vwd = None # Victim Will Die + if threshold is SIGTERM: + if victim_id in v_dict: + dt = monotonic() - v_dict[victim_id]['time'] + if dt > max_soft_exit_time: + log('max_soft_exit_time is exceeded: the ' + 'victim will get SIGKILL') + threshold = SIGKILL + else: + log('max_soft_exit_time is not exceeded (' + '{} < {}) for the victim'.format(round( + dt, 1), max_soft_exit_time)) + + if debug_sleep: + log('Sleep {} sec (over_sleep)'.format(over_sleep)) + sleep(over_sleep) + + log('<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<' + '<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<') + + return psi_t0 + if victim_badness >= min_badness: - if threshold is SIGTERM: - if victim_id in v_dict: - dt = monotonic() - v_dict[victim_id]['time'] - if dt > max_soft_exit_time: - log('max_soft_exit_time is exceeded: the ' - 'victim will get SIGKILL') - threshold = SIGKILL - else: - log('max_soft_exit_time is not exceeded (' - '{} < {}) for the victim'.format(round( - dt, 1), max_soft_exit_time)) - - if debug_sleep: - log('Sleep {} sec (over_sleep)'.format(over_sleep)) - sleep(over_sleep) - - log('<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<' - '<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<') - - return psi_t0 - if print_victim_status: victim_info = find_victim_info(pid, victim_badness, name) log(victim_info) @@ -2345,54 +2357,89 @@ def implement_corrective_action( soft_match = True break + start_action = monotonic() + if soft_match: cmd = command.replace('$PID', pid).replace('$NAME', pid_to_name( pid)).replace('$SERVICE', service) - start_thread(exe, cmd) + + preventing_oom_message = 'Implementing a corrective action:\n ' \ + 'Executing the command: {}'.format(cmd) + + log(preventing_oom_message) + + err = start_thread(exe, cmd) + + if err == 1: + key = 'Cannot execute the command in the new thread' + update_stat_dict(key) + log(key) + + else: + + update_stat_dict('Executing the command "{}"'.format(command)) response_time = monotonic() - time0 - exit_status = None + log('Total response time: {} ms'.format(round( + response_time * 1000))) - preventing_oom_message = 'Implement a corrective act' \ - 'ion:\n Run the command: {}' \ - '\n Exit status: {}; total response ' \ - 'time: {} ms'.format( - cmd, - exit_status, - round(response_time * 1000)) + print_stat_dict() else: + preventing_oom_message = 'Implementing a corrective action:\n ' \ + 'Sending {} to the victim'.format( + sig_dict[threshold]) + + log(preventing_oom_message) + try: + os.kill(int(pid), threshold) + update_stat_dict( + '[ OK ] Sending {} to {}'.format(sig_dict[threshold], name) + ) + response_time = monotonic() - time0 - send_result = 'total response time: {} ms'.format( + send_result = 'OK; total response time: {} ms'.format( round(response_time * 1000)) - preventing_oom_message = 'Implementing a corrective action:' \ - '\n Sending {} to the victim; {}'.format( - sig_dict[threshold], send_result) + log(send_result) if threshold is SIGKILL: vwd = True - except FileNotFoundError: - vwd = True - key = 'The victim died in the search process: ' \ - 'FileNotFoundError' - except ProcessLookupError: - vwd = True - key = 'The victim died in the search process: ' \ - 'ProcessLookupError' + print_stat_dict() - try: - log(preventing_oom_message) - except UnboundLocalError: - pass + except FileNotFoundError: + + vwd = True + key = 'Cannot send a signal: FileNotFoundError' + update_stat_dict(key) + print_stat_dict() + log(key) + + except ProcessLookupError: + + vwd = True + key = 'Cannot send a signal: ProcessLookupError' + update_stat_dict(key) + print_stat_dict() + log(key) + + except PermissionError: + vwd = False + key = 'Cannot send a signal: PermissionError' + log(key) + update_stat_dict(key) + print_stat_dict() + log('Sleep {}s'.format(post_soft_action_delay)) + sleep(post_soft_action_delay) + # do not send signal twice! if not vwd: if victim_id not in v_dict: @@ -2404,6 +2451,8 @@ def implement_corrective_action( last_action_dict['t'] = kill_timestamp = monotonic() + kill_timestamp = start_action + while True: sleep(0.01) d = monotonic() - kill_timestamp @@ -2449,13 +2498,6 @@ def implement_corrective_action( ': {} MiB, SwapFree: {} MiB'.format( round(ma_mib, 1), round(sf_mib, 1))) - if soft_match is False: - key = 'Sending {} to {}'.format(sig_dict[threshold], name) - update_stat_dict_and_print(key) - else: - key = "Run the command '{}'".format(command) - update_stat_dict_and_print(key) - if threshold is SIGKILL and post_kill_exe != '': cmd = post_kill_exe.replace('$PID', pid).replace( @@ -2474,8 +2516,11 @@ def implement_corrective_action( else: response_time = monotonic() - time0 - victim_badness_is_too_small = 'victim badness ({}) < min_b' \ - 'adness ({}); nothing to do; response time: {} ms'.format( + + victim_badness_is_too_small = 'victim (PID: {}, Name: {}) badness ' \ + '({}) < min_badness ({}); nothing to do; response tim' \ + 'e: {} ms'.format( + pid, name, victim_badness, min_badness, round(response_time * 1000)) @@ -2484,7 +2529,8 @@ def implement_corrective_action( # update stat_dict key = 'victim badness < min_badness' - update_stat_dict_and_print(key) + update_stat_dict(key) + print_stat_dict() if vwd is None: @@ -2696,7 +2742,7 @@ separate_log = False # will be overwritten after parse config cgroup_v1_index, cgroup_v2_index = find_cgroup_indexes() -pid_list = get_pid_list() +pid_list = alive_pid_list() print_proc_table_flag = False