From 0b9be5a41c48b6a24325497c6635bb058281c1dd Mon Sep 17 00:00:00 2001 From: Alexey Avramov Date: Sun, 29 Mar 2020 05:21:19 +0900 Subject: [PATCH] speeding up the search for the victim --- nohang/nohang | 334 ++++++++++++++++++++++++-------------------------- 1 file changed, 157 insertions(+), 177 deletions(-) diff --git a/nohang/nohang b/nohang/nohang index 170d19f..49d0b04 100755 --- a/nohang/nohang +++ b/nohang/nohang @@ -290,23 +290,27 @@ def pop(cmd, username): )) if swap_total == 0: - wait_time = 5 + wait_time = 10 else: - wait_time = 25 + wait_time = 30 t3 = monotonic() - with Popen(cmd) as proc: - try: - proc.wait(timeout=wait_time) - err = proc.poll() - t4 = monotonic() - except TimeoutExpired: - proc.kill() - t4 = monotonic() + try: + with Popen(cmd) as proc: + try: + proc.wait(timeout=wait_time) + err = proc.poll() + t4 = monotonic() + except TimeoutExpired: + proc.kill() + t4 = monotonic() - if debug_gui_notifications: - log('TimeoutExpired: notify user: {}'.format(username)) + if debug_gui_notifications: + log('TimeoutExpired: notify user: {}'.format(username)) + except Exception as e: + th_name = threading.current_thread().getName() + log('Exception in {}: {}'.format(th_name, e)) if debug_gui_notifications: log('Popen time: {} sec; exit status: {}; cmd: {}'.format( @@ -472,7 +476,6 @@ def check_config(): log(' fill_rate_zram: {}'.format(fill_rate_zram)) log(' max_sleep: {} sec'.format(max_sleep)) log(' min_sleep: {} sec'.format(min_sleep)) - log(' over_sleep: {} sec'.format(over_sleep)) log('\n4. Warnings and notifications') @@ -732,7 +735,7 @@ def signal_handler(signum, frame): def signal_handler_inner(signum, frame): """ """ - log('Signal handler called with the {} signal (ignored) '.format( + log('Got the {} signal (ignored) '.format( sig_dict[signum])) @@ -887,7 +890,7 @@ def pid_to_name(pid): """ """ try: - with open('/proc/' + pid + '/comm', 'rb') as f: + with open('/proc/{}/comm'.format(pid), 'rb', buffering=0) as f: return f.read().decode('utf-8', 'ignore')[:-1] except FileNotFoundError: return '' @@ -1001,23 +1004,25 @@ def pid_to_cwd(pid): def pid_to_uid(pid): """return euid""" try: - with open('/proc/' + pid + '/status') as f: - for n, line in enumerate(f): - if n is uid_index: - return line.split('\t')[2] - except UnicodeDecodeError: - with open('/proc/' + pid + '/status', 'rb') as f: + with open('/proc/{}/status'.format(pid), 'rb', buffering=0) as f: f_list = f.read().decode('utf-8', 'ignore').split('\n') return f_list[uid_index].split('\t')[2] except FileNotFoundError: return '' + except ProcessLookupError: + return '' -def pid_to_badness(pid): +def pid_to_badness(pid, oom_score): """Find and modify badness (if it needs).""" + oom_score_adj = None + try: - oom_score = int(rline1('/proc/' + pid + '/oom_score')) + + if oom_score is None: + + oom_score = pid_to_oom_score(pid) if oom_score == 0: return oom_score, oom_score @@ -1025,7 +1030,7 @@ def pid_to_badness(pid): badness = oom_score if ignore_positive_oom_score_adj: - oom_score_adj = int(rline1('/proc/' + pid + '/oom_score_adj')) + oom_score_adj = pid_to_oom_score_adj(pid) if oom_score_adj > 0: badness = badness - oom_score_adj @@ -1038,8 +1043,7 @@ def pid_to_badness(pid): badness += badness_adj else: if oom_score_adj is None: - oom_score_adj = int(rline1( - '/proc/' + pid + '/oom_score_adj')) + oom_score_adj = pid_to_oom_score_adj(pid) if oom_score_adj >= 0: badness += badness_adj @@ -1052,8 +1056,7 @@ def pid_to_badness(pid): badness += badness_adj else: if oom_score_adj is None: - oom_score_adj = int(rline1( - '/proc/' + pid + '/oom_score_adj')) + oom_score_adj = pid_to_oom_score_adj(pid) if oom_score_adj >= 0: badness += badness_adj @@ -1066,8 +1069,7 @@ def pid_to_badness(pid): badness += badness_adj else: if oom_score_adj is None: - oom_score_adj = int(rline1( - '/proc/' + pid + '/oom_score_adj')) + oom_score_adj = pid_to_oom_score_adj(pid) if oom_score_adj >= 0: badness += badness_adj @@ -1080,8 +1082,7 @@ def pid_to_badness(pid): badness += badness_adj else: if oom_score_adj is None: - oom_score_adj = int(rline1( - '/proc/' + pid + '/oom_score_adj')) + oom_score_adj = pid_to_oom_score_adj(pid) if oom_score_adj >= 0: badness += badness_adj @@ -1094,8 +1095,7 @@ def pid_to_badness(pid): badness += badness_adj else: if oom_score_adj is None: - oom_score_adj = int(rline1( - '/proc/' + pid + '/oom_score_adj')) + oom_score_adj = pid_to_oom_score_adj(pid) if oom_score_adj >= 0: badness += badness_adj @@ -1108,8 +1108,7 @@ def pid_to_badness(pid): badness += badness_adj else: if oom_score_adj is None: - oom_score_adj = int(rline1( - '/proc/' + pid + '/oom_score_adj')) + oom_score_adj = pid_to_oom_score_adj(pid) if oom_score_adj >= 0: badness += badness_adj @@ -1122,8 +1121,7 @@ def pid_to_badness(pid): badness += badness_adj else: if oom_score_adj is None: - oom_score_adj = int(rline1( - '/proc/' + pid + '/oom_score_adj')) + oom_score_adj = pid_to_oom_score_adj(pid) if oom_score_adj >= 0: badness += badness_adj @@ -1136,8 +1134,7 @@ def pid_to_badness(pid): badness += badness_adj else: if oom_score_adj is None: - oom_score_adj = int(rline1( - '/proc/' + pid + '/oom_score_adj')) + oom_score_adj = pid_to_oom_score_adj(pid) if oom_score_adj >= 0: badness += badness_adj @@ -1157,58 +1154,7 @@ def pid_to_status(pid): """ try: - with open('/proc/' + pid + '/status') as f: - - for n, line in enumerate(f): - - if n == 0: - name = line.split('\t')[1][:-1] - - if n is state_index: - state = line.split('\t')[1][0] - continue - - if n is ppid_index: - ppid = line.split('\t')[1][:-1] - continue - - if n is uid_index: - uid = line.split('\t')[2] - continue - - if n is vm_size_index: - vm_size = kib_to_mib(int(line.split('\t')[1][:-4])) - continue - - if n is vm_rss_index: - vm_rss = kib_to_mib(int(line.split('\t')[1][:-4])) - continue - - if n is vm_swap_index: - vm_swap = kib_to_mib(int(line.split('\t')[1][:-4])) - break - - return name, state, ppid, uid, vm_size, vm_rss, vm_swap - - except UnicodeDecodeError: - return pid_to_status_unicode(pid) - - except FileNotFoundError: - return None - - except ProcessLookupError: - return None - - except ValueError: - return None - - -def pid_to_status_unicode(pid): - """ - """ - try: - - with open('/proc/' + pid + '/status', 'rb') as f: + with open('/proc/{}/status'.format(pid), 'rb', buffering=0) as f: f_list = f.read().decode('utf-8', 'ignore').split('\n') for i in range(len(f_list)): @@ -1512,12 +1458,101 @@ def alive_pid_list(): return pid_list +def pid_to_oom_score(pid): + try: + with open('/proc/{}/oom_score'.format(pid), 'rb', buffering=0) as f: + return int(f.read()) + except FileNotFoundError: + return 0 + except ProcessLookupError: + return 0 + except NotADirectoryError: + return 0 + + +def pid_to_oom_score_adj(pid): + try: + with open('/proc/{}/oom_score_adj'.format(pid), 'rb', buffering=0 + ) as f: + return int(f.read()) + except FileNotFoundError: + return 0 + except ProcessLookupError: + return 0 + except NotADirectoryError: + return 0 + + +def badness_pid_list(): + """ + """ + pid_b_list = [] + for pid in os.listdir('/proc'): + o = pid_to_oom_score(pid) + if o >= 1: + if pid[0].isdecimal() is False: + continue + if pid == self_pid or pid == '1': + continue + b = pid_to_badness(pid, o)[0] + # log('PID: {}, oom_score: {}, badness: {}, Name: {}'.format( + # pid, o, b, pid_to_name(pid))) + pid_b_list.append((pid, b)) + return pid_b_list + + +def fast_find_victim(): + """ + """ + + ft1 = monotonic() + + pid_badness_list = badness_pid_list() + + real_proc_num = len(pid_badness_list) + + if real_proc_num == 0: + log('Found {} tasks with non-zero oom_score (except init and ' + 'self)'.format(real_proc_num)) + return None + + # Make list of (pid, badness) tuples, sorted by 'badness' values + # print(pid_badness_list) + pid_tuple_list = sorted( + pid_badness_list, key=itemgetter(1), reverse=True)[0] + + pid = pid_tuple_list[0] + victim_id = get_victim_id(pid) + + # Get maximum 'badness' value + victim_badness = pid_tuple_list[1] + victim_name = pid_to_name(pid) + + log('Found {} tasks with non-zero oom_score (except init and self)'.format( + real_proc_num)) + + log( + 'Process with highest badness (found in {} ms):\n PID: {}, Na' + 'me: {}, badness: {}'.format( + round((monotonic() - ft1) * 1000), + pid, + victim_name, + victim_badness + ) + ) + + return pid, victim_badness, victim_name, victim_id + + def find_victim(_print_proc_table): """ Find the process with highest badness and its badness adjustment Return pid and badness """ + if not _print_proc_table: + return fast_find_victim() + ft1 = monotonic() pid_list = alive_pid_list() @@ -1562,7 +1597,7 @@ def find_victim(_print_proc_table): for pid in pid_list: - badness = pid_to_badness(pid)[0] + badness = pid_to_badness(pid, None)[0] if badness is None: continue @@ -1570,8 +1605,8 @@ def find_victim(_print_proc_table): if _print_proc_table: try: - oom_score = rline1('/proc/' + pid + '/oom_score') - oom_score_adj = rline1('/proc/' + pid + '/oom_score_adj') + oom_score = pid_to_oom_score(pid) + oom_score_adj = pid_to_oom_score_adj(pid) except FileNotFoundError: continue @@ -1609,8 +1644,8 @@ def find_victim(_print_proc_table): pid.rjust(7), ppid.rjust(7), str(badness).rjust(7), - oom_score.rjust(9), - oom_score_adj.rjust(13), + str(oom_score).rjust(9), + str(oom_score_adj).rjust(13), uid.rjust(10), state, str(vm_size).rjust(6), @@ -1666,67 +1701,7 @@ def find_victim_info(pid, victim_badness, name): try: - with open('/proc/' + pid + '/status') as f: - - for n, line in enumerate(f): - - if n is state_index: - state = line.split('\t')[1].rstrip() - continue - - if n is uid_index: - uid = line.split('\t')[2] - continue - - if n is vm_size_index: - vm_size = kib_to_mib(int(line.split('\t')[1][:-4])) - continue - - if n is vm_rss_index: - vm_rss = kib_to_mib(int(line.split('\t')[1][:-4])) - continue - - if detailed_rss: - - if n is anon_index: - anon_rss = kib_to_mib( - int(line.split('\t')[1][:-4])) - continue - - if n is file_index: - file_rss = kib_to_mib( - int(line.split('\t')[1][:-4])) - continue - - if n is shmem_index: - shmem_rss = kib_to_mib( - int(line.split('\t')[1][:-4])) - continue - - if n is vm_swap_index: - vm_swap = kib_to_mib(int(line.split('\t')[1][:-4])) - break - - if print_victim_cmdline: - cmdline = pid_to_cmdline(pid) - oom_score = rline1('/proc/' + pid + '/oom_score') - oom_score_adj = rline1('/proc/' + pid + '/oom_score_adj') - - except FileNotFoundError: - x = 'The victim died in the search process: FileNotFoundError' - log(x) - update_stat_dict(x) - print_stat_dict() - return None - except ProcessLookupError: - x = 'The victim died in the search process: ProcessLookupError' - log(x) - update_stat_dict(x) - print_stat_dict() - return None - except UnicodeDecodeError: - - with open('/proc/' + pid + '/status', 'rb') as f: + with open('/proc/{}/status'.format(pid), 'rb', buffering=0) as f: f_list = f.read().decode('utf-8', 'ignore').split('\n') for i in range(len(f_list)): @@ -1764,8 +1739,8 @@ def find_victim_info(pid, victim_badness, name): if print_victim_cmdline: cmdline = pid_to_cmdline(pid) - oom_score = rline1('/proc/' + pid + '/oom_score') - oom_score_adj = rline1('/proc/' + pid + '/oom_score_adj') + oom_score = pid_to_oom_score(pid) + oom_score_adj = pid_to_oom_score_adj(pid) except IndexError: x = 'The victim died in the search process: IndexError' @@ -1780,19 +1755,6 @@ def find_victim_info(pid, victim_badness, name): print_stat_dict() return None - except FileNotFoundError: - x = 'The victim died in the search process: FileNotFoundError' - log(x) - update_stat_dict(x) - print_stat_dict() - return None - except ProcessLookupError: - x = 'The victim died in the search process: ProcessLookupError' - log(x) - update_stat_dict(x) - print_stat_dict() - return None - len_vm = len(str(vm_size)) try: @@ -2217,11 +2179,25 @@ def implement_corrective_action( if x: victim_id = cached_victim_id pid = victim_id.partition('_pid')[2] - victim_badness = pid_to_badness(pid)[0] + victim_badness = pid_to_badness(pid, None)[0] name = v_dict[victim_id]['name'] log('New victim is cached victim {} ({})'.format(pid, name)) else: - pid, victim_badness, name, victim_id = find_victim(print_proc_table) + + fff = find_victim(print_proc_table) + + if fff is None: + + if debug_sleep: + log('Sleep {} sec'.format(over_sleep)) + sleep(over_sleep) + + log('<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<' + '<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<') + + return psi_t0 + + pid, victim_badness, name, victim_id = fff log('Recheck memory levels...') @@ -2269,6 +2245,8 @@ def implement_corrective_action( else: log('Thresholds is not exceeded now') + log('<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<' + '<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<') return psi_t0 for i in mem_info_list: @@ -2276,6 +2254,8 @@ def implement_corrective_action( if new_threshold is None or new_threshold == 'WARN': log('Thresholds is not exceeded now') + log('<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<' + '<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<') return psi_t0 threshold = new_threshold @@ -2295,7 +2275,7 @@ def implement_corrective_action( dt, 1), max_soft_exit_time)) if debug_sleep: - log('Sleep {} sec (over_sleep)'.format(over_sleep)) + log('Sleep {} sec'.format(over_sleep)) sleep(over_sleep) log('<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<' @@ -2510,7 +2490,7 @@ def implement_corrective_action( if vwd is None: if debug_sleep: - log('Sleep {} sec (over_sleep)'.format(over_sleep)) + log('Sleep {} sec'.format(over_sleep)) sleep(over_sleep) log('<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<'