From 24173cbc47397cdda4187c652d959dbd7ac960e4 Mon Sep 17 00:00:00 2001 From: Alexey Avramov Date: Wed, 1 Apr 2020 03:31:57 +0900 Subject: [PATCH] Print top-15 task by badness before corrective action --- nohang/nohang | 98 ++++++++++++++++++++++++++++++++++----------------- 1 file changed, 65 insertions(+), 33 deletions(-) diff --git a/nohang/nohang b/nohang/nohang index 441657c..e7c0c94 100755 --- a/nohang/nohang +++ b/nohang/nohang @@ -1503,42 +1503,51 @@ def badness_pid_list(): def fast_find_victim(): """ """ - ft1 = monotonic() - pid_badness_list = badness_pid_list() - real_proc_num = len(pid_badness_list) if real_proc_num == 0: - log('Found {} tasks with non-zero oom_score (except init and ' - 'self)'.format(real_proc_num)) + log('Found {} tasks with non-zero oom_score (except init and self) ' + 'in {}ms'.format(real_proc_num, round((monotonic() - ft1) * 1000))) return None - # Make list of (pid, badness) tuples, sorted by 'badness' values - # print(pid_badness_list) - pid_tuple_list = sorted( - pid_badness_list, key=itemgetter(1), reverse=True)[0] + log('Found {} tasks with non-zero oom_score (except init and self) ' + 'in {}ms'.format(real_proc_num, round((monotonic() - ft1) * 1000))) - pid = pid_tuple_list[0] + # Make list of (pid, badness) tuples, sorted by 'badness' values + pid_badness_list_sorted = sorted( + pid_badness_list, + key=itemgetter(1), + reverse=True) + + m0 = monotonic() + top_n = 15 + if real_proc_num < top_n: + top_n = real_proc_num + log('TOP-{} tasks by badness:'.format(top_n)) + log(' Name PID badness') + log(' --------------- ------- -------') + for pid_badness in pid_badness_list_sorted[0:top_n]: + p = pid_badness[0] + b = str(pid_badness[1]) + n = pid_to_name(p) + log(' {} {} {}'.format(n.ljust(15), p.rjust(7), b.rjust(7))) + + pid = pid_badness_list_sorted[0][0] victim_id = get_victim_id(pid) # Get maximum 'badness' value - victim_badness = pid_tuple_list[1] + victim_badness = pid_badness_list_sorted[0][1] victim_name = pid_to_name(pid) - log('Found {} tasks with non-zero oom_score (except init and self)'.format( - real_proc_num)) - - log( - 'Process with highest badness (found in {} ms):\n PID: {}, Na' + log('TOP printed in {}ms; process with highest badness:\n PID: {}, na' 'me: {}, badness: {}'.format( - round((monotonic() - ft1) * 1000), + round((monotonic() - m0) * 1000), pid, victim_name, victim_badness - ) - ) + )) return pid, victim_badness, victim_name, victim_id @@ -1681,7 +1690,7 @@ def find_victim(_print_proc_table): real_proc_num)) log( - 'Process with highest badness (found in {} ms):\n PID: {}, Na' + 'Process with highest badness (found in {}ms):\n PID: {}, Na' 'me: {}, badness: {}'.format( round((monotonic() - ft1) * 1000), pid, @@ -1790,7 +1799,7 @@ def find_victim_info(pid, victim_badness, name): else: detailed_rss_info = '' - victim_info = 'Victim status (found in {} ms):' \ + victim_info = 'Victim status (found in {}ms):' \ '\n Name: {}' \ '\n State: {}' \ '\n PID: {}' \ @@ -2183,12 +2192,27 @@ def implement_corrective_action( log('New victim is cached victim {} ({})'.format(pid, name)) else: + s1 = set(os.listdir('/proc')) fff = find_victim(print_proc_table) + # sleep(0.1) + s2 = set(os.listdir('/proc')) + dset = s1 - s2 + + if len(dset) > 0: + log('During the search for the victim, the processes were ' + 'completed: {}'.format(dset)) + + sleep(over_sleep) + + log('<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<' + '<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<') + + return psi_t0 if fff is None: if debug_sleep: - log('Sleep {} sec'.format(over_sleep)) + log('Sleep {}s'.format(over_sleep)) sleep(over_sleep) log('<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<' @@ -2274,7 +2298,7 @@ def implement_corrective_action( dt, 1), max_soft_exit_time)) if debug_sleep: - log('Sleep {} sec'.format(over_sleep)) + log('Sleep {}s'.format(over_sleep)) sleep(over_sleep) log('<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<' @@ -2286,7 +2310,15 @@ def implement_corrective_action( if print_victim_status: victim_info = find_victim_info(pid, victim_badness, name) - log(victim_info) + if victim_info is not None: + log(victim_info) + else: + sleep(over_sleep) + + log('<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<' + '<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<') + + return psi_t0 soft_match = False if soft_actions and threshold is SIGTERM: @@ -2353,7 +2385,7 @@ def implement_corrective_action( response_time = monotonic() - time0 - log('Total response time: {} ms'.format(round( + log('Total response time: {}ms'.format(round( response_time * 1000))) print_stat_dict() @@ -2376,7 +2408,7 @@ def implement_corrective_action( response_time = monotonic() - time0 - send_result = 'OK; total response time: {} ms'.format( + send_result = 'OK; total response time: {}ms'.format( round(response_time * 1000)) log(send_result) @@ -2441,12 +2473,12 @@ def implement_corrective_action( if vwd and d > sensitivity_test_time + 10: log('The victim doesn\'t respond on corrective action' - ' in {} sec'.format(round(d, 3))) + ' in {}s'.format(round(d, 3))) break if not vwd and d > sensitivity_test_time: log('The victim doesn\'t respond on corrective action' - ' in {} sec'.format(round(d, 3))) + ' in {}s'.format(round(d, 3))) break elif iva == 2: @@ -2454,7 +2486,7 @@ def implement_corrective_action( else: - log('The victim became a zombie in {} sec'.format(round(d, 3))) + log('The victim became a zombie in {}s'.format(round(d, 3))) if victim_id in v_dict: v_dict.pop(victim_id) @@ -2490,7 +2522,7 @@ def implement_corrective_action( victim_badness_is_too_small = 'victim (PID: {}, Name: {}) badness ' \ '({}) < min_badness ({}); nothing to do; response tim' \ - 'e: {} ms'.format( + 'e: {}ms'.format( pid, name, victim_badness, min_badness, @@ -2506,7 +2538,7 @@ def implement_corrective_action( if vwd is None: if debug_sleep: - log('Sleep {} sec'.format(over_sleep)) + log('Sleep {}s'.format(over_sleep)) sleep(over_sleep) log('<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<' @@ -2520,7 +2552,7 @@ def sleep_after_check_mem(): if stable_sleep: if debug_sleep: - log('Sleep {} sec'.format(min_sleep)) + log('Sleep {}s'.format(min_sleep)) stdout.flush() sleep(min_sleep) return None @@ -2572,7 +2604,7 @@ def sleep_after_check_mem(): pass if debug_sleep: - log('Sleep {} sec (t_mem={}, t_swap={}{})'.format(round(t, 2), round( + log('Sleep {}s (t_mem={}, t_swap={}{})'.format(round(t, 2), round( t_mem, 2), round(t_swap, 2), z)) stdout.flush()