diff --git a/nohang b/nohang index 2853228..d79e91f 100755 --- a/nohang +++ b/nohang @@ -2455,6 +2455,20 @@ else: exit(1) +if 'psi_excess_duration' in config_dict: + psi_excess_duration = string_to_float_convert_test( + config_dict['psi_excess_duration']) + if psi_excess_duration is None: + errprint('Invalid psi_excess_duration value, not float\nExit') + exit(1) + if psi_excess_duration < 0: + errprint('psi_excess_duration must be non-negative number\nExit') + exit(1) +else: + errprint('psi_excess_duration is not in config\nExit') + exit(1) + + if 'max_sleep' in config_dict: max_sleep = string_to_float_convert_test( config_dict['max_sleep']) @@ -2709,12 +2723,17 @@ log('Monitoring has started!') stdout.flush() +psi_kill_exceeded_timer = psi_term_exceeded_timer = delta0 = 0 + +x0 = time() + ########################################################################## while True: - # Q = time() + delta0 = time() - x0 + x0 = time() # FIND VALUES: mem, swap, zram, psi @@ -2751,7 +2770,7 @@ while True: psi_post_action_delay_exceeded = False if print_mem_check_results: - psi_avg_string = 'PSI avg value: {} | '.format( + psi_avg_string = 'PSI avg: {} | '.format( str(psi_avg_value).rjust(6)) if print_mem_check_results: @@ -2863,16 +2882,35 @@ while True: continue if CHECK_PSI: + if psi_avg_value >= sigkill_psi_threshold: sigkill_psi_exceeded = True + psi_kill_exceeded_timer += delta0 else: sigkill_psi_exceeded = False + psi_kill_exceeded_timer = 0 - if sigkill_psi_exceeded and psi_post_action_delay_exceeded: + if psi_debug: - mem_info = 'PSI avg value ({}) > sigkill_psi_thresh' \ - 'old ({})'.format( - psi_avg_value, sigkill_psi_threshold) + log('psi_post_action_delay_exceeded: {}\nsigkill_psi_exceeded' + ': {}\npsi_kill_exceeded_timer: {}'.format( + psi_post_action_delay_exceeded, + sigkill_psi_exceeded, + round(psi_kill_exceeded_timer, 1) + ) + ) + + if (psi_kill_exceeded_timer >= psi_excess_duration and + psi_post_action_delay_exceeded): + + mem_info = 'PSI avg ({}) > 
sigkill_psi_threshold ({})\n' \ + 'PSI avg exceeded psi_excess_duration (value' \ + ' = {} sec) for {} seconds'.format( + psi_avg_value, + sigkill_psi_threshold, + psi_excess_duration, + round(psi_kill_exceeded_timer, 1) + ) implement_corrective_action(SIGKILL) psi_t0 = time() @@ -2921,20 +2959,31 @@ while True: if CHECK_PSI: if psi_avg_value >= sigterm_psi_threshold: sigterm_psi_exceeded = True + psi_term_exceeded_timer += delta0 else: sigterm_psi_exceeded = False + psi_term_exceeded_timer = 0 if psi_debug: - log('sigterm_psi_exceeded: {}, sigkill_psi_exceeded: {}, ps' - 'i_post_action_delay_exceeded: {}'.format( + + log('sigterm_psi_exceeded: {}\n' + 'psi_term_exceeded_timer: {}\n'.format( sigterm_psi_exceeded, - sigkill_psi_exceeded, - psi_post_action_delay_exceeded)) + round(psi_term_exceeded_timer, 1) + ) + ) - if sigterm_psi_exceeded and psi_post_action_delay_exceeded: + if (psi_term_exceeded_timer >= psi_excess_duration and + psi_post_action_delay_exceeded): - mem_info = 'PSI avg value ({}) > sigterm_psi_thre' \ - 'shold ({})'.format(psi_avg_value, sigterm_psi_threshold) + mem_info = 'PSI avg ({}) > sigterm_psi_threshold ({})\n' \ + 'PSI avg exceeded psi_excess_duration (value' \ + ' = {} sec) for {} seconds'.format( + psi_avg_value, + sigterm_psi_threshold, + psi_excess_duration, + round(psi_term_exceeded_timer, 1) + ) implement_corrective_action(SIGTERM) psi_t0 = time() @@ -2955,8 +3004,4 @@ while True: send_notify_warn() warn_timer = 0 - - # x = time() - Q - # print(x * 1000) - sleep_after_check_mem() diff --git a/nohang.conf b/nohang.conf index b436dd4..c704dc5 100644 --- a/nohang.conf +++ b/nohang.conf @@ -34,6 +34,8 @@ ignore_zram = False +############################################################################### + 1. Thresholds below which a signal should be sent to the victim Sets the available memory levels at or below which SIGTERM or SIGKILL @@ -72,7 +74,7 @@ zram_max_sigkill = 60 % Disabled by default (ignore_psi = True). 
-ignore_psi = True +ignore_psi = False Choose a path to PSI file. By default it monitors system-wide file: /proc/pressure/memory @@ -101,10 +103,14 @@ psi_path = /proc/pressure/memory psi_metrics = some_avg10 -sigterm_psi_threshold = 80 +sigterm_psi_threshold = 60 sigkill_psi_threshold = 90 -psi_post_action_delay = 60 + Seconds PSI must stay at or above a threshold before acting; float >= 0. +psi_excess_duration = 10 + +psi_post_action_delay = 30 + ############################################################################### @@ -145,7 +151,7 @@ over_sleep = 0.05 Valid values are integers from the range [0; 1000]. -min_badness = 20 +min_badness = 900 Valid values are non-negative floating-point numbers. Min delay if a victim doesn't respond to SIGTERM in 10 ms. @@ -301,9 +307,9 @@ print_config = False Print memory check results. Valid values are True and False. -print_mem_check_results = False +print_mem_check_results = True -min_mem_report_interval = 300 +min_mem_report_interval = 0 Print sleep periods between memory checks. Valid values are True and False. @@ -325,7 +331,7 @@ print_proc_table = False extra_table_info = cgroup_v1 -print_victim_info = False +print_victim_info = True max_ancestry_depth = 10