added psi_excess_duration

This commit is contained in:
Alexey Avramov 2019-06-03 22:57:14 +09:00
parent 11c4a2a347
commit 8701860297
2 changed files with 75 additions and 24 deletions

79
nohang
View File

@ -2455,6 +2455,20 @@ else:
exit(1) exit(1)
if 'psi_excess_duration' in config_dict:
psi_excess_duration = string_to_float_convert_test(
config_dict['psi_excess_duration'])
if psi_excess_duration is None:
errprint('Invalid psi_excess_duration value, not float\nExit')
exit(1)
if psi_excess_duration < 0:
errprint('psi_excess_duration must be non-negative number\nExit')
exit(1)
else:
errprint('psi_excess_duration is not in config\nExit')
exit(1)
if 'max_sleep' in config_dict: if 'max_sleep' in config_dict:
max_sleep = string_to_float_convert_test( max_sleep = string_to_float_convert_test(
config_dict['max_sleep']) config_dict['max_sleep'])
@ -2709,12 +2723,17 @@ log('Monitoring has started!')
stdout.flush() stdout.flush()
psi_kill_exceeded_timer = psi_term_exceeded_timer = delta0 = 0
x0 = time()
########################################################################## ##########################################################################
while True: while True:
# Q = time() delta0 = time() - x0
x0 = time()
# FIND VALUES: mem, swap, zram, psi # FIND VALUES: mem, swap, zram, psi
@ -2751,7 +2770,7 @@ while True:
psi_post_action_delay_exceeded = False psi_post_action_delay_exceeded = False
if print_mem_check_results: if print_mem_check_results:
psi_avg_string = 'PSI avg value: {} | '.format( psi_avg_string = 'PSI avg: {} | '.format(
str(psi_avg_value).rjust(6)) str(psi_avg_value).rjust(6))
if print_mem_check_results: if print_mem_check_results:
@ -2863,16 +2882,35 @@ while True:
continue continue
if CHECK_PSI: if CHECK_PSI:
if psi_avg_value >= sigkill_psi_threshold: if psi_avg_value >= sigkill_psi_threshold:
sigkill_psi_exceeded = True sigkill_psi_exceeded = True
psi_kill_exceeded_timer += delta0
else: else:
sigkill_psi_exceeded = False sigkill_psi_exceeded = False
psi_kill_exceeded_timer = 0
if sigkill_psi_exceeded and psi_post_action_delay_exceeded: if psi_debug:
mem_info = 'PSI avg value ({}) > sigkill_psi_thresh' \ log('psi_post_action_delay_exceeded: {}\nsigkill_psi_exceeded'
'old ({})'.format( ': {}\npsi_kill_exceeded_timer: {}'.format(
psi_avg_value, sigkill_psi_threshold) psi_post_action_delay_exceeded,
sigkill_psi_exceeded,
round(psi_kill_exceeded_timer, 1)
)
)
if (psi_kill_exceeded_timer >= psi_excess_duration and
psi_post_action_delay_exceeded):
mem_info = 'PSI avg ({}) > sigkill_psi_threshold ({})\n' \
'PSI avg exceeded psi_excess_duration (value' \
' = {} sec) for {} seconds'.format(
psi_avg_value,
sigkill_psi_threshold,
psi_excess_duration,
round(psi_kill_exceeded_timer, 1)
)
implement_corrective_action(SIGKILL) implement_corrective_action(SIGKILL)
psi_t0 = time() psi_t0 = time()
@ -2921,20 +2959,31 @@ while True:
if CHECK_PSI: if CHECK_PSI:
if psi_avg_value >= sigterm_psi_threshold: if psi_avg_value >= sigterm_psi_threshold:
sigterm_psi_exceeded = True sigterm_psi_exceeded = True
psi_term_exceeded_timer += delta0
else: else:
sigterm_psi_exceeded = False sigterm_psi_exceeded = False
psi_term_exceeded_timer = 0
if psi_debug: if psi_debug:
log('sigterm_psi_exceeded: {}, sigkill_psi_exceeded: {}, ps'
'i_post_action_delay_exceeded: {}'.format( log('sigterm_psi_exceeded: {}\n'
'psi_term_exceeded_timer: {}\n'.format(
sigterm_psi_exceeded, sigterm_psi_exceeded,
sigkill_psi_exceeded, round(psi_term_exceeded_timer, 1)
psi_post_action_delay_exceeded)) )
)
if sigterm_psi_exceeded and psi_post_action_delay_exceeded: if (psi_term_exceeded_timer >= psi_excess_duration and
psi_post_action_delay_exceeded):
mem_info = 'PSI avg value ({}) > sigterm_psi_thre' \ mem_info = 'PSI avg ({}) > sigterm_psi_threshold ({})\n' \
'shold ({})'.format(psi_avg_value, sigterm_psi_threshold) 'PSI avg exceeded psi_excess_duration (value' \
' = {} sec) for {} seconds'.format(
psi_avg_value,
sigterm_psi_threshold,
psi_excess_duration,
round(psi_term_exceeded_timer, 1)
)
implement_corrective_action(SIGTERM) implement_corrective_action(SIGTERM)
psi_t0 = time() psi_t0 = time()
@ -2955,8 +3004,4 @@ while True:
send_notify_warn() send_notify_warn()
warn_timer = 0 warn_timer = 0
# x = time() - Q
# print(x * 1000)
sleep_after_check_mem() sleep_after_check_mem()

View File

@ -34,6 +34,8 @@
ignore_zram = False ignore_zram = False
###############################################################################
1. Thresholds below which a signal should be sent to the victim 1. Thresholds below which a signal should be sent to the victim
Sets the available memory levels at or below which SIGTERM or SIGKILL Sets the available memory levels at or below which SIGTERM or SIGKILL
@ -72,7 +74,7 @@ zram_max_sigkill = 60 %
Disabled by default (ignore_psi = True). Disabled by default (ignore_psi = True).
ignore_psi = True ignore_psi = False
Choose a path to PSI file. Choose a path to PSI file.
By default it monitors system-wide file: /proc/pressure/memory By default it monitors system-wide file: /proc/pressure/memory
@ -101,10 +103,14 @@ psi_path = /proc/pressure/memory
psi_metrics = some_avg10 psi_metrics = some_avg10
sigterm_psi_threshold = 80 sigterm_psi_threshold = 60
sigkill_psi_threshold = 90 sigkill_psi_threshold = 90
psi_post_action_delay = 60 Valid values are non-negative floating-point numbers (>= 0).
psi_excess_duration = 10
psi_post_action_delay = 30
############################################################################### ###############################################################################
@ -145,7 +151,7 @@ over_sleep = 0.05
Valid values are integers from the range [0; 1000]. Valid values are integers from the range [0; 1000].
min_badness = 20 min_badness = 900
Valid values are non-negative floating-point numbers. Valid values are non-negative floating-point numbers.
Min delay if a victim doesn't respond to SIGTERM in 10 ms. Min delay if a victim doesn't respond to SIGTERM in 10 ms.
@ -301,9 +307,9 @@ print_config = False
Print memory check results. Print memory check results.
Valid values are True and False. Valid values are True and False.
print_mem_check_results = False print_mem_check_results = True
min_mem_report_interval = 300 min_mem_report_interval = 0
Print sleep periods between memory checks. Print sleep periods between memory checks.
Valid values are True and False. Valid values are True and False.
@ -325,7 +331,7 @@ print_proc_table = False
extra_table_info = cgroup_v1 extra_table_info = cgroup_v1
print_victim_info = False print_victim_info = True
max_ancestry_depth = 10 max_ancestry_depth = 10