added psi_excess_duration

This commit is contained in:
Alexey Avramov 2019-06-03 22:57:14 +09:00
parent 11c4a2a347
commit 8701860297
2 changed files with 75 additions and 24 deletions

79
nohang
View File

@ -2455,6 +2455,20 @@ else:
exit(1)
# Load the required psi_excess_duration setting from config_dict.
# The value must convert to a float (a None result from the converter is
# treated as a failed conversion) and must be >= 0. Any failure, including
# a missing key, prints an error message and exits with status 1.
if 'psi_excess_duration' in config_dict:
psi_excess_duration = string_to_float_convert_test(
config_dict['psi_excess_duration'])
if psi_excess_duration is None:
errprint('Invalid psi_excess_duration value, not float\nExit')
exit(1)
if psi_excess_duration < 0:
errprint('psi_excess_duration must be non-negative number\nExit')
exit(1)
else:
errprint('psi_excess_duration is not in config\nExit')
exit(1)
if 'max_sleep' in config_dict:
max_sleep = string_to_float_convert_test(
config_dict['max_sleep'])
@ -2709,12 +2723,17 @@ log('Monitoring has started!')
stdout.flush()
psi_kill_exceeded_timer = psi_term_exceeded_timer = delta0 = 0
x0 = time()
##########################################################################
while True:
# Q = time()
delta0 = time() - x0
x0 = time()
# FIND VALUES: mem, swap, zram, psi
@ -2751,7 +2770,7 @@ while True:
psi_post_action_delay_exceeded = False
if print_mem_check_results:
psi_avg_string = 'PSI avg value: {} | '.format(
psi_avg_string = 'PSI avg: {} | '.format(
str(psi_avg_value).rjust(6))
if print_mem_check_results:
@ -2863,16 +2882,35 @@ while True:
continue
if CHECK_PSI:
if psi_avg_value >= sigkill_psi_threshold:
sigkill_psi_exceeded = True
psi_kill_exceeded_timer += delta0
else:
sigkill_psi_exceeded = False
psi_kill_exceeded_timer = 0
if sigkill_psi_exceeded and psi_post_action_delay_exceeded:
if psi_debug:
mem_info = 'PSI avg value ({}) > sigkill_psi_thresh' \
'old ({})'.format(
psi_avg_value, sigkill_psi_threshold)
log('psi_post_action_delay_exceeded: {}\nsigkill_psi_exceeded'
': {}\npsi_kill_exceeded_timer: {}'.format(
psi_post_action_delay_exceeded,
sigkill_psi_exceeded,
round(psi_kill_exceeded_timer, 1)
)
)
if (psi_kill_exceeded_timer >= psi_excess_duration and
psi_post_action_delay_exceeded):
mem_info = 'PSI avg ({}) > sigkill_psi_threshold ({})\n' \
'PSI avg exceeded psi_excess_duration (value' \
' = {} sec) for {} seconds'.format(
psi_avg_value,
sigkill_psi_threshold,
psi_excess_duration,
round(psi_kill_exceeded_timer, 1)
)
implement_corrective_action(SIGKILL)
psi_t0 = time()
@ -2921,20 +2959,31 @@ while True:
if CHECK_PSI:
if psi_avg_value >= sigterm_psi_threshold:
sigterm_psi_exceeded = True
psi_term_exceeded_timer += delta0
else:
sigterm_psi_exceeded = False
psi_term_exceeded_timer = 0
if psi_debug:
log('sigterm_psi_exceeded: {}, sigkill_psi_exceeded: {}, ps'
'i_post_action_delay_exceeded: {}'.format(
log('sigterm_psi_exceeded: {}\n'
'psi_term_exceeded_timer: {}\n'.format(
sigterm_psi_exceeded,
sigkill_psi_exceeded,
psi_post_action_delay_exceeded))
round(psi_term_exceeded_timer, 1)
)
)
if sigterm_psi_exceeded and psi_post_action_delay_exceeded:
if (psi_term_exceeded_timer >= psi_excess_duration and
psi_post_action_delay_exceeded):
mem_info = 'PSI avg value ({}) > sigterm_psi_thre' \
'shold ({})'.format(psi_avg_value, sigterm_psi_threshold)
mem_info = 'PSI avg ({}) > sigterm_psi_threshold ({})\n' \
'PSI avg exceeded psi_excess_duration (value' \
' = {} sec) for {} seconds'.format(
psi_avg_value,
sigterm_psi_threshold,
psi_excess_duration,
round(psi_term_exceeded_timer, 1)
)
implement_corrective_action(SIGTERM)
psi_t0 = time()
@ -2955,8 +3004,4 @@ while True:
send_notify_warn()
warn_timer = 0
# x = time() - Q
# print(x * 1000)
sleep_after_check_mem()

View File

@ -34,6 +34,8 @@
ignore_zram = False
###############################################################################
1. Thresholds below which a signal should be sent to the victim
Sets the available memory levels at or below which SIGTERM or SIGKILL
@ -72,7 +74,7 @@ zram_max_sigkill = 60 %
Disabled by default (ignore_psi = True).
ignore_psi = True
ignore_psi = False
Choose a path to PSI file.
By default it monitors system-wide file: /proc/pressure/memory
@ -101,10 +103,14 @@ psi_path = /proc/pressure/memory
psi_metrics = some_avg10
sigterm_psi_threshold = 80
sigterm_psi_threshold = 60
sigkill_psi_threshold = 90
psi_post_action_delay = 60
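Time in seconds for which the PSI value must stay at or above a PSI threshold
before the corresponding corrective action may be taken.
Valid values are non-negative floating-point numbers.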
psi_excess_duration = 10
psi_post_action_delay = 30
###############################################################################
@ -145,7 +151,7 @@ over_sleep = 0.05
Valid values are integers from the range [0; 1000].
min_badness = 20
min_badness = 900
Valid values are non-negative floating-point numbers.
Min delay if a victim doesn't respond to SIGTERM in 10 ms.
@ -301,9 +307,9 @@ print_config = False
Print memory check results.
Valid values are True and False.
print_mem_check_results = False
print_mem_check_results = True
min_mem_report_interval = 300
min_mem_report_interval = 0
Print sleep periods between memory checks.
Valid values are True and False.
@ -325,7 +331,7 @@ print_proc_table = False
extra_table_info = cgroup_v1
print_victim_info = False
print_victim_info = True
max_ancestry_depth = 10