added psi_excess_duration
This commit is contained in:
parent
11c4a2a347
commit
8701860297
79
nohang
79
nohang
@ -2455,6 +2455,20 @@ else:
|
||||
exit(1)
|
||||
|
||||
|
||||
if 'psi_excess_duration' in config_dict:
|
||||
psi_excess_duration = string_to_float_convert_test(
|
||||
config_dict['psi_excess_duration'])
|
||||
if psi_excess_duration is None:
|
||||
errprint('Invalid psi_excess_duration value, not float\nExit')
|
||||
exit(1)
|
||||
if psi_excess_duration < 0:
|
||||
errprint('psi_excess_duration must be non-negative number\nExit')
|
||||
exit(1)
|
||||
else:
|
||||
errprint('psi_excess_duration is not in config\nExit')
|
||||
exit(1)
|
||||
|
||||
|
||||
if 'max_sleep' in config_dict:
|
||||
max_sleep = string_to_float_convert_test(
|
||||
config_dict['max_sleep'])
|
||||
@ -2709,12 +2723,17 @@ log('Monitoring has started!')
|
||||
stdout.flush()
|
||||
|
||||
|
||||
psi_kill_exceeded_timer = psi_term_exceeded_timer = delta0 = 0
|
||||
|
||||
x0 = time()
|
||||
|
||||
##########################################################################
|
||||
|
||||
|
||||
while True:
|
||||
|
||||
# Q = time()
|
||||
delta0 = time() - x0
|
||||
x0 = time()
|
||||
|
||||
# FIND VALUES: mem, swap, zram, psi
|
||||
|
||||
@ -2751,7 +2770,7 @@ while True:
|
||||
psi_post_action_delay_exceeded = False
|
||||
|
||||
if print_mem_check_results:
|
||||
psi_avg_string = 'PSI avg value: {} | '.format(
|
||||
psi_avg_string = 'PSI avg: {} | '.format(
|
||||
str(psi_avg_value).rjust(6))
|
||||
|
||||
if print_mem_check_results:
|
||||
@ -2863,16 +2882,35 @@ while True:
|
||||
continue
|
||||
|
||||
if CHECK_PSI:
|
||||
|
||||
if psi_avg_value >= sigkill_psi_threshold:
|
||||
sigkill_psi_exceeded = True
|
||||
psi_kill_exceeded_timer += delta0
|
||||
else:
|
||||
sigkill_psi_exceeded = False
|
||||
psi_kill_exceeded_timer = 0
|
||||
|
||||
if sigkill_psi_exceeded and psi_post_action_delay_exceeded:
|
||||
if psi_debug:
|
||||
|
||||
mem_info = 'PSI avg value ({}) > sigkill_psi_thresh' \
|
||||
'old ({})'.format(
|
||||
psi_avg_value, sigkill_psi_threshold)
|
||||
log('psi_post_action_delay_exceeded: {}\nsigkill_psi_exceeded'
|
||||
': {}\npsi_kill_exceeded_timer: {}'.format(
|
||||
psi_post_action_delay_exceeded,
|
||||
sigkill_psi_exceeded,
|
||||
round(psi_kill_exceeded_timer, 1)
|
||||
)
|
||||
)
|
||||
|
||||
if (psi_kill_exceeded_timer >= psi_excess_duration and
|
||||
psi_post_action_delay_exceeded):
|
||||
|
||||
mem_info = 'PSI avg ({}) > sigkill_psi_threshold ({})\n' \
|
||||
'PSI avg exceeded psi_excess_duration (value' \
|
||||
' = {} sec) for {} seconds'.format(
|
||||
psi_avg_value,
|
||||
sigkill_psi_threshold,
|
||||
psi_excess_duration,
|
||||
round(psi_kill_exceeded_timer, 1)
|
||||
)
|
||||
|
||||
implement_corrective_action(SIGKILL)
|
||||
psi_t0 = time()
|
||||
@ -2921,20 +2959,31 @@ while True:
|
||||
if CHECK_PSI:
|
||||
if psi_avg_value >= sigterm_psi_threshold:
|
||||
sigterm_psi_exceeded = True
|
||||
psi_term_exceeded_timer += delta0
|
||||
else:
|
||||
sigterm_psi_exceeded = False
|
||||
psi_term_exceeded_timer = 0
|
||||
|
||||
if psi_debug:
|
||||
log('sigterm_psi_exceeded: {}, sigkill_psi_exceeded: {}, ps'
|
||||
'i_post_action_delay_exceeded: {}'.format(
|
||||
|
||||
log('sigterm_psi_exceeded: {}\n'
|
||||
'psi_term_exceeded_timer: {}\n'.format(
|
||||
sigterm_psi_exceeded,
|
||||
sigkill_psi_exceeded,
|
||||
psi_post_action_delay_exceeded))
|
||||
round(psi_term_exceeded_timer, 1)
|
||||
)
|
||||
)
|
||||
|
||||
if sigterm_psi_exceeded and psi_post_action_delay_exceeded:
|
||||
if (psi_term_exceeded_timer >= psi_excess_duration and
|
||||
psi_post_action_delay_exceeded):
|
||||
|
||||
mem_info = 'PSI avg value ({}) > sigterm_psi_thre' \
|
||||
'shold ({})'.format(psi_avg_value, sigterm_psi_threshold)
|
||||
mem_info = 'PSI avg ({}) > sigterm_psi_threshold ({})\n' \
|
||||
'PSI avg exceeded psi_excess_duration (value' \
|
||||
' = {} sec) for {} seconds'.format(
|
||||
psi_avg_value,
|
||||
sigterm_psi_threshold,
|
||||
psi_excess_duration,
|
||||
round(psi_term_exceeded_timer, 1)
|
||||
)
|
||||
|
||||
implement_corrective_action(SIGTERM)
|
||||
psi_t0 = time()
|
||||
@ -2955,8 +3004,4 @@ while True:
|
||||
send_notify_warn()
|
||||
warn_timer = 0
|
||||
|
||||
|
||||
# x = time() - Q
|
||||
# print(x * 1000)
|
||||
|
||||
sleep_after_check_mem()
|
||||
|
20
nohang.conf
20
nohang.conf
@ -34,6 +34,8 @@
|
||||
|
||||
ignore_zram = False
|
||||
|
||||
###############################################################################
|
||||
|
||||
1. Thresholds below which a signal should be sent to the victim
|
||||
|
||||
Sets the available memory levels at or below which SIGTERM or SIGKILL
|
||||
@ -72,7 +74,7 @@ zram_max_sigkill = 60 %
|
||||
|
||||
Disabled by default (ignore_psi = True).
|
||||
|
||||
ignore_psi = True
|
||||
ignore_psi = False
|
||||
|
||||
Choose the path to the PSI file.
|
||||
By default, the system-wide file is monitored: /proc/pressure/memory
|
||||
@ -101,10 +103,14 @@ psi_path = /proc/pressure/memory
|
||||
|
||||
psi_metrics = some_avg10
|
||||
|
||||
sigterm_psi_threshold = 80
|
||||
sigterm_psi_threshold = 60
|
||||
sigkill_psi_threshold = 90
|
||||
|
||||
psi_post_action_delay = 60
|
||||
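Minimum time, in seconds, that the PSI value must stay at or above a threshold before the corresponding signal is sent. Valid values are non-negative floating-point numbers.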
|
||||
psi_excess_duration = 10
|
||||
|
||||
psi_post_action_delay = 30
|
||||
|
||||
|
||||
###############################################################################
|
||||
|
||||
@ -145,7 +151,7 @@ over_sleep = 0.05
|
||||
|
||||
Valid values are integers from the range [0; 1000].
|
||||
|
||||
min_badness = 20
|
||||
min_badness = 900
|
||||
|
||||
Valid values are non-negative floating-point numbers.
|
||||
Min delay if a victim doesn't respond to SIGTERM in 10 ms.
|
||||
@ -301,9 +307,9 @@ print_config = False
|
||||
Print memory check results.
|
||||
Valid values are True and False.
|
||||
|
||||
print_mem_check_results = False
|
||||
print_mem_check_results = True
|
||||
|
||||
min_mem_report_interval = 300
|
||||
min_mem_report_interval = 0
|
||||
|
||||
Print sleep periods between memory checks.
|
||||
Valid values are True and False.
|
||||
@ -325,7 +331,7 @@ print_proc_table = False
|
||||
|
||||
extra_table_info = cgroup_v1
|
||||
|
||||
print_victim_info = False
|
||||
print_victim_info = True
|
||||
|
||||
max_ancestry_depth = 10
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user