added psi_excess_duration
This commit is contained in:
parent
11c4a2a347
commit
8701860297
79
nohang
79
nohang
@ -2455,6 +2455,20 @@ else:
|
|||||||
exit(1)
|
exit(1)
|
||||||
|
|
||||||
|
|
||||||
|
if 'psi_excess_duration' in config_dict:
|
||||||
|
psi_excess_duration = string_to_float_convert_test(
|
||||||
|
config_dict['psi_excess_duration'])
|
||||||
|
if psi_excess_duration is None:
|
||||||
|
errprint('Invalid psi_excess_duration value, not float\nExit')
|
||||||
|
exit(1)
|
||||||
|
if psi_excess_duration < 0:
|
||||||
|
errprint('psi_excess_duration must be non-negative number\nExit')
|
||||||
|
exit(1)
|
||||||
|
else:
|
||||||
|
errprint('psi_excess_duration is not in config\nExit')
|
||||||
|
exit(1)
|
||||||
|
|
||||||
|
|
||||||
if 'max_sleep' in config_dict:
|
if 'max_sleep' in config_dict:
|
||||||
max_sleep = string_to_float_convert_test(
|
max_sleep = string_to_float_convert_test(
|
||||||
config_dict['max_sleep'])
|
config_dict['max_sleep'])
|
||||||
@ -2709,12 +2723,17 @@ log('Monitoring has started!')
|
|||||||
stdout.flush()
|
stdout.flush()
|
||||||
|
|
||||||
|
|
||||||
|
psi_kill_exceeded_timer = psi_term_exceeded_timer = delta0 = 0
|
||||||
|
|
||||||
|
x0 = time()
|
||||||
|
|
||||||
##########################################################################
|
##########################################################################
|
||||||
|
|
||||||
|
|
||||||
while True:
|
while True:
|
||||||
|
|
||||||
# Q = time()
|
delta0 = time() - x0
|
||||||
|
x0 = time()
|
||||||
|
|
||||||
# FIND VALUES: mem, swap, zram, psi
|
# FIND VALUES: mem, swap, zram, psi
|
||||||
|
|
||||||
@ -2751,7 +2770,7 @@ while True:
|
|||||||
psi_post_action_delay_exceeded = False
|
psi_post_action_delay_exceeded = False
|
||||||
|
|
||||||
if print_mem_check_results:
|
if print_mem_check_results:
|
||||||
psi_avg_string = 'PSI avg value: {} | '.format(
|
psi_avg_string = 'PSI avg: {} | '.format(
|
||||||
str(psi_avg_value).rjust(6))
|
str(psi_avg_value).rjust(6))
|
||||||
|
|
||||||
if print_mem_check_results:
|
if print_mem_check_results:
|
||||||
@ -2863,16 +2882,35 @@ while True:
|
|||||||
continue
|
continue
|
||||||
|
|
||||||
if CHECK_PSI:
|
if CHECK_PSI:
|
||||||
|
|
||||||
if psi_avg_value >= sigkill_psi_threshold:
|
if psi_avg_value >= sigkill_psi_threshold:
|
||||||
sigkill_psi_exceeded = True
|
sigkill_psi_exceeded = True
|
||||||
|
psi_kill_exceeded_timer += delta0
|
||||||
else:
|
else:
|
||||||
sigkill_psi_exceeded = False
|
sigkill_psi_exceeded = False
|
||||||
|
psi_kill_exceeded_timer = 0
|
||||||
|
|
||||||
if sigkill_psi_exceeded and psi_post_action_delay_exceeded:
|
if psi_debug:
|
||||||
|
|
||||||
mem_info = 'PSI avg value ({}) > sigkill_psi_thresh' \
|
log('psi_post_action_delay_exceeded: {}\nsigkill_psi_exceeded'
|
||||||
'old ({})'.format(
|
': {}\npsi_kill_exceeded_timer: {}'.format(
|
||||||
psi_avg_value, sigkill_psi_threshold)
|
psi_post_action_delay_exceeded,
|
||||||
|
sigkill_psi_exceeded,
|
||||||
|
round(psi_kill_exceeded_timer, 1)
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
if (psi_kill_exceeded_timer >= psi_excess_duration and
|
||||||
|
psi_post_action_delay_exceeded):
|
||||||
|
|
||||||
|
mem_info = 'PSI avg ({}) > sigkill_psi_threshold ({})\n' \
|
||||||
|
'PSI avg exceeded psi_excess_duration (value' \
|
||||||
|
' = {} sec) for {} seconds'.format(
|
||||||
|
psi_avg_value,
|
||||||
|
sigkill_psi_threshold,
|
||||||
|
psi_excess_duration,
|
||||||
|
round(psi_kill_exceeded_timer, 1)
|
||||||
|
)
|
||||||
|
|
||||||
implement_corrective_action(SIGKILL)
|
implement_corrective_action(SIGKILL)
|
||||||
psi_t0 = time()
|
psi_t0 = time()
|
||||||
@ -2921,20 +2959,31 @@ while True:
|
|||||||
if CHECK_PSI:
|
if CHECK_PSI:
|
||||||
if psi_avg_value >= sigterm_psi_threshold:
|
if psi_avg_value >= sigterm_psi_threshold:
|
||||||
sigterm_psi_exceeded = True
|
sigterm_psi_exceeded = True
|
||||||
|
psi_term_exceeded_timer += delta0
|
||||||
else:
|
else:
|
||||||
sigterm_psi_exceeded = False
|
sigterm_psi_exceeded = False
|
||||||
|
psi_term_exceeded_timer = 0
|
||||||
|
|
||||||
if psi_debug:
|
if psi_debug:
|
||||||
log('sigterm_psi_exceeded: {}, sigkill_psi_exceeded: {}, ps'
|
|
||||||
'i_post_action_delay_exceeded: {}'.format(
|
log('sigterm_psi_exceeded: {}\n'
|
||||||
|
'psi_term_exceeded_timer: {}\n'.format(
|
||||||
sigterm_psi_exceeded,
|
sigterm_psi_exceeded,
|
||||||
sigkill_psi_exceeded,
|
round(psi_term_exceeded_timer, 1)
|
||||||
psi_post_action_delay_exceeded))
|
)
|
||||||
|
)
|
||||||
|
|
||||||
if sigterm_psi_exceeded and psi_post_action_delay_exceeded:
|
if (psi_term_exceeded_timer >= psi_excess_duration and
|
||||||
|
psi_post_action_delay_exceeded):
|
||||||
|
|
||||||
mem_info = 'PSI avg value ({}) > sigterm_psi_thre' \
|
mem_info = 'PSI avg ({}) > sigterm_psi_threshold ({})\n' \
|
||||||
'shold ({})'.format(psi_avg_value, sigterm_psi_threshold)
|
'PSI avg exceeded psi_excess_duration (value' \
|
||||||
|
' = {} sec) for {} seconds'.format(
|
||||||
|
psi_avg_value,
|
||||||
|
sigterm_psi_threshold,
|
||||||
|
psi_excess_duration,
|
||||||
|
round(psi_term_exceeded_timer, 1)
|
||||||
|
)
|
||||||
|
|
||||||
implement_corrective_action(SIGTERM)
|
implement_corrective_action(SIGTERM)
|
||||||
psi_t0 = time()
|
psi_t0 = time()
|
||||||
@ -2955,8 +3004,4 @@ while True:
|
|||||||
send_notify_warn()
|
send_notify_warn()
|
||||||
warn_timer = 0
|
warn_timer = 0
|
||||||
|
|
||||||
|
|
||||||
# x = time() - Q
|
|
||||||
# print(x * 1000)
|
|
||||||
|
|
||||||
sleep_after_check_mem()
|
sleep_after_check_mem()
|
||||||
|
20
nohang.conf
20
nohang.conf
@ -34,6 +34,8 @@
|
|||||||
|
|
||||||
ignore_zram = False
|
ignore_zram = False
|
||||||
|
|
||||||
|
###############################################################################
|
||||||
|
|
||||||
1. Thresholds below which a signal should be sent to the victim
|
1. Thresholds below which a signal should be sent to the victim
|
||||||
|
|
||||||
Sets the available memory levels at or below which SIGTERM or SIGKILL
|
Sets the available memory levels at or below which SIGTERM or SIGKILL
|
||||||
@ -72,7 +74,7 @@ zram_max_sigkill = 60 %
|
|||||||
|
|
||||||
Disabled by default (ignore_psi = True).
|
Set ignore_psi = True to disable PSI checking; it is enabled in this config (ignore_psi = False).
|
||||||
|
|
||||||
ignore_psi = True
|
ignore_psi = False
|
||||||
|
|
||||||
Choose a path to PSI file.
|
Choose a path to PSI file.
|
||||||
By default it monitors system-wide file: /proc/pressure/memory
|
By default it monitors system-wide file: /proc/pressure/memory
|
||||||
@ -101,10 +103,14 @@ psi_path = /proc/pressure/memory
|
|||||||
|
|
||||||
psi_metrics = some_avg10
|
psi_metrics = some_avg10
|
||||||
|
|
||||||
sigterm_psi_threshold = 80
|
sigterm_psi_threshold = 60
|
||||||
sigkill_psi_threshold = 90
|
sigkill_psi_threshold = 90
|
||||||
|
|
||||||
psi_post_action_delay = 60
|
Valid values are non-negative floating-point numbers (seconds).
|
||||||
|
psi_excess_duration = 10
|
||||||
|
|
||||||
|
psi_post_action_delay = 30
|
||||||
|
|
||||||
|
|
||||||
###############################################################################
|
###############################################################################
|
||||||
|
|
||||||
@ -145,7 +151,7 @@ over_sleep = 0.05
|
|||||||
|
|
||||||
Valid values are integers from the range [0; 1000].
|
Valid values are integers from the range [0; 1000].
|
||||||
|
|
||||||
min_badness = 20
|
min_badness = 900
|
||||||
|
|
||||||
Valid values are non-negative floating-point numbers.
|
Valid values are non-negative floating-point numbers.
|
||||||
Min delay if a victim doesn't respond to SIGTERM in 10 ms.
|
Min delay if a victim doesn't respond to SIGTERM in 10 ms.
|
||||||
@ -301,9 +307,9 @@ print_config = False
|
|||||||
Print memory check results.
|
Print memory check results.
|
||||||
Valid values are True and False.
|
Valid values are True and False.
|
||||||
|
|
||||||
print_mem_check_results = False
|
print_mem_check_results = True
|
||||||
|
|
||||||
min_mem_report_interval = 300
|
min_mem_report_interval = 0
|
||||||
|
|
||||||
Print sleep periods between memory checks.
|
Print sleep periods between memory checks.
|
||||||
Valid values are True and False.
|
Valid values are True and False.
|
||||||
@ -325,7 +331,7 @@ print_proc_table = False
|
|||||||
|
|
||||||
extra_table_info = cgroup_v1
|
extra_table_info = cgroup_v1
|
||||||
|
|
||||||
print_victim_info = False
|
print_victim_info = True
|
||||||
|
|
||||||
max_ancestry_depth = 10
|
max_ancestry_depth = 10
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user