check victim lifetime after killing
This commit is contained in:
parent
488592d9ad
commit
57417b0370
210
nohang
210
nohang
@ -41,7 +41,6 @@ sig_dict = {
|
||||
SIGTERM: 'SIGTERM'
|
||||
}
|
||||
|
||||
|
||||
self_pid = str(os.getpid())
|
||||
|
||||
self_uid = os.geteuid()
|
||||
@ -107,7 +106,7 @@ cgroup_v1_index, cgroup_v2_index = find_cgroup_indexes()
|
||||
|
||||
# define functions
|
||||
|
||||
|
||||
'''
|
||||
def self_rss():
|
||||
"""
|
||||
"""
|
||||
@ -118,14 +117,28 @@ def print_self_rss():
|
||||
"""
|
||||
"""
|
||||
log('Self RSS: {} MiB'.format(self_rss()))
|
||||
'''
|
||||
|
||||
|
||||
def pid_to_rss(pid):
    """Return the second field of /proc/<pid>/statm as a string.

    The field is returned exactly as read (no numeric conversion).
    The sentinel string '-0' is returned when the line has no second
    field or when reading fails with FileNotFoundError or
    ProcessLookupError; the caller compares against '-0' to detect
    that the process is gone.

    NOTE(review): rline1() is defined elsewhere; presumably it returns
    the first line of the given file -- confirm.
    """
    statm_path = '/proc/{}/statm'.format(pid)
    try:
        fields = rline1(statm_path).split(' ')
        return fields[1]
    except (IndexError, FileNotFoundError, ProcessLookupError):
        return '-0'
|
||||
|
||||
|
||||
def signal_handler(signum, frame):
|
||||
"""
|
||||
"""
|
||||
for sig_num in sig_list:
|
||||
signal(sig_num, signal_handler_inner)
|
||||
log('Got the {} signal '.format(sig_dict[signum]))
|
||||
for i in sig_list:
|
||||
signal(i, signal_handler_inner)
|
||||
log('Signal handler called with the {} signal '.format(
|
||||
sig_dict[signum]))
|
||||
update_stat_dict_and_print(None)
|
||||
log('Exit')
|
||||
exit()
|
||||
@ -134,7 +147,21 @@ def signal_handler(signum, frame):
|
||||
def signal_handler_inner(signum, frame):
|
||||
"""
|
||||
"""
|
||||
log('Got the {} signal (ignored) '.format(sig_dict[signum]))
|
||||
log('Signal handler called with the {} signal (ignored) '.format(
|
||||
sig_dict[signum]))
|
||||
|
||||
|
||||
def exe(cmd):
    """Run a shell command with os.system() and return its exit status.

    write_self_oom_score_adj() is called with self_oom_score_adj_max
    right before the command and with self_oom_score_adj_min right
    after it. The command line, the exit status and the elapsed
    wall-clock time (rounded to milliseconds) are written to the log.
    """
    log('Execute the command: {}'.format(cmd))
    started = time()
    write_self_oom_score_adj(self_oom_score_adj_max)
    status = os.system(cmd)
    write_self_oom_score_adj(self_oom_score_adj_min)
    elapsed = round(time() - started, 3)
    log('Exit status: {}; exe duration: {} sec'.format(status, elapsed))
    return status
|
||||
|
||||
|
||||
def write(path, string):
|
||||
@ -158,19 +185,6 @@ self_oom_score_adj_max = '-6'
|
||||
write_self_oom_score_adj(self_oom_score_adj_min)
|
||||
|
||||
|
||||
def exe(cmd):
    """Run a shell command with os.system() and return its exit status.

    write_self_oom_score_adj() is called with self_oom_score_adj_max
    right before the command and with self_oom_score_adj_min right
    after it. The command line, the exit status and the elapsed
    wall-clock time (rounded to milliseconds) are written to the log.
    """
    log('Execute the command: {}'.format(cmd))
    started = time()
    write_self_oom_score_adj(self_oom_score_adj_max)
    status = os.system(cmd)
    write_self_oom_score_adj(self_oom_score_adj_min)
    elapsed = round(time() - started, 3)
    log('Exit status: {}; exe duration: {} sec'.format(status, elapsed))
    return status
|
||||
|
||||
|
||||
def valid_re(reg_exp):
|
||||
"""Validate regular expression.
|
||||
"""
|
||||
@ -1347,13 +1361,26 @@ def implement_corrective_action(signal):
|
||||
dt = time() - corrective_actions_dict[SIGTERM]
|
||||
if dt < min_delay_after_sigterm:
|
||||
# print(' soft_post_action_delay NOT EXCEEDED')
|
||||
sleep_after_check_mem(0.2)
|
||||
return 0 # время задержки между действиями не истекло
|
||||
|
||||
if print_sleep_periods:
|
||||
log('Sleep {} sec (in implement_corrective_action())'.format(
|
||||
over_sleep))
|
||||
|
||||
sleep(over_sleep)
|
||||
|
||||
return None # время задержки между действиями не истекло
|
||||
|
||||
else:
|
||||
dt = time() - corrective_actions_dict[SIGKILL]
|
||||
if dt < min_delay_after_sigkill:
|
||||
# print(' hard_post_action_delay NOT EXCEEDED')
|
||||
sleep_after_check_mem(0.2)
|
||||
|
||||
if print_sleep_periods:
|
||||
log('Sleep {} sec (in implement_corrective_action())'.format(
|
||||
over_sleep))
|
||||
|
||||
sleep(over_sleep)
|
||||
|
||||
return 0 # время задержки между действиями не истекло
|
||||
|
||||
log(mem_info)
|
||||
@ -1472,7 +1499,25 @@ def implement_corrective_action(signal):
|
||||
signal = SIGKILL
|
||||
|
||||
os.kill(int(pid), signal)
|
||||
|
||||
response_time = time() - time0
|
||||
|
||||
sleep(0.001)
|
||||
rp = os.path.exists('/proc/{}/exe'.format(pid))
|
||||
|
||||
if signal is SIGKILL or not rp:
|
||||
|
||||
t0 = time()
|
||||
while True:
|
||||
sleep(0.001)
|
||||
rss = pid_to_rss(pid)
|
||||
if rss == '-0':
|
||||
break
|
||||
t1 = time()
|
||||
kill_duration = t1 - t0
|
||||
log('The victim died in {} sec'.format(
|
||||
round(kill_duration, 3)))
|
||||
|
||||
send_result = 'total response time: {} ms'.format(
|
||||
round(response_time * 1000))
|
||||
|
||||
@ -1525,6 +1570,11 @@ def implement_corrective_action(signal):
|
||||
|
||||
try:
|
||||
log(preventing_oom_message)
|
||||
if rp:
|
||||
log('Seems like the victim handles signal')
|
||||
else:
|
||||
log('Seems like the victim is dead or zombie')
|
||||
|
||||
except UnboundLocalError:
|
||||
preventing_oom_message = key
|
||||
|
||||
@ -1545,8 +1595,6 @@ def implement_corrective_action(signal):
|
||||
key = 'victim badness < min_badness'
|
||||
update_stat_dict_and_print(key)
|
||||
|
||||
# sleep_after_send_signal(signal)
|
||||
|
||||
if signal is SIGTERM:
|
||||
corrective_actions_dict[SIGTERM] = time()
|
||||
else:
|
||||
@ -1556,36 +1604,17 @@ def implement_corrective_action(signal):
|
||||
print('##################################################################')
|
||||
|
||||
|
||||
'''
|
||||
def sleep_after_send_signal(signal):
|
||||
"""
|
||||
Sleeping after signal was sent.
|
||||
|
||||
signal: sent signal
|
||||
"""
|
||||
|
||||
#min_delay_after_sigterm = 0.01
|
||||
#min_delay_after_sigkill = 0.01
|
||||
|
||||
return 0
|
||||
|
||||
|
||||
if signal is SIGKILL:
|
||||
if print_sleep_periods:
|
||||
log('Sleep {} sec after implementing a corrective action'.format(
|
||||
min_delay_after_sigkill))
|
||||
sleep(min_delay_after_sigkill)
|
||||
else:
|
||||
if print_sleep_periods:
|
||||
log('Sleep {} sec after implementing a corrective action'.format(
|
||||
min_delay_after_sigterm))
|
||||
sleep(min_delay_after_sigterm)
|
||||
'''
|
||||
|
||||
|
||||
def sleep_after_check_mem(k=1.0):
|
||||
def sleep_after_check_mem():
|
||||
"""Specify sleep times depending on rates and available memory."""
|
||||
|
||||
if stable_sleep:
|
||||
|
||||
if print_sleep_periods:
|
||||
log('Sleep {} sec'.format(min_sleep))
|
||||
|
||||
sleep(min_sleep)
|
||||
return None
|
||||
|
||||
if mem_min_sigkill_kb < mem_min_sigterm_kb:
|
||||
mem_point = mem_available - mem_min_sigterm_kb
|
||||
else:
|
||||
@ -1616,10 +1645,10 @@ def sleep_after_check_mem(k=1.0):
|
||||
else:
|
||||
t = t_mem_zram
|
||||
|
||||
if t > max_sleep_time:
|
||||
t = max_sleep_time
|
||||
elif t < min_sleep_time:
|
||||
t = min_sleep_time
|
||||
if t > max_sleep:
|
||||
t = max_sleep
|
||||
elif t < min_sleep:
|
||||
t = min_sleep
|
||||
else:
|
||||
pass
|
||||
|
||||
@ -1639,7 +1668,7 @@ def sleep_after_check_mem(k=1.0):
|
||||
except OSError:
|
||||
pass
|
||||
|
||||
sleep(t * k)
|
||||
sleep(t)
|
||||
|
||||
|
||||
def calculate_percent(arg_key):
|
||||
@ -2355,41 +2384,68 @@ else:
|
||||
exit(1)
|
||||
|
||||
|
||||
if 'max_sleep_time' in config_dict:
|
||||
max_sleep_time = string_to_float_convert_test(
|
||||
config_dict['max_sleep_time'])
|
||||
if max_sleep_time is None:
|
||||
errprint('Invalid max_sleep_time value, not float\nExit')
|
||||
if 'max_sleep' in config_dict:
|
||||
max_sleep = string_to_float_convert_test(
|
||||
config_dict['max_sleep'])
|
||||
if max_sleep is None:
|
||||
errprint('Invalid max_sleep value, not float\nExit')
|
||||
exit(1)
|
||||
if max_sleep_time <= 0:
|
||||
errprint('max_sleep_time must be positive number\nExit')
|
||||
if max_sleep <= 0:
|
||||
errprint('max_sleep must be positive number\nExit')
|
||||
exit(1)
|
||||
else:
|
||||
errprint('max_sleep_time is not in config\nExit')
|
||||
errprint('max_sleep is not in config\nExit')
|
||||
exit(1)
|
||||
|
||||
|
||||
if 'min_sleep_time' in config_dict:
|
||||
min_sleep_time = string_to_float_convert_test(
|
||||
config_dict['min_sleep_time'])
|
||||
if min_sleep_time is None:
|
||||
errprint('Invalid min_sleep_time value, not float\nExit')
|
||||
if 'min_sleep' in config_dict:
|
||||
min_sleep = string_to_float_convert_test(
|
||||
config_dict['min_sleep'])
|
||||
if min_sleep is None:
|
||||
errprint('Invalid min_sleep value, not float\nExit')
|
||||
exit(1)
|
||||
if min_sleep_time <= 0:
|
||||
errprint('min_sleep_time must be positive number\nExit')
|
||||
if min_sleep <= 0:
|
||||
errprint('min_sleep must be positive number\nExit')
|
||||
exit(1)
|
||||
else:
|
||||
errprint('min_sleep_time is not in config\nExit')
|
||||
errprint('min_sleep is not in config\nExit')
|
||||
exit(1)
|
||||
|
||||
|
||||
if max_sleep_time < min_sleep_time:
|
||||
# over_sleep is a mandatory option: it must be present in the config,
# parse as a float and be strictly positive; otherwise the program exits.
if 'over_sleep' not in config_dict:
    errprint('over_sleep is not in config\nExit')
    exit(1)

over_sleep = string_to_float_convert_test(config_dict['over_sleep'])
if over_sleep is None:
    errprint('Invalid over_sleep value, not float\nExit')
    exit(1)
elif over_sleep <= 0:
    errprint('over_sleep must be positive number\nExit')
    exit(1)
|
||||
|
||||
|
||||
# The sleep bounds must be ordered (min_sleep <= max_sleep). The message
# names the option that is too large for the condition that triggered it.
if max_sleep < min_sleep:
    errprint(
        'min_sleep value must not exceed max_sleep value.\nExit'
    )
    exit(1)
|
||||
|
||||
|
||||
# Reject configs where over_sleep is greater than min_sleep. The message
# states the rule that was actually violated (over_sleep <= min_sleep).
if min_sleep < over_sleep:
    errprint(
        'over_sleep value must not exceed min_sleep value.\nExit'
    )
    exit(1)
|
||||
|
||||
|
||||
# stable_sleep is True exactly when both sleep bounds are equal.
stable_sleep = (max_sleep == min_sleep)
|
||||
|
||||
|
||||
if print_proc_table_flag:
|
||||
|
||||
if not root:
|
||||
@ -2609,8 +2665,8 @@ if print_mem_check_results:
|
||||
|
||||
|
||||
# handle signals
|
||||
for sig_num in sig_list:
|
||||
signal(sig_num, signal_handler)
|
||||
for i in sig_list:
|
||||
signal(i, signal_handler)
|
||||
|
||||
|
||||
while True:
|
||||
|
15
nohang.conf
15
nohang.conf
@ -128,11 +128,12 @@ rate_zram = 500
|
||||
|
||||
See also https://github.com/rfjakob/earlyoom/issues/61
|
||||
|
||||
max_sleep_time = 3
|
||||
max_sleep = 3
|
||||
min_sleep = 0.1
|
||||
|
||||
min_sleep_time = 0.1
|
||||
Sleep time if soft threshold exceeded.
|
||||
|
||||
# sleep_time_if_threshold_is_exceeded = 0.02 # (todo)
|
||||
over_sleep = 0.05
|
||||
|
||||
#####################################################################
|
||||
|
||||
@ -144,8 +145,12 @@ min_badness = 20
|
||||
|
||||
Valid values are non-negative floating-point numbers.
|
||||
|
||||
min_delay_after_sigterm = 2
|
||||
min_delay_after_sigkill = 0.5
|
||||
min_delay_after_sigterm = 3
|
||||
|
||||
New nohang behavior: check victim lifetime after killing.
|
||||
This key should be removed from the config.
|
||||
|
||||
min_delay_after_sigkill = 0.001
|
||||
|
||||
Valid values are True and False.
|
||||
Values are case sensitive.
|
||||
|
@ -46,9 +46,9 @@ with open('/proc/meminfo') as f:
|
||||
if line.startswith('SwapTotal'):
|
||||
swap_total = int(line.split(':')[1][:-4])
|
||||
if swap_total > 0:
|
||||
wait_time = 5
|
||||
wait_time = 8
|
||||
else:
|
||||
wait_time = 1
|
||||
wait_time = 2
|
||||
|
||||
|
||||
print('nohang_notify_helper: wait_time:', wait_time)
|
||||
|
Loading…
Reference in New Issue
Block a user