check victim lifetime after killing

This commit is contained in:
Alexey Avramov 2019-05-04 20:46:38 +09:00
parent 488592d9ad
commit 57417b0370
3 changed files with 149 additions and 88 deletions

210
nohang
View File

@ -41,7 +41,6 @@ sig_dict = {
SIGTERM: 'SIGTERM'
}
# PID of this process, kept as a string.
self_pid = str(os.getpid())
# Effective user ID of this process.
self_uid = os.geteuid()
@ -107,7 +106,7 @@ cgroup_v1_index, cgroup_v2_index = find_cgroup_indexes()
# define functions
'''
def self_rss():
"""
"""
@ -118,14 +117,28 @@ def print_self_rss():
"""
"""
log('Self RSS: {} MiB'.format(self_rss()))
'''
def pid_to_rss(pid):
    """Return the second field of /proc/<pid>/statm as a string.

    pid: process ID (anything accepted by str.format)

    The sentinel string '-0' is returned instead when the statm file
    cannot be found, the process lookup fails, or the line that was read
    has no second space-separated field.
    """
    try:
        return rline1('/proc/{}/statm'.format(pid)).split(' ')[1]
    except (IndexError, FileNotFoundError, ProcessLookupError):
        return '-0'
def signal_handler(signum, frame):
"""
Handle a received signal and terminate the program.

Every signal in sig_list is re-bound to signal_handler_inner, the
received signal is logged by its sig_dict name,
update_stat_dict_and_print(None) is called, 'Exit' is logged and
exit() is called.

signum: number of the received signal (must be a key of sig_dict)
frame: not used by this handler
"""
# NOTE(review): the re-binding loop and the log call each appear twice
# below with different wording -- looks like the old and the new revision
# of the same lines side by side; confirm which pair should remain.
for sig_num in sig_list:
signal(sig_num, signal_handler_inner)
log('Got the {} signal '.format(sig_dict[signum]))
for i in sig_list:
signal(i, signal_handler_inner)
log('Signal handler called with the {} signal '.format(
sig_dict[signum]))
update_stat_dict_and_print(None)
log('Exit')
exit()
@ -134,7 +147,21 @@ def signal_handler(signum, frame):
def signal_handler_inner(signum, frame):
"""
Log the received signal and do nothing else.

signum: number of the received signal (must be a key of sig_dict)
frame: not used by this handler
"""
# NOTE(review): two differently worded log calls follow -- looks like the
# old and the new revision of the same statement; confirm which one stays.
log('Got the {} signal (ignored) '.format(sig_dict[signum]))
log('Signal handler called with the {} signal (ignored) '.format(
sig_dict[signum]))
def exe(cmd):
    """Run a command through os.system() and return its exit status.

    cmd: command line passed unchanged to os.system()

    The command is logged before it runs. write_self_oom_score_adj() is
    called with self_oom_score_adj_max right before the command and with
    self_oom_score_adj_min right after it. The exit status and the
    elapsed time, rounded to 3 decimals, are logged at the end.
    """
    log('Execute the command: {}'.format(cmd))
    started = time()
    write_self_oom_score_adj(self_oom_score_adj_max)
    status = os.system(cmd)
    write_self_oom_score_adj(self_oom_score_adj_min)
    elapsed = round(time() - started, 3)
    log('Exit status: {}; exe duration: {} sec'.format(status, elapsed))
    return status
def write(path, string):
@ -158,19 +185,6 @@ self_oom_score_adj_max = '-6'
write_self_oom_score_adj(self_oom_score_adj_min)
def exe(cmd):
    """Run a command through os.system() and return its exit status.

    cmd: command line passed unchanged to os.system()

    The command is logged before it runs. write_self_oom_score_adj() is
    called with self_oom_score_adj_max right before the command and with
    self_oom_score_adj_min right after it. The exit status and the
    elapsed time, rounded to 3 decimals, are logged at the end.
    """
    log('Execute the command: {}'.format(cmd))
    started = time()
    write_self_oom_score_adj(self_oom_score_adj_max)
    status = os.system(cmd)
    write_self_oom_score_adj(self_oom_score_adj_min)
    elapsed = round(time() - started, 3)
    log('Exit status: {}; exe duration: {} sec'.format(status, elapsed))
    return status
def valid_re(reg_exp):
"""Validate regular expression.
"""
@ -1347,13 +1361,26 @@ def implement_corrective_action(signal):
dt = time() - corrective_actions_dict[SIGTERM]
if dt < min_delay_after_sigterm:
# print(' soft_post_action_delay NOT EXCEEDED')
sleep_after_check_mem(0.2)
return 0 # время задержки между действиями не истекло
if print_sleep_periods:
log('Sleep {} sec (in implement_corrective_action())'.format(
over_sleep))
sleep(over_sleep)
return None # время задержки между действиями не истекло
else:
dt = time() - corrective_actions_dict[SIGKILL]
if dt < min_delay_after_sigkill:
# print(' hard_post_action_delay NOT EXCEEDED')
sleep_after_check_mem(0.2)
if print_sleep_periods:
log('Sleep {} sec (in implement_corrective_action())'.format(
over_sleep))
sleep(over_sleep)
return 0 # время задержки между действиями не истекло
log(mem_info)
@ -1472,7 +1499,25 @@ def implement_corrective_action(signal):
signal = SIGKILL
os.kill(int(pid), signal)
response_time = time() - time0
sleep(0.001)
rp = os.path.exists('/proc/{}/exe'.format(pid))
if signal is SIGKILL or not rp:
t0 = time()
while True:
sleep(0.001)
rss = pid_to_rss(pid)
if rss == '-0':
break
t1 = time()
kill_duration = t1 - t0
log('The victim died in {} sec'.format(
round(kill_duration, 3)))
send_result = 'total response time: {} ms'.format(
round(response_time * 1000))
@ -1525,6 +1570,11 @@ def implement_corrective_action(signal):
try:
log(preventing_oom_message)
if rp:
log('Seems like the victim handles signal')
else:
log('Seems like the victim is dead or zombie')
except UnboundLocalError:
preventing_oom_message = key
@ -1545,8 +1595,6 @@ def implement_corrective_action(signal):
key = 'victim badness < min_badness'
update_stat_dict_and_print(key)
# sleep_after_send_signal(signal)
if signal is SIGTERM:
corrective_actions_dict[SIGTERM] = time()
else:
@ -1556,36 +1604,17 @@ def implement_corrective_action(signal):
print('##################################################################')
'''
def sleep_after_send_signal(signal):
"""
Sleeping after signal was sent.

signal: sent signal

Currently a no-op: the unconditional `return 0` below ends the function
before any of the sleeping branches can run.
"""
#min_delay_after_sigterm = 0.01
#min_delay_after_sigkill = 0.01
# Everything after this return is unreachable.
return 0
# Unreachable: would sleep min_delay_after_sigkill after SIGKILL and
# min_delay_after_sigterm after any other signal, logging the period first
# when print_sleep_periods is set.
if signal is SIGKILL:
if print_sleep_periods:
log('Sleep {} sec after implementing a corrective action'.format(
min_delay_after_sigkill))
sleep(min_delay_after_sigkill)
else:
if print_sleep_periods:
log('Sleep {} sec after implementing a corrective action'.format(
min_delay_after_sigterm))
sleep(min_delay_after_sigterm)
'''
def sleep_after_check_mem(k=1.0):
def sleep_after_check_mem():
"""Specify sleep times depends on rates and avialable memory."""
if stable_sleep:
if print_sleep_periods:
log('Sleep {} sec'.format(min_sleep))
sleep(min_sleep)
return None
if mem_min_sigkill_kb < mem_min_sigterm_kb:
mem_point = mem_available - mem_min_sigterm_kb
else:
@ -1616,10 +1645,10 @@ def sleep_after_check_mem(k=1.0):
else:
t = t_mem_zram
if t > max_sleep_time:
t = max_sleep_time
elif t < min_sleep_time:
t = min_sleep_time
if t > max_sleep:
t = max_sleep
elif t < min_sleep:
t = min_sleep
else:
pass
@ -1639,7 +1668,7 @@ def sleep_after_check_mem(k=1.0):
except OSError:
pass
sleep(t * k)
sleep(t)
def calculate_percent(arg_key):
@ -2355,41 +2384,68 @@ else:
exit(1)
# Read the upper sleep bound from config_dict. The program exits with
# status 1 when the key is missing, when string_to_float_convert_test()
# returns None for it, or when the value is not greater than zero.
# NOTE(review): both the *_time and the short spelling of each line are
# present in this section -- looks like the old and the new key names side
# by side; confirm which spelling is current.
if 'max_sleep_time' in config_dict:
max_sleep_time = string_to_float_convert_test(
config_dict['max_sleep_time'])
if max_sleep_time is None:
errprint('Invalid max_sleep_time value, not float\nExit')
if 'max_sleep' in config_dict:
max_sleep = string_to_float_convert_test(
config_dict['max_sleep'])
if max_sleep is None:
errprint('Invalid max_sleep value, not float\nExit')
exit(1)
if max_sleep_time <= 0:
errprint('max_sleep_time must be positive number\nExit')
if max_sleep <= 0:
errprint('max_sleep must be positive number\nExit')
exit(1)
else:
errprint('max_sleep_time is not in config\nExit')
errprint('max_sleep is not in config\nExit')
exit(1)
# Same three checks (present, convertible, greater than zero) for the
# lower sleep bound.
if 'min_sleep_time' in config_dict:
min_sleep_time = string_to_float_convert_test(
config_dict['min_sleep_time'])
if min_sleep_time is None:
errprint('Invalid min_sleep_time value, not float\nExit')
if 'min_sleep' in config_dict:
min_sleep = string_to_float_convert_test(
config_dict['min_sleep'])
if min_sleep is None:
errprint('Invalid min_sleep value, not float\nExit')
exit(1)
if min_sleep_time <= 0:
errprint('min_sleep_time must be positive number\nExit')
if min_sleep <= 0:
errprint('min_sleep must be positive number\nExit')
exit(1)
else:
errprint('min_sleep_time is not in config\nExit')
errprint('min_sleep is not in config\nExit')
exit(1)
if max_sleep_time < min_sleep_time:
# over_sleep is mandatory: it must be present in the config, must be
# accepted by string_to_float_convert_test() and must be greater than zero.
if 'over_sleep' not in config_dict:
    errprint('over_sleep is not in config\nExit')
    exit(1)

over_sleep = string_to_float_convert_test(config_dict['over_sleep'])

if over_sleep is None:
    errprint('Invalid over_sleep value, not float\nExit')
    exit(1)

if over_sleep <= 0:
    errprint('over_sleep must be positive number\nExit')
    exit(1)
# Cross-check the three sleep settings; the required ordering is
# over_sleep <= min_sleep <= max_sleep. Each message names the value that
# is too large, matching the condition that triggers it.
if max_sleep < min_sleep:
    errprint('min_sleep value must not exceed max_sleep value.\nExit')
    exit(1)

if min_sleep < over_sleep:
    errprint('over_sleep value must not exceed min_sleep value.\nExit')
    exit(1)

# Equal bounds leave nothing to compute: sleep_after_check_mem() checks
# this flag first and then always sleeps min_sleep.
stable_sleep = max_sleep == min_sleep
if print_proc_table_flag:
if not root:
@ -2609,8 +2665,8 @@ if print_mem_check_results:
# handle signals
# Register signal_handler for every signal listed in sig_list.
# NOTE(review): the registration loop appears twice with different loop
# variable names -- looks like the old and the new revision of the same
# loop; confirm which one stays.
for sig_num in sig_list:
signal(sig_num, signal_handler)
for i in sig_list:
signal(i, signal_handler)
while True:

View File

@ -128,11 +128,12 @@ rate_zram = 500
See also https://github.com/rfjakob/earlyoom/issues/61
max_sleep_time = 3
max_sleep = 3
min_sleep = 0.1
min_sleep_time = 0.1
Sleep time if the soft threshold is exceeded.
# sleep_time_if_threshold_is_exceeded = 0.02 # (todo)
over_sleep = 0.05
#####################################################################
@ -144,8 +145,12 @@ min_badness = 20
Valid values are non-negative floating-point numbers.
min_delay_after_sigterm = 2
min_delay_after_sigkill = 0.5
min_delay_after_sigterm = 3
New nohang behavior: check victim lifetime after killing.
This key should be removed from the config.
min_delay_after_sigkill = 0.001
Valid values are True and False.
Values are case sensitive.

View File

@ -46,9 +46,9 @@ with open('/proc/meminfo') as f:
if line.startswith('SwapTotal'):
swap_total = int(line.split(':')[1][:-4])
if swap_total > 0:
wait_time = 5
wait_time = 8
else:
wait_time = 1
wait_time = 2
print('nohang_notify_helper: wait_time:', wait_time)