check victim lifetime after killing

This commit is contained in:
Alexey Avramov 2019-05-04 20:46:38 +09:00
parent 488592d9ad
commit 57417b0370
3 changed files with 149 additions and 88 deletions

218
nohang
View File

@ -41,7 +41,6 @@ sig_dict = {
SIGTERM: 'SIGTERM' SIGTERM: 'SIGTERM'
} }
self_pid = str(os.getpid()) self_pid = str(os.getpid())
self_uid = os.geteuid() self_uid = os.geteuid()
@ -61,10 +60,10 @@ else:
victim_dict = dict() victim_dict = dict()
#soft_post_action_delay = 1 # soft_post_action_delay = 1
# 1 - 5 # 1 - 5
#hard_post_action_delay = 0.2 # hard_post_action_delay = 0.2
# 0.2 - 1 # 0.2 - 1
@ -107,7 +106,7 @@ cgroup_v1_index, cgroup_v2_index = find_cgroup_indexes()
# define functions # define functions
'''
def self_rss(): def self_rss():
""" """
""" """
@ -118,14 +117,28 @@ def print_self_rss():
""" """
""" """
log('Self RSS: {} MiB'.format(self_rss())) log('Self RSS: {} MiB'.format(self_rss()))
'''
def pid_to_rss(pid):
    """Return the second field of /proc/<pid>/statm, or '-0' on failure.

    pid: process identifier, interpolated into the /proc path
    returns: the field as a string (not converted to a number); the
        sentinel string '-0' when the line has no second field or the
        process entry cannot be read because it no longer exists
    """
    statm_path = '/proc/{}/statm'.format(pid)
    try:
        # NOTE(review): rline1() is defined elsewhere; presumably it returns
        # the first line of the file - confirm.
        return rline1(statm_path).split(' ')[1]
    except (IndexError, FileNotFoundError, ProcessLookupError):
        # All three failures are reported the same way: as the '-0' marker
        # that callers compare against.
        return '-0'
def signal_handler(signum, frame): def signal_handler(signum, frame):
""" """
""" """
for sig_num in sig_list: for i in sig_list:
signal(sig_num, signal_handler_inner) signal(i, signal_handler_inner)
log('Got the {} signal '.format(sig_dict[signum])) log('Signal handler called with the {} signal '.format(
sig_dict[signum]))
update_stat_dict_and_print(None) update_stat_dict_and_print(None)
log('Exit') log('Exit')
exit() exit()
@ -134,7 +147,21 @@ def signal_handler(signum, frame):
def signal_handler_inner(signum, frame): def signal_handler_inner(signum, frame):
""" """
""" """
log('Got the {} signal (ignored) '.format(sig_dict[signum])) log('Signal handler called with the {} signal (ignored) '.format(
sig_dict[signum]))
def exe(cmd):
    """Run a shell command and return its exit status.

    The command is handed to os.system(). Before the call the daemon's own
    oom_score_adj is switched to self_oom_score_adj_max, and afterwards it
    is switched back to self_oom_score_adj_min. The exit status and the
    wall-clock duration of the call are logged.

    cmd: command line string passed to os.system()
    returns: the value returned by os.system()
    """
    log('Execute the command: {}'.format(cmd))
    t0 = time()
    # NOTE(review): presumably raised so that the spawned command inherits
    # the less protective value - confirm.
    write_self_oom_score_adj(self_oom_score_adj_max)
    try:
        err = os.system(cmd)
    finally:
        # Restore the daemon's own value even if os.system() raises, so the
        # process is never left running with the temporary setting.
        write_self_oom_score_adj(self_oom_score_adj_min)
    dt = time() - t0
    log('Exit status: {}; exe duration: {} sec'.format(err, round(dt, 3)))
    return err
def write(path, string): def write(path, string):
@ -158,19 +185,6 @@ self_oom_score_adj_max = '-6'
write_self_oom_score_adj(self_oom_score_adj_min) write_self_oom_score_adj(self_oom_score_adj_min)
def exe(cmd):
    """Run a shell command and return its exit status.

    The command is handed to os.system(). Before the call the daemon's own
    oom_score_adj is switched to self_oom_score_adj_max, and afterwards it
    is switched back to self_oom_score_adj_min. The exit status and the
    wall-clock duration of the call are logged.

    cmd: command line string passed to os.system()
    returns: the value returned by os.system()
    """
    log('Execute the command: {}'.format(cmd))
    t0 = time()
    # NOTE(review): presumably raised so that the spawned command inherits
    # the less protective value - confirm.
    write_self_oom_score_adj(self_oom_score_adj_max)
    try:
        err = os.system(cmd)
    finally:
        # Restore the daemon's own value even if os.system() raises, so the
        # process is never left running with the temporary setting.
        write_self_oom_score_adj(self_oom_score_adj_min)
    dt = time() - t0
    log('Exit status: {}; exe duration: {} sec'.format(err, round(dt, 3)))
    return err
def valid_re(reg_exp): def valid_re(reg_exp):
"""Validate regular expression. """Validate regular expression.
""" """
@ -1346,14 +1360,27 @@ def implement_corrective_action(signal):
if signal is SIGTERM: if signal is SIGTERM:
dt = time() - corrective_actions_dict[SIGTERM] dt = time() - corrective_actions_dict[SIGTERM]
if dt < min_delay_after_sigterm: if dt < min_delay_after_sigterm:
#print(' soft_post_action_delay NOT EXCEEDED') # print(' soft_post_action_delay NOT EXCEEDED')
sleep_after_check_mem(0.2)
return 0 # время задержки между действиями не истекло if print_sleep_periods:
log('Sleep {} sec (in implement_corrective_action())'.format(
over_sleep))
sleep(over_sleep)
return None # время задержки между действиями не истекло
else: else:
dt = time() - corrective_actions_dict[SIGKILL] dt = time() - corrective_actions_dict[SIGKILL]
if dt < min_delay_after_sigkill: if dt < min_delay_after_sigkill:
#print(' hard_post_action_delay NOT EXCEEDED') # print(' hard_post_action_delay NOT EXCEEDED')
sleep_after_check_mem(0.2)
if print_sleep_periods:
log('Sleep {} sec (in implement_corrective_action())'.format(
over_sleep))
sleep(over_sleep)
return 0 # время задержки между действиями не истекло return 0 # время задержки между действиями не истекло
log(mem_info) log(mem_info)
@ -1472,7 +1499,25 @@ def implement_corrective_action(signal):
signal = SIGKILL signal = SIGKILL
os.kill(int(pid), signal) os.kill(int(pid), signal)
response_time = time() - time0 response_time = time() - time0
sleep(0.001)
rp = os.path.exists('/proc/{}/exe'.format(pid))
if signal is SIGKILL or not rp:
t0 = time()
while True:
sleep(0.001)
rss = pid_to_rss(pid)
if rss == '-0':
break
t1 = time()
kill_duration = t1 - t0
log('The victim died in {} sec'.format(
round(kill_duration, 3)))
send_result = 'total response time: {} ms'.format( send_result = 'total response time: {} ms'.format(
round(response_time * 1000)) round(response_time * 1000))
@ -1525,6 +1570,11 @@ def implement_corrective_action(signal):
try: try:
log(preventing_oom_message) log(preventing_oom_message)
if rp:
log('Seems like the victim handles signal')
else:
log('Seems like the victim is dead or zombie')
except UnboundLocalError: except UnboundLocalError:
preventing_oom_message = key preventing_oom_message = key
@ -1545,8 +1595,6 @@ def implement_corrective_action(signal):
key = 'victim badness < min_badness' key = 'victim badness < min_badness'
update_stat_dict_and_print(key) update_stat_dict_and_print(key)
# sleep_after_send_signal(signal)
if signal is SIGTERM: if signal is SIGTERM:
corrective_actions_dict[SIGTERM] = time() corrective_actions_dict[SIGTERM] = time()
else: else:
@ -1556,36 +1604,17 @@ def implement_corrective_action(signal):
print('##################################################################') print('##################################################################')
''' def sleep_after_check_mem():
def sleep_after_send_signal(signal):
"""
Sleeping after signal was sent.
signal: sent signal
"""
#min_delay_after_sigterm = 0.01
#min_delay_after_sigkill = 0.01
return 0
if signal is SIGKILL:
if print_sleep_periods:
log('Sleep {} sec after implementing a corrective action'.format(
min_delay_after_sigkill))
sleep(min_delay_after_sigkill)
else:
if print_sleep_periods:
log('Sleep {} sec after implementing a corrective action'.format(
min_delay_after_sigterm))
sleep(min_delay_after_sigterm)
'''
def sleep_after_check_mem(k=1.0):
"""Specify sleep times depends on rates and avialable memory.""" """Specify sleep times depends on rates and avialable memory."""
if stable_sleep:
if print_sleep_periods:
log('Sleep {} sec'.format(min_sleep))
sleep(min_sleep)
return None
if mem_min_sigkill_kb < mem_min_sigterm_kb: if mem_min_sigkill_kb < mem_min_sigterm_kb:
mem_point = mem_available - mem_min_sigterm_kb mem_point = mem_available - mem_min_sigterm_kb
else: else:
@ -1616,10 +1645,10 @@ def sleep_after_check_mem(k=1.0):
else: else:
t = t_mem_zram t = t_mem_zram
if t > max_sleep_time: if t > max_sleep:
t = max_sleep_time t = max_sleep
elif t < min_sleep_time: elif t < min_sleep:
t = min_sleep_time t = min_sleep
else: else:
pass pass
@ -1639,7 +1668,7 @@ def sleep_after_check_mem(k=1.0):
except OSError: except OSError:
pass pass
sleep(t * k) sleep(t)
def calculate_percent(arg_key): def calculate_percent(arg_key):
@ -2355,41 +2384,68 @@ else:
exit(1) exit(1)
if 'max_sleep_time' in config_dict: if 'max_sleep' in config_dict:
max_sleep_time = string_to_float_convert_test( max_sleep = string_to_float_convert_test(
config_dict['max_sleep_time']) config_dict['max_sleep'])
if max_sleep_time is None: if max_sleep is None:
errprint('Invalid max_sleep_time value, not float\nExit') errprint('Invalid max_sleep value, not float\nExit')
exit(1) exit(1)
if max_sleep_time <= 0: if max_sleep <= 0:
errprint('max_sleep_time must be positive number\nExit') errprint('max_sleep must be positive number\nExit')
exit(1) exit(1)
else: else:
errprint('max_sleep_time is not in config\nExit') errprint('max_sleep is not in config\nExit')
exit(1) exit(1)
if 'min_sleep_time' in config_dict: if 'min_sleep' in config_dict:
min_sleep_time = string_to_float_convert_test( min_sleep = string_to_float_convert_test(
config_dict['min_sleep_time']) config_dict['min_sleep'])
if min_sleep_time is None: if min_sleep is None:
errprint('Invalid min_sleep_time value, not float\nExit') errprint('Invalid min_sleep value, not float\nExit')
exit(1) exit(1)
if min_sleep_time <= 0: if min_sleep <= 0:
errprint('min_sleep_time must be positive number\nExit') errprint('min_sleep must be positive number\nExit')
exit(1) exit(1)
else: else:
errprint('min_sleep_time is not in config\nExit') errprint('min_sleep is not in config\nExit')
exit(1) exit(1)
if max_sleep_time < min_sleep_time: if 'over_sleep' in config_dict:
over_sleep = string_to_float_convert_test(
config_dict['over_sleep'])
if over_sleep is None:
errprint('Invalid over_sleep value, not float\nExit')
exit(1)
if over_sleep <= 0:
errprint('over_sleep must be positive number\nExit')
exit(1)
else:
errprint('over_sleep is not in config\nExit')
exit(1)
if max_sleep < min_sleep:
errprint( errprint(
'max_sleep_time value must not exceed min_sleep_time value.\nExit' 'max_sleep value must not exceed min_sleep value.\nExit'
) )
exit(1) exit(1)
# over_sleep must not be longer than min_sleep; the message names the
# offending key in the same direction as the check.
if min_sleep < over_sleep:
    errprint(
        'over_sleep value must not exceed min_sleep value.\nExit'
    )
    exit(1)


# When both bounds coincide the sleep period is constant, so
# sleep_after_check_mem() can sleep min_sleep without computing anything.
stable_sleep = max_sleep == min_sleep
if print_proc_table_flag: if print_proc_table_flag:
if not root: if not root:
@ -2609,8 +2665,8 @@ if print_mem_check_results:
# handle signals # handle signals
for sig_num in sig_list: for i in sig_list:
signal(sig_num, signal_handler) signal(i, signal_handler)
while True: while True:

View File

@ -128,11 +128,12 @@ rate_zram = 500
See also https://github.com/rfjakob/earlyoom/issues/61 See also https://github.com/rfjakob/earlyoom/issues/61
max_sleep_time = 3 max_sleep = 3
min_sleep = 0.1
min_sleep_time = 0.1 Sleep time if soft threshold exceeded.
# sleep_time_if_threshold_is_exceeded = 0.02 # (todo) over_sleep = 0.05
##################################################################### #####################################################################
@ -144,8 +145,12 @@ min_badness = 20
Valid values are non-negative floating-point numbers. Valid values are non-negative floating-point numbers.
min_delay_after_sigterm = 2 min_delay_after_sigterm = 3
min_delay_after_sigkill = 0.5
New nohang behavior: check victim lifetime after killing.
This key should be removed from the config.
min_delay_after_sigkill = 0.001
Valid values are True and False. Valid values are True and False.
Values are case sensitive. Values are case sensitive.

View File

@ -46,9 +46,9 @@ with open('/proc/meminfo') as f:
if line.startswith('SwapTotal'): if line.startswith('SwapTotal'):
swap_total = int(line.split(':')[1][:-4]) swap_total = int(line.split(':')[1][:-4])
if swap_total > 0: if swap_total > 0:
wait_time = 5 wait_time = 8
else: else:
wait_time = 1 wait_time = 2
print('nohang_notify_helper: wait_time:', wait_time) print('nohang_notify_helper: wait_time:', wait_time)