check victim lifetime after killing
This commit is contained in:
parent
488592d9ad
commit
57417b0370
218
nohang
218
nohang
@ -41,7 +41,6 @@ sig_dict = {
|
|||||||
SIGTERM: 'SIGTERM'
|
SIGTERM: 'SIGTERM'
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
self_pid = str(os.getpid())
|
self_pid = str(os.getpid())
|
||||||
|
|
||||||
self_uid = os.geteuid()
|
self_uid = os.geteuid()
|
||||||
@ -61,10 +60,10 @@ else:
|
|||||||
victim_dict = dict()
|
victim_dict = dict()
|
||||||
|
|
||||||
|
|
||||||
#soft_post_action_delay = 1
|
# soft_post_action_delay = 1
|
||||||
# 1 - 5
|
# 1 - 5
|
||||||
|
|
||||||
#hard_post_action_delay = 0.2
|
# hard_post_action_delay = 0.2
|
||||||
# 0.2 - 1
|
# 0.2 - 1
|
||||||
|
|
||||||
|
|
||||||
@ -107,7 +106,7 @@ cgroup_v1_index, cgroup_v2_index = find_cgroup_indexes()
|
|||||||
|
|
||||||
# define functions
|
# define functions
|
||||||
|
|
||||||
|
'''
|
||||||
def self_rss():
|
def self_rss():
|
||||||
"""
|
"""
|
||||||
"""
|
"""
|
||||||
@ -118,14 +117,28 @@ def print_self_rss():
|
|||||||
"""
|
"""
|
||||||
"""
|
"""
|
||||||
log('Self RSS: {} MiB'.format(self_rss()))
|
log('Self RSS: {} MiB'.format(self_rss()))
|
||||||
|
'''
|
||||||
|
|
||||||
|
|
||||||
|
def pid_to_rss(pid):
    """Return the second field of /proc/<pid>/statm as a string.

    pid: process ID used to build the /proc path

    The sentinel string '-0' is returned instead when the lookup raises
    IndexError, FileNotFoundError or ProcessLookupError.
    """
    statm_path = '/proc/{}/statm'.format(pid)
    try:
        # NOTE(review): rline1() presumably returns the first line of the
        # file; it is defined elsewhere in this file - confirm.
        return rline1(statm_path).split(' ')[1]
    except (IndexError, FileNotFoundError, ProcessLookupError):
        return '-0'
|
||||||
|
|
||||||
|
|
||||||
def signal_handler(signum, frame):
|
def signal_handler(signum, frame):
|
||||||
"""
|
"""
|
||||||
"""
|
"""
|
||||||
for sig_num in sig_list:
|
for i in sig_list:
|
||||||
signal(sig_num, signal_handler_inner)
|
signal(i, signal_handler_inner)
|
||||||
log('Got the {} signal '.format(sig_dict[signum]))
|
log('Signal handler called with the {} signal '.format(
|
||||||
|
sig_dict[signum]))
|
||||||
update_stat_dict_and_print(None)
|
update_stat_dict_and_print(None)
|
||||||
log('Exit')
|
log('Exit')
|
||||||
exit()
|
exit()
|
||||||
@ -134,7 +147,21 @@ def signal_handler(signum, frame):
|
|||||||
def signal_handler_inner(signum, frame):
|
def signal_handler_inner(signum, frame):
|
||||||
"""
|
"""
|
||||||
"""
|
"""
|
||||||
log('Got the {} signal (ignored) '.format(sig_dict[signum]))
|
log('Signal handler called with the {} signal (ignored) '.format(
|
||||||
|
sig_dict[signum]))
|
||||||
|
|
||||||
|
|
||||||
|
def exe(cmd):
    """Run a shell command with os.system() and log its status and duration.

    write_self_oom_score_adj(self_oom_score_adj_max) is called before the
    command starts and write_self_oom_score_adj(self_oom_score_adj_min) is
    called once it is over.

    cmd: command string handed to os.system()

    Returns the value returned by os.system().
    """
    log('Execute the command: {}'.format(cmd))
    t0 = time()
    write_self_oom_score_adj(self_oom_score_adj_max)
    try:
        err = os.system(cmd)
    finally:
        # Always switch back to the minimal value, even when os.system()
        # is interrupted by an exception (e.g. SystemExit raised from a
        # signal handler), so the max value is never left in place.
        write_self_oom_score_adj(self_oom_score_adj_min)
    dt = time() - t0
    log('Exit status: {}; exe duration: {} sec'.format(err, round(dt, 3)))
    return err
|
||||||
|
|
||||||
|
|
||||||
def write(path, string):
|
def write(path, string):
|
||||||
@ -158,19 +185,6 @@ self_oom_score_adj_max = '-6'
|
|||||||
write_self_oom_score_adj(self_oom_score_adj_min)
|
write_self_oom_score_adj(self_oom_score_adj_min)
|
||||||
|
|
||||||
|
|
||||||
def exe(cmd):
|
|
||||||
"""
|
|
||||||
"""
|
|
||||||
log('Execute the command: {}'.format(cmd))
|
|
||||||
t0 = time()
|
|
||||||
write_self_oom_score_adj(self_oom_score_adj_max)
|
|
||||||
err = os.system(cmd)
|
|
||||||
write_self_oom_score_adj(self_oom_score_adj_min)
|
|
||||||
dt = time() - t0
|
|
||||||
log('Exit status: {}; exe duration: {} sec'.format(err, round(dt, 3)))
|
|
||||||
return err
|
|
||||||
|
|
||||||
|
|
||||||
def valid_re(reg_exp):
|
def valid_re(reg_exp):
|
||||||
"""Validate regular expression.
|
"""Validate regular expression.
|
||||||
"""
|
"""
|
||||||
@ -1346,14 +1360,27 @@ def implement_corrective_action(signal):
|
|||||||
if signal is SIGTERM:
|
if signal is SIGTERM:
|
||||||
dt = time() - corrective_actions_dict[SIGTERM]
|
dt = time() - corrective_actions_dict[SIGTERM]
|
||||||
if dt < min_delay_after_sigterm:
|
if dt < min_delay_after_sigterm:
|
||||||
#print(' soft_post_action_delay NOT EXCEEDED')
|
# print(' soft_post_action_delay NOT EXCEEDED')
|
||||||
sleep_after_check_mem(0.2)
|
|
||||||
return 0 # the delay between actions has not elapsed yet
|
if print_sleep_periods:
|
||||||
|
log('Sleep {} sec (in implement_corrective_action())'.format(
|
||||||
|
over_sleep))
|
||||||
|
|
||||||
|
sleep(over_sleep)
|
||||||
|
|
||||||
|
return None # the delay between actions has not elapsed yet
|
||||||
|
|
||||||
else:
|
else:
|
||||||
dt = time() - corrective_actions_dict[SIGKILL]
|
dt = time() - corrective_actions_dict[SIGKILL]
|
||||||
if dt < min_delay_after_sigkill:
|
if dt < min_delay_after_sigkill:
|
||||||
#print(' hard_post_action_delay NOT EXCEEDED')
|
# print(' hard_post_action_delay NOT EXCEEDED')
|
||||||
sleep_after_check_mem(0.2)
|
|
||||||
|
if print_sleep_periods:
|
||||||
|
log('Sleep {} sec (in implement_corrective_action())'.format(
|
||||||
|
over_sleep))
|
||||||
|
|
||||||
|
sleep(over_sleep)
|
||||||
|
|
||||||
return 0 # the delay between actions has not elapsed yet
|
return 0 # the delay between actions has not elapsed yet
|
||||||
|
|
||||||
log(mem_info)
|
log(mem_info)
|
||||||
@ -1472,7 +1499,25 @@ def implement_corrective_action(signal):
|
|||||||
signal = SIGKILL
|
signal = SIGKILL
|
||||||
|
|
||||||
os.kill(int(pid), signal)
|
os.kill(int(pid), signal)
|
||||||
|
|
||||||
response_time = time() - time0
|
response_time = time() - time0
|
||||||
|
|
||||||
|
sleep(0.001)
|
||||||
|
rp = os.path.exists('/proc/{}/exe'.format(pid))
|
||||||
|
|
||||||
|
if signal is SIGKILL or not rp:
|
||||||
|
|
||||||
|
t0 = time()
|
||||||
|
while True:
|
||||||
|
sleep(0.001)
|
||||||
|
rss = pid_to_rss(pid)
|
||||||
|
if rss == '-0':
|
||||||
|
break
|
||||||
|
t1 = time()
|
||||||
|
kill_duration = t1 - t0
|
||||||
|
log('The victim died in {} sec'.format(
|
||||||
|
round(kill_duration, 3)))
|
||||||
|
|
||||||
send_result = 'total response time: {} ms'.format(
|
send_result = 'total response time: {} ms'.format(
|
||||||
round(response_time * 1000))
|
round(response_time * 1000))
|
||||||
|
|
||||||
@ -1525,6 +1570,11 @@ def implement_corrective_action(signal):
|
|||||||
|
|
||||||
try:
|
try:
|
||||||
log(preventing_oom_message)
|
log(preventing_oom_message)
|
||||||
|
if rp:
|
||||||
|
log('Seems like the victim handles signal')
|
||||||
|
else:
|
||||||
|
log('Seems like the victim is dead or zombie')
|
||||||
|
|
||||||
except UnboundLocalError:
|
except UnboundLocalError:
|
||||||
preventing_oom_message = key
|
preventing_oom_message = key
|
||||||
|
|
||||||
@ -1545,8 +1595,6 @@ def implement_corrective_action(signal):
|
|||||||
key = 'victim badness < min_badness'
|
key = 'victim badness < min_badness'
|
||||||
update_stat_dict_and_print(key)
|
update_stat_dict_and_print(key)
|
||||||
|
|
||||||
# sleep_after_send_signal(signal)
|
|
||||||
|
|
||||||
if signal is SIGTERM:
|
if signal is SIGTERM:
|
||||||
corrective_actions_dict[SIGTERM] = time()
|
corrective_actions_dict[SIGTERM] = time()
|
||||||
else:
|
else:
|
||||||
@ -1556,36 +1604,17 @@ def implement_corrective_action(signal):
|
|||||||
print('##################################################################')
|
print('##################################################################')
|
||||||
|
|
||||||
|
|
||||||
'''
|
def sleep_after_check_mem():
|
||||||
def sleep_after_send_signal(signal):
|
|
||||||
"""
|
|
||||||
Sleeping after signal was sent.
|
|
||||||
|
|
||||||
signal: sent signal
|
|
||||||
"""
|
|
||||||
|
|
||||||
#min_delay_after_sigterm = 0.01
|
|
||||||
#min_delay_after_sigkill = 0.01
|
|
||||||
|
|
||||||
return 0
|
|
||||||
|
|
||||||
|
|
||||||
if signal is SIGKILL:
|
|
||||||
if print_sleep_periods:
|
|
||||||
log('Sleep {} sec after implementing a corrective action'.format(
|
|
||||||
min_delay_after_sigkill))
|
|
||||||
sleep(min_delay_after_sigkill)
|
|
||||||
else:
|
|
||||||
if print_sleep_periods:
|
|
||||||
log('Sleep {} sec after implementing a corrective action'.format(
|
|
||||||
min_delay_after_sigterm))
|
|
||||||
sleep(min_delay_after_sigterm)
|
|
||||||
'''
|
|
||||||
|
|
||||||
|
|
||||||
def sleep_after_check_mem(k=1.0):
|
|
||||||
"""Set the sleep time depending on rates and available memory."""
|
"""Set the sleep time depending on rates and available memory."""
|
||||||
|
|
||||||
|
if stable_sleep:
|
||||||
|
|
||||||
|
if print_sleep_periods:
|
||||||
|
log('Sleep {} sec'.format(min_sleep))
|
||||||
|
|
||||||
|
sleep(min_sleep)
|
||||||
|
return None
|
||||||
|
|
||||||
if mem_min_sigkill_kb < mem_min_sigterm_kb:
|
if mem_min_sigkill_kb < mem_min_sigterm_kb:
|
||||||
mem_point = mem_available - mem_min_sigterm_kb
|
mem_point = mem_available - mem_min_sigterm_kb
|
||||||
else:
|
else:
|
||||||
@ -1616,10 +1645,10 @@ def sleep_after_check_mem(k=1.0):
|
|||||||
else:
|
else:
|
||||||
t = t_mem_zram
|
t = t_mem_zram
|
||||||
|
|
||||||
if t > max_sleep_time:
|
if t > max_sleep:
|
||||||
t = max_sleep_time
|
t = max_sleep
|
||||||
elif t < min_sleep_time:
|
elif t < min_sleep:
|
||||||
t = min_sleep_time
|
t = min_sleep
|
||||||
else:
|
else:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@ -1639,7 +1668,7 @@ def sleep_after_check_mem(k=1.0):
|
|||||||
except OSError:
|
except OSError:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
sleep(t * k)
|
sleep(t)
|
||||||
|
|
||||||
|
|
||||||
def calculate_percent(arg_key):
|
def calculate_percent(arg_key):
|
||||||
@ -2355,41 +2384,68 @@ else:
|
|||||||
exit(1)
|
exit(1)
|
||||||
|
|
||||||
|
|
||||||
if 'max_sleep_time' in config_dict:
|
if 'max_sleep' in config_dict:
|
||||||
max_sleep_time = string_to_float_convert_test(
|
max_sleep = string_to_float_convert_test(
|
||||||
config_dict['max_sleep_time'])
|
config_dict['max_sleep'])
|
||||||
if max_sleep_time is None:
|
if max_sleep is None:
|
||||||
errprint('Invalid max_sleep_time value, not float\nExit')
|
errprint('Invalid max_sleep value, not float\nExit')
|
||||||
exit(1)
|
exit(1)
|
||||||
if max_sleep_time <= 0:
|
if max_sleep <= 0:
|
||||||
errprint('max_sleep_time must be positive number\nExit')
|
errprint('max_sleep must be positive number\nExit')
|
||||||
exit(1)
|
exit(1)
|
||||||
else:
|
else:
|
||||||
errprint('max_sleep_time is not in config\nExit')
|
errprint('max_sleep is not in config\nExit')
|
||||||
exit(1)
|
exit(1)
|
||||||
|
|
||||||
|
|
||||||
if 'min_sleep_time' in config_dict:
|
if 'min_sleep' in config_dict:
|
||||||
min_sleep_time = string_to_float_convert_test(
|
min_sleep = string_to_float_convert_test(
|
||||||
config_dict['min_sleep_time'])
|
config_dict['min_sleep'])
|
||||||
if min_sleep_time is None:
|
if min_sleep is None:
|
||||||
errprint('Invalid min_sleep_time value, not float\nExit')
|
errprint('Invalid min_sleep value, not float\nExit')
|
||||||
exit(1)
|
exit(1)
|
||||||
if min_sleep_time <= 0:
|
if min_sleep <= 0:
|
||||||
errprint('min_sleep_time must be positive number\nExit')
|
errprint('min_sleep must be positive number\nExit')
|
||||||
exit(1)
|
exit(1)
|
||||||
else:
|
else:
|
||||||
errprint('min_sleep_time is not in config\nExit')
|
errprint('min_sleep is not in config\nExit')
|
||||||
exit(1)
|
exit(1)
|
||||||
|
|
||||||
|
|
||||||
if max_sleep_time < min_sleep_time:
|
# over_sleep: mandatory config key, must parse as a positive float.
if 'over_sleep' not in config_dict:
    errprint('over_sleep is not in config\nExit')
    exit(1)

over_sleep = string_to_float_convert_test(config_dict['over_sleep'])

# string_to_float_convert_test() yields None for a non-float value.
if over_sleep is None:
    errprint('Invalid over_sleep value, not float\nExit')
    exit(1)

if over_sleep <= 0:
    errprint('over_sleep must be positive number\nExit')
    exit(1)
|
||||||
|
|
||||||
|
|
||||||
|
if max_sleep < min_sleep:
|
||||||
errprint(
|
errprint(
|
||||||
'max_sleep_time value must not exceed min_sleep_time value.\nExit'
|
'max_sleep value must not exceed min_sleep value.\nExit'
|
||||||
)
|
)
|
||||||
exit(1)
|
exit(1)
|
||||||
|
|
||||||
|
|
||||||
|
# over_sleep has to be less than or equal to min_sleep; the message names
# the value that is too large (the previous wording was inverted).
if min_sleep < over_sleep:
    errprint(
        'over_sleep value must not exceed min_sleep value.\nExit'
    )
    exit(1)
|
||||||
|
|
||||||
|
|
||||||
|
# True when both sleep bounds are equal, False otherwise.
stable_sleep = max_sleep == min_sleep
|
||||||
|
|
||||||
|
|
||||||
if print_proc_table_flag:
|
if print_proc_table_flag:
|
||||||
|
|
||||||
if not root:
|
if not root:
|
||||||
@ -2609,8 +2665,8 @@ if print_mem_check_results:
|
|||||||
|
|
||||||
|
|
||||||
# handle signals
|
# handle signals
|
||||||
for sig_num in sig_list:
|
for i in sig_list:
|
||||||
signal(sig_num, signal_handler)
|
signal(i, signal_handler)
|
||||||
|
|
||||||
|
|
||||||
while True:
|
while True:
|
||||||
|
15
nohang.conf
15
nohang.conf
@ -128,11 +128,12 @@ rate_zram = 500
|
|||||||
|
|
||||||
See also https://github.com/rfjakob/earlyoom/issues/61
|
See also https://github.com/rfjakob/earlyoom/issues/61
|
||||||
|
|
||||||
max_sleep_time = 3
|
max_sleep = 3
|
||||||
|
min_sleep = 0.1
|
||||||
|
|
||||||
min_sleep_time = 0.1
|
Sleep time if the soft threshold is exceeded.
|
||||||
|
|
||||||
# sleep_time_if_threshold_is_exceeded = 0.02 # (todo)
|
over_sleep = 0.05
|
||||||
|
|
||||||
#####################################################################
|
#####################################################################
|
||||||
|
|
||||||
@ -144,8 +145,12 @@ min_badness = 20
|
|||||||
|
|
||||||
Valid values are non-negative floating-point numbers.
|
Valid values are non-negative floating-point numbers.
|
||||||
|
|
||||||
min_delay_after_sigterm = 2
|
min_delay_after_sigterm = 3
|
||||||
min_delay_after_sigkill = 0.5
|
|
||||||
|
New nohang behavior: check victim lifetime after killing.
|
||||||
|
This key should be removed from the config.
|
||||||
|
|
||||||
|
min_delay_after_sigkill = 0.001
|
||||||
|
|
||||||
Valid values are True and False.
|
Valid values are True and False.
|
||||||
Values are case sensitive.
|
Values are case sensitive.
|
||||||
|
@ -46,9 +46,9 @@ with open('/proc/meminfo') as f:
|
|||||||
if line.startswith('SwapTotal'):
|
if line.startswith('SwapTotal'):
|
||||||
swap_total = int(line.split(':')[1][:-4])
|
swap_total = int(line.split(':')[1][:-4])
|
||||||
if swap_total > 0:
|
if swap_total > 0:
|
||||||
wait_time = 5
|
wait_time = 8
|
||||||
else:
|
else:
|
||||||
wait_time = 1
|
wait_time = 2
|
||||||
|
|
||||||
|
|
||||||
print('nohang_notify_helper: wait_time:', wait_time)
|
print('nohang_notify_helper: wait_time:', wait_time)
|
||||||
|
Loading…
Reference in New Issue
Block a user