fix CLI input; fix UnicodeDecodeError; add Lifetime to victim info; add max_post_sigterm_victim_lifetime
This commit is contained in:
parent
7e34a6e03d
commit
7b154d2ae9
385
nohang
385
nohang
@ -21,30 +21,7 @@ optional arguments:
|
|||||||
./nohang.conf, /etc/nohang/nohang.conf"""
|
./nohang.conf, /etc/nohang/nohang.conf"""
|
||||||
|
|
||||||
|
|
||||||
if len(argv) == 1:
|
SC_CLK_TCK = os.sysconf(os.sysconf_names['SC_CLK_TCK'])
|
||||||
if os.path.exists('./nohang.conf'):
|
|
||||||
config = cd = os.getcwd() + '/nohang.conf'
|
|
||||||
else:
|
|
||||||
config = '/etc/nohang/nohang.conf'
|
|
||||||
|
|
||||||
elif len(argv) == 2:
|
|
||||||
if argv[1] == '--help' or argv[1] == '-h':
|
|
||||||
errprint(help_mess)
|
|
||||||
exit(1)
|
|
||||||
else:
|
|
||||||
errprint('Invalid CLI input')
|
|
||||||
exit(1)
|
|
||||||
|
|
||||||
elif len(argv) > 3:
|
|
||||||
errprint('Invalid CLI input')
|
|
||||||
exit(1)
|
|
||||||
|
|
||||||
else:
|
|
||||||
if argv[1] == '--config' or argv[1] == '-c':
|
|
||||||
config = argv[2]
|
|
||||||
else:
|
|
||||||
errprint('Invalid option: {}'.format(argv[1]))
|
|
||||||
exit(1)
|
|
||||||
|
|
||||||
|
|
||||||
conf_err_mess = 'Invalid config. Exit.'
|
conf_err_mess = 'Invalid config. Exit.'
|
||||||
@ -85,20 +62,51 @@ print_proc_table = False
|
|||||||
|
|
||||||
min_mem_report_interval = 5
|
min_mem_report_interval = 5
|
||||||
|
|
||||||
|
|
||||||
post_kill_exe = ''
|
post_kill_exe = ''
|
||||||
|
|
||||||
|
victim_dict = dict()
|
||||||
|
|
||||||
|
max_ancestry_depth = 1
|
||||||
|
|
||||||
|
max_post_sigterm_victim_lifetime = 9
|
||||||
|
|
||||||
|
|
||||||
##########################################################################
|
##########################################################################
|
||||||
|
|
||||||
# define functions
|
# define functions
|
||||||
|
|
||||||
|
|
||||||
def errprint(text):
|
def uptime():
|
||||||
print(text, file=stderr, flush=True)
|
return float(rline1('/proc/uptime').split(' ')[0])
|
||||||
|
|
||||||
|
|
||||||
|
def pid_to_starttime(pid):
    """Return the start time of process `pid`, in seconds.

    The value is taken from /proc/<pid>/stat: the text after the last ')'
    is split on single spaces and the element at index 20 is used
    (presumably the `starttime` field, measured in clock ticks since
    boot -- see proc(5)). It is divided by SC_CLK_TCK to get seconds.

    pid is a string holding the decimal PID.

    Errors raised while opening or reading the stat file (for example
    FileNotFoundError when the process is gone) propagate to the caller.
    """
    stat_path = '/proc/' + pid + '/stat'

    try:
        stat_text = rline1(stat_path)
    except UnicodeDecodeError:
        # The process name embedded in the stat line may contain bytes
        # that cannot be decoded as text; re-read the file as raw bytes
        # and silently drop whatever cannot be decoded.
        with open(stat_path, 'rb') as f:
            stat_text = f.read().decode('utf-8', 'ignore')

    # Parse from the LAST ')' so that a process name containing spaces or
    # parentheses cannot shift the field positions.
    starttime = stat_text.rpartition(')')[2].split(' ')[20]

    return float(starttime) / SC_CLK_TCK
|
||||||
|
|
||||||
|
|
||||||
|
def get_victim_id(pid):
    """Return an identifier string for process `pid`.

    The identifier has the form '<pid>-<field>', where <field> is the
    element at index 20 of the space-separated text that follows the last
    ')' in /proc/<pid>/stat (presumably the process start time in clock
    ticks -- see proc(5)). Combining the two presumably keeps a reused
    PID from being mistaken for an earlier process.

    pid is a string holding the decimal PID.

    Errors raised while opening or reading the stat file (for example
    FileNotFoundError when the process is gone) propagate to the caller.
    """
    stat_path = '/proc/' + pid + '/stat'

    try:
        stat_text = rline1(stat_path)
    except UnicodeDecodeError:
        # The process name embedded in the stat line may contain bytes
        # that cannot be decoded as text; fall back to a raw read and
        # drop the undecodable bytes instead of crashing.
        with open(stat_path, 'rb') as f:
            stat_text = f.read().decode('utf-8', 'ignore')

    # Parse from the LAST ')' so that a process name containing spaces or
    # parentheses cannot shift the field positions.
    return pid + '-' + stat_text.rpartition(')')[2].split(' ')[20]
|
||||||
|
|
||||||
|
|
||||||
|
def errprint(*text):
    """Print all positional arguments to stderr and flush immediately."""
    print(*text, flush=True, file=stderr)
|
||||||
|
|
||||||
|
|
||||||
def mlockall():
|
def mlockall():
|
||||||
|
"""Lock all memory to prevent swapping nohang process."""
|
||||||
|
|
||||||
MCL_CURRENT = 1
|
MCL_CURRENT = 1
|
||||||
MCL_FUTURE = 2
|
MCL_FUTURE = 2
|
||||||
@ -188,17 +196,17 @@ def check_zram():
|
|||||||
disksize_sum += int(stat[0])
|
disksize_sum += int(stat[0])
|
||||||
mem_used_total_sum += int(stat[1])
|
mem_used_total_sum += int(stat[1])
|
||||||
|
|
||||||
ZRAM_DISKSIZE_FACTOR = 0.0042
|
# Means that when setting zram disksize = 1 GiB available memory
|
||||||
# Означает, что при задани zram disksize = 1 GiB доступная память
|
# decreases by 0.0042 GiB.
|
||||||
# уменьшится на 0.0042 GiB.
|
# Found experimentally, requires clarification with different kernels and architectures.
|
||||||
# Найден экспериментально, требует уточнения с разными ядрами и архитектурами.
|
# For small disksize values (up to a gigabyte) it can be higher, up to 0.0045.
|
||||||
# На небольших дисксайзах (до гигабайта) может быть больше, до 0.0045.
|
# The creator of the zram module claims that ZRAM_DISKSIZE_FACTOR should be 0.001:
|
||||||
# Создатель модуля zram утверждает, что ZRAM_DISKSIZE_FACTOR доожен быть 0.001:
|
|
||||||
# ("zram uses about 0.1% of the size of the disk"
|
# ("zram uses about 0.1% of the size of the disk"
|
||||||
# - https://www.kernel.org/doc/Documentation/blockdev/zram.txt),
|
# - https://www.kernel.org/doc/Documentation/blockdev/zram.txt),
|
||||||
# но это утверждение противоречит опытным данным.
|
# but this statement contradicts the experimental data.
|
||||||
# ZRAM_DISKSIZE_FACTOR = deltaMemAvailavle / disksize
|
# ZRAM_DISKSIZE_FACTOR = deltaMemAvailavle / disksize
|
||||||
# found experimentally
|
# Found experimentally.
|
||||||
|
ZRAM_DISKSIZE_FACTOR = 0.0042
|
||||||
|
|
||||||
return (mem_used_total_sum + disksize_sum * ZRAM_DISKSIZE_FACTOR) / 1024.0
|
return (mem_used_total_sum + disksize_sum * ZRAM_DISKSIZE_FACTOR) / 1024.0
|
||||||
|
|
||||||
@ -276,9 +284,15 @@ def conf_parse_bool(param):
|
|||||||
|
|
||||||
def rline1(path):
    """Read the first line from path and return it without its newline.

    The file is first opened in text mode. If decoding fails with
    UnicodeDecodeError, the file is re-read as raw bytes (at most 999
    bytes), decoded as UTF-8 with undecodable bytes dropped, and the text
    before the first newline is returned.

    Returns None when the file is empty and was read in text mode.
    Errors from opening or reading the file propagate to the caller.
    """
    try:
        with open(path) as f:
            for line in f:
                # Strip only an actual line terminator: slicing off the
                # last character unconditionally would eat real data when
                # the line is not newline-terminated.
                return line.rstrip('\n')
    except UnicodeDecodeError:
        with open(path, 'rb') as f:
            return f.read(999).decode(
                'utf-8', 'ignore').split('\n')[0]

    # Empty file: the loop body never ran.
    return None
|
||||||
|
|
||||||
|
|
||||||
def kib_to_mib(num):
|
def kib_to_mib(num):
|
||||||
@ -368,19 +382,28 @@ def pid_to_ppid(pid):
|
|||||||
f_list = f.read().decode('utf-8', 'ignore').split('\n')
|
f_list = f.read().decode('utf-8', 'ignore').split('\n')
|
||||||
for i in range(len(f_list)):
|
for i in range(len(f_list)):
|
||||||
if i is ppid_index:
|
if i is ppid_index:
|
||||||
ppid = f_list[i].split('\t')[1]
|
return f_list[i].split('\t')[1]
|
||||||
|
|
||||||
|
|
||||||
def pid_to_ancestry(pid):
|
def pid_to_ancestry(pid, max_ancestry_depth=1):
|
||||||
|
if max_ancestry_depth == 1:
|
||||||
|
ppid = pid_to_ppid(pid)
|
||||||
|
pname = pid_to_name(ppid)
|
||||||
|
return '\n PPID: {} ({})'.format(ppid, pname)
|
||||||
|
if max_ancestry_depth == 0:
|
||||||
|
return ''
|
||||||
anc_list = []
|
anc_list = []
|
||||||
while True:
|
for i in range(max_ancestry_depth):
|
||||||
ppid = pid_to_ppid(pid)
|
ppid = pid_to_ppid(pid)
|
||||||
pname = pid_to_name(ppid)
|
pname = pid_to_name(ppid)
|
||||||
anc_list.append((ppid, pname))
|
anc_list.append((ppid, pname))
|
||||||
if ppid == '1':
|
if ppid == '1':
|
||||||
break
|
break
|
||||||
pid = ppid
|
pid = ppid
|
||||||
print('Ancestry: ', anc_list)
|
a = ''
|
||||||
|
for i in anc_list:
|
||||||
|
a = a + ' <= PID {} ({})'.format(i[0], i[1])
|
||||||
|
return '\n Ancestry: ' + a[4:]
|
||||||
|
|
||||||
|
|
||||||
def pid_to_cmdline(pid):
|
def pid_to_cmdline(pid):
|
||||||
@ -438,7 +461,7 @@ def send_notify_warn():
|
|||||||
|
|
||||||
'''
|
'''
|
||||||
# find process with max badness
|
# find process with max badness
|
||||||
fat_tuple = fattest()
|
fat_tuple = find_victim()
|
||||||
pid = fat_tuple[0]
|
pid = fat_tuple[0]
|
||||||
name = pid_to_name(pid)
|
name = pid_to_name(pid)
|
||||||
|
|
||||||
@ -580,40 +603,13 @@ def get_non_decimal_pids():
|
|||||||
return non_decimal_list
|
return non_decimal_list
|
||||||
|
|
||||||
|
|
||||||
def fattest():
|
def pid_to_badness(pid):
|
||||||
"""
|
"""Find and modify badness (if it needs)."""
|
||||||
Find the process with highest badness and its badness adjustment
|
|
||||||
Return pid and badness
|
|
||||||
|
|
||||||
-> find_mem_hog() or find_victim() or find_worst_process()
|
|
||||||
"""
|
|
||||||
|
|
||||||
ft1 = time()
|
|
||||||
|
|
||||||
pid_list = get_pid_list()
|
|
||||||
|
|
||||||
pid_list.remove(self_pid)
|
|
||||||
|
|
||||||
if '1' in pid_list:
|
|
||||||
pid_list.remove('1')
|
|
||||||
|
|
||||||
non_decimal_list = get_non_decimal_pids()
|
|
||||||
|
|
||||||
for i in non_decimal_list:
|
|
||||||
pid_list.remove(i)
|
|
||||||
|
|
||||||
pid_badness_list = []
|
|
||||||
|
|
||||||
if print_proc_table:
|
|
||||||
print(' PID badness Name eUID')
|
|
||||||
print('------- ------- --------------- ----------')
|
|
||||||
|
|
||||||
for pid in pid_list:
|
|
||||||
|
|
||||||
# find and modify badness (if it needs)
|
|
||||||
try:
|
try:
|
||||||
|
|
||||||
badness = int(rline1('/proc/' + pid + '/oom_score'))
|
oom_score = int(rline1('/proc/' + pid + '/oom_score'))
|
||||||
|
badness = oom_score
|
||||||
|
|
||||||
if decrease_oom_score_adj:
|
if decrease_oom_score_adj:
|
||||||
oom_score_adj = int(rline1('/proc/' + pid + '/oom_score_adj'))
|
oom_score_adj = int(rline1('/proc/' + pid + '/oom_score_adj'))
|
||||||
@ -638,19 +634,56 @@ def fattest():
|
|||||||
if search(re_tup[1], uid) is not None:
|
if search(re_tup[1], uid) is not None:
|
||||||
badness += int(re_tup[0])
|
badness += int(re_tup[0])
|
||||||
|
|
||||||
|
return badness, oom_score
|
||||||
|
|
||||||
|
except FileNotFoundError:
|
||||||
|
return None, None
|
||||||
|
except ProcessLookupError:
|
||||||
|
return None, None
|
||||||
|
|
||||||
|
|
||||||
|
def find_victim():
|
||||||
|
"""
|
||||||
|
Find the process with highest badness and its badness adjustment
|
||||||
|
Return pid and badness
|
||||||
|
"""
|
||||||
|
|
||||||
|
ft1 = time()
|
||||||
|
|
||||||
|
pid_list = get_pid_list()
|
||||||
|
|
||||||
|
pid_list.remove(self_pid)
|
||||||
|
|
||||||
|
if '1' in pid_list:
|
||||||
|
pid_list.remove('1')
|
||||||
|
|
||||||
|
non_decimal_list = get_non_decimal_pids()
|
||||||
|
|
||||||
|
for i in non_decimal_list:
|
||||||
|
pid_list.remove(i)
|
||||||
|
|
||||||
|
pid_badness_list = []
|
||||||
|
|
||||||
if print_proc_table:
|
if print_proc_table:
|
||||||
print('{} {} {} {}'.format(
|
print(' PID badness Name eUID cmdline')
|
||||||
|
print('------- ------- --------------- ---------- -------')
|
||||||
|
|
||||||
|
for pid in pid_list:
|
||||||
|
|
||||||
|
badness = pid_to_badness(pid)[0]
|
||||||
|
if badness is None:
|
||||||
|
continue
|
||||||
|
|
||||||
|
if print_proc_table:
|
||||||
|
print('{} {} {} {} {}'.format(
|
||||||
pid.rjust(7),
|
pid.rjust(7),
|
||||||
str(badness).rjust(7),
|
str(badness).rjust(7),
|
||||||
pid_to_name(pid).ljust(15),
|
pid_to_name(pid).ljust(15),
|
||||||
pid_to_uid(pid).rjust(10)
|
pid_to_uid(pid).rjust(10),
|
||||||
)
|
pid_to_cmdline(pid))
|
||||||
)
|
)
|
||||||
|
|
||||||
except FileNotFoundError:
|
|
||||||
continue
|
|
||||||
except ProcessLookupError:
|
|
||||||
continue
|
|
||||||
pid_badness_list.append((pid, badness))
|
pid_badness_list.append((pid, badness))
|
||||||
|
|
||||||
# Make list of (pid, badness) tuples, sorted by 'badness' values
|
# Make list of (pid, badness) tuples, sorted by 'badness' values
|
||||||
@ -665,45 +698,37 @@ def fattest():
|
|||||||
|
|
||||||
# Get maximum 'badness' value
|
# Get maximum 'badness' value
|
||||||
victim_badness = pid_tuple_list[1]
|
victim_badness = pid_tuple_list[1]
|
||||||
|
victim_name = pid_to_name(pid)
|
||||||
|
|
||||||
print(
|
print(
|
||||||
'\nWorst process (PID: {}, Name: {}, badness: {}) found in {} ms'.format(
|
'\nWorst process (PID: {}, Name: {}, badness: {}) found in {} ms'.format(
|
||||||
pid,
|
pid,
|
||||||
pid_to_name(pid),
|
victim_name,
|
||||||
victim_badness,
|
victim_badness,
|
||||||
round((time() - ft1) * 1000)
|
round((time() - ft1) * 1000)
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
return pid, victim_badness
|
return pid, victim_badness, victim_name
|
||||||
|
|
||||||
|
|
||||||
def find_victim_and_send_signal(signal):
|
def find_victim_info(pid, victim_badness, name):
|
||||||
"""
|
|
||||||
Find victim with highest badness and send SIGTERM/SIGKILL
|
|
||||||
|
|
||||||
-> implement_corrective_action()
|
status0 = time()
|
||||||
"""
|
|
||||||
|
|
||||||
pid, victim_badness = fattest()
|
|
||||||
name = pid_to_name(pid)
|
|
||||||
|
|
||||||
if victim_badness >= min_badness:
|
|
||||||
|
|
||||||
# Try to send signal to found victim
|
|
||||||
|
|
||||||
# Get VmRSS and VmSwap and cmdline of victim process
|
|
||||||
# and try to send a signal
|
|
||||||
try:
|
try:
|
||||||
|
|
||||||
with open('/proc/' + pid + '/status') as f:
|
with open('/proc/' + pid + '/status') as f:
|
||||||
|
|
||||||
for n, line in enumerate(f):
|
for n, line in enumerate(f):
|
||||||
|
|
||||||
if n is state_index:
|
if n is state_index:
|
||||||
state = line.split('\t')[1].rstrip()
|
state = line.split('\t')[1].rstrip()
|
||||||
|
continue
|
||||||
|
|
||||||
if n is ppid_index:
|
if n is ppid_index:
|
||||||
ppid = line.split('\t')[1]
|
ppid = line.split('\t')[1]
|
||||||
|
continue
|
||||||
|
|
||||||
if n is uid_index:
|
if n is uid_index:
|
||||||
uid = line.split('\t')[2]
|
uid = line.split('\t')[2]
|
||||||
@ -743,29 +768,28 @@ def find_victim_and_send_signal(signal):
|
|||||||
oom_score_adj = rline1('/proc/' + pid + '/oom_score_adj')
|
oom_score_adj = rline1('/proc/' + pid + '/oom_score_adj')
|
||||||
|
|
||||||
except FileNotFoundError:
|
except FileNotFoundError:
|
||||||
print(mem_info)
|
|
||||||
print('The victim died in the search process: FileNotFoundError')
|
print('The victim died in the search process: FileNotFoundError')
|
||||||
update_stat_dict_and_print(
|
update_stat_dict_and_print(
|
||||||
'The victim died in the search process: FileNotFoundError')
|
'The victim died in the search process: FileNotFoundError')
|
||||||
return None
|
return None
|
||||||
except ProcessLookupError:
|
except ProcessLookupError:
|
||||||
print(mem_info)
|
|
||||||
print('The victim died in the search process: ProcessLookupError')
|
print('The victim died in the search process: ProcessLookupError')
|
||||||
update_stat_dict_and_print(
|
update_stat_dict_and_print(
|
||||||
'The victim died in the search process: ProcessLookupError')
|
'The victim died in the search process: ProcessLookupError')
|
||||||
return None
|
return None
|
||||||
except UnicodeDecodeError:
|
except UnicodeDecodeError:
|
||||||
|
|
||||||
# тут надо снова все исключ обработать
|
|
||||||
|
|
||||||
with open('/proc/' + pid + '/status', 'rb') as f:
|
with open('/proc/' + pid + '/status', 'rb') as f:
|
||||||
f_list = f.read().decode('utf-8', 'ignore').split('\n')
|
f_list = f.read().decode('utf-8', 'ignore').split('\n')
|
||||||
|
|
||||||
for i in range(len(f_list)):
|
for i in range(len(f_list)):
|
||||||
|
|
||||||
|
if i is state_index:
|
||||||
|
state = f_list[i].split('\t')[1].rstrip()
|
||||||
|
|
||||||
if i is ppid_index:
|
if i is ppid_index:
|
||||||
ppid = f_list[i].split('\t')[1]
|
ppid = f_list[i].split('\t')[1]
|
||||||
|
|
||||||
for i in range(len(f_list)):
|
|
||||||
if i is uid_index:
|
if i is uid_index:
|
||||||
uid = f_list[i].split('\t')[2]
|
uid = f_list[i].split('\t')[2]
|
||||||
|
|
||||||
@ -794,45 +818,48 @@ def find_victim_and_send_signal(signal):
|
|||||||
vm_swap = kib_to_mib(
|
vm_swap = kib_to_mib(
|
||||||
int(f_list[i].split('\t')[1][:-3]))
|
int(f_list[i].split('\t')[1][:-3]))
|
||||||
|
|
||||||
with open('/proc/' + pid + '/cmdline') as file:
|
cmdline = pid_to_cmdline(pid)
|
||||||
cmdline = file.readlines()[0].replace('\x00', ' ')
|
|
||||||
|
|
||||||
oom_score = rline1('/proc/' + pid + '/oom_score')
|
oom_score = rline1('/proc/' + pid + '/oom_score')
|
||||||
oom_score_adj = rline1('/proc/' + pid + '/oom_score_adj')
|
oom_score_adj = rline1('/proc/' + pid + '/oom_score_adj')
|
||||||
|
|
||||||
except IndexError:
|
except IndexError:
|
||||||
print(mem_info)
|
|
||||||
print('The victim died in the search process: IndexError')
|
print('The victim died in the search process: IndexError')
|
||||||
update_stat_dict_and_print(
|
update_stat_dict_and_print(
|
||||||
'The victim died in the search process: IndexError')
|
'The victim died in the search process: IndexError')
|
||||||
return None
|
return None
|
||||||
except ValueError:
|
except ValueError:
|
||||||
print(mem_info)
|
|
||||||
print('The victim died in the search process: ValueError')
|
print('The victim died in the search process: ValueError')
|
||||||
update_stat_dict_and_print(
|
update_stat_dict_and_print(
|
||||||
'The victim died in the search process: ValueError')
|
'The victim died in the search process: ValueError')
|
||||||
return None
|
return None
|
||||||
|
except FileNotFoundError:
|
||||||
|
print('The victim died in the search process: FileNotFoundError')
|
||||||
|
update_stat_dict_and_print(
|
||||||
|
'The victim died in the search process: FileNotFoundError')
|
||||||
|
return None
|
||||||
|
except ProcessLookupError:
|
||||||
|
print('The victim died in the search process: ProcessLookupError')
|
||||||
|
update_stat_dict_and_print(
|
||||||
|
'The victim died in the search process: ProcessLookupError')
|
||||||
|
return None
|
||||||
|
|
||||||
|
# print((time() - status0) * 1000, 'status time')
|
||||||
|
|
||||||
len_vm = len(str(vm_size))
|
len_vm = len(str(vm_size))
|
||||||
|
|
||||||
try:
|
try:
|
||||||
realpath = os.path.realpath('/proc/' + pid + '/exe')
|
realpath = os.path.realpath('/proc/' + pid + '/exe')
|
||||||
except FileNotFoundError:
|
except FileNotFoundError:
|
||||||
print(mem_info)
|
|
||||||
print('The victim died in the search process: FileNotFoundError')
|
print('The victim died in the search process: FileNotFoundError')
|
||||||
update_stat_dict_and_print(
|
update_stat_dict_and_print(
|
||||||
'The victim died in the search process: FileNotFoundError')
|
'The victim died in the search process: FileNotFoundError')
|
||||||
return None
|
return None
|
||||||
|
|
||||||
#state = pid_to_state(pid)
|
|
||||||
pname = pid_to_name(ppid.strip('\n '))
|
|
||||||
# print([ppid], [pname])
|
|
||||||
|
|
||||||
'''
|
|
||||||
te1 = time()
|
te1 = time()
|
||||||
ancestry = pid_to_ancestry(pid)
|
ancestry = pid_to_ancestry(pid, max_ancestry_depth)
|
||||||
print((time() - te1) * 1000)
|
# print((time() - te1) * 1000, 'ms, ancestry')
|
||||||
'''
|
# if max_ancestry_depth == 0:
|
||||||
|
# ancestry = '\n PPID: {} ({})'.format(ppid, pname)
|
||||||
|
|
||||||
if detailed_rss:
|
if detailed_rss:
|
||||||
detailed_rss_info = ' (' \
|
detailed_rss_info = ' (' \
|
||||||
@ -845,11 +872,13 @@ def find_victim_and_send_signal(signal):
|
|||||||
else:
|
else:
|
||||||
detailed_rss_info = ''
|
detailed_rss_info = ''
|
||||||
|
|
||||||
victim_info = 'Found a process with highest badness:' \
|
victim_lifetime = format_time(uptime() - pid_to_starttime(pid))
|
||||||
|
|
||||||
|
victim_info = '\nFound a process with highest badness:' \
|
||||||
'\n Name: {}' \
|
'\n Name: {}' \
|
||||||
'\n State: {}' \
|
'\n State: {}' \
|
||||||
'\n PID: {}' \
|
'\n PID: {}' \
|
||||||
'\n PPID: {} ({})' \
|
'{}' \
|
||||||
'\n EUID: {}' \
|
'\n EUID: {}' \
|
||||||
'\n badness: {}, ' \
|
'\n badness: {}, ' \
|
||||||
'oom_score: {}, ' \
|
'oom_score: {}, ' \
|
||||||
@ -857,13 +886,13 @@ def find_victim_and_send_signal(signal):
|
|||||||
'\n VmSize: {} MiB' \
|
'\n VmSize: {} MiB' \
|
||||||
'\n VmRSS: {} MiB {}' \
|
'\n VmRSS: {} MiB {}' \
|
||||||
'\n VmSwap: {} MiB' \
|
'\n VmSwap: {} MiB' \
|
||||||
'\n realpath: {}' \
|
'\n Realpath: {}' \
|
||||||
'\n cmdline: {}'.format(
|
'\n Cmdline: {}' \
|
||||||
|
'\n Lifetime: {}'.format(
|
||||||
name,
|
name,
|
||||||
state,
|
state,
|
||||||
pid,
|
pid,
|
||||||
ppid.strip('\n '),
|
ancestry,
|
||||||
pname,
|
|
||||||
uid,
|
uid,
|
||||||
victim_badness,
|
victim_badness,
|
||||||
oom_score,
|
oom_score,
|
||||||
@ -873,12 +902,48 @@ def find_victim_and_send_signal(signal):
|
|||||||
detailed_rss_info,
|
detailed_rss_info,
|
||||||
str(vm_swap).rjust(len_vm),
|
str(vm_swap).rjust(len_vm),
|
||||||
realpath,
|
realpath,
|
||||||
cmdline)
|
cmdline,
|
||||||
|
victim_lifetime)
|
||||||
|
|
||||||
|
return victim_info
|
||||||
|
|
||||||
|
|
||||||
|
def implement_corrective_action(signal):
|
||||||
|
"""
|
||||||
|
Find victim with highest badness and send SIGTERM/SIGKILL
|
||||||
|
"""
|
||||||
|
|
||||||
|
pid, victim_badness, name = find_victim()
|
||||||
|
|
||||||
|
if victim_badness >= min_badness:
|
||||||
|
|
||||||
|
print(find_victim_info(pid, victim_badness, name))
|
||||||
|
|
||||||
|
# kill the victim if it doesn't respond to SIGTERM
|
||||||
|
if signal is SIGTERM:
|
||||||
|
victim_id = get_victim_id(pid)
|
||||||
|
if victim_id not in victim_dict:
|
||||||
|
victim_dict.update({victim_id: time()})
|
||||||
|
else:
|
||||||
|
if time() - victim_dict[
|
||||||
|
victim_id] > max_post_sigterm_victim_lifetime:
|
||||||
|
print(
|
||||||
|
'\nmax_post_sigterm_victim_lifetime excee'
|
||||||
|
'ded: the victim will get SIGKILL'
|
||||||
|
)
|
||||||
|
signal = SIGKILL
|
||||||
|
|
||||||
if execute_the_command and signal is SIGTERM and name in etc_dict:
|
if execute_the_command and signal is SIGTERM and name in etc_dict:
|
||||||
|
|
||||||
command = etc_dict[name]
|
command = etc_dict[name]
|
||||||
|
|
||||||
|
# todo: make new func
|
||||||
|
m = check_mem_and_swap()
|
||||||
|
ma = round(int(m[0]) / 1024.0)
|
||||||
|
sf = round(int(m[2]) / 1024.0)
|
||||||
|
print('\nMemory status before implementing a corrective action:\n MemAvailable'
|
||||||
|
': {} MiB, SwapFree: {} MiB'.format(ma, sf))
|
||||||
|
|
||||||
exit_status = os.system(etc_dict[name].replace(
|
exit_status = os.system(etc_dict[name].replace(
|
||||||
'$PID', pid).replace('$NAME', pid_to_name(pid)))
|
'$PID', pid).replace('$NAME', pid_to_name(pid)))
|
||||||
|
|
||||||
@ -896,7 +961,6 @@ def find_victim_and_send_signal(signal):
|
|||||||
'$PID', pid).replace('$NAME', pid_to_name(pid)), exit_status,
|
'$PID', pid).replace('$NAME', pid_to_name(pid)), exit_status,
|
||||||
round(response_time * 1000))
|
round(response_time * 1000))
|
||||||
|
|
||||||
print(mem_info)
|
|
||||||
print(etc_info)
|
print(etc_info)
|
||||||
|
|
||||||
key = "Run the command '{}'".format(command)
|
key = "Run the command '{}'".format(command)
|
||||||
@ -915,7 +979,7 @@ def find_victim_and_send_signal(signal):
|
|||||||
m = check_mem_and_swap()
|
m = check_mem_and_swap()
|
||||||
ma = round(int(m[0]) / 1024.0)
|
ma = round(int(m[0]) / 1024.0)
|
||||||
sf = round(int(m[2]) / 1024.0)
|
sf = round(int(m[2]) / 1024.0)
|
||||||
print('\nMemory status before sending a signal:\n MemAvailable'
|
print('\nMemory status before implementing a corrective action:\n MemAvailable'
|
||||||
': {} MiB, SwapFree: {} MiB'.format(ma, sf))
|
': {} MiB, SwapFree: {} MiB'.format(ma, sf))
|
||||||
|
|
||||||
os.kill(int(pid), signal)
|
os.kill(int(pid), signal)
|
||||||
@ -923,10 +987,9 @@ def find_victim_and_send_signal(signal):
|
|||||||
send_result = 'OK; response time: {} ms'.format(
|
send_result = 'OK; response time: {} ms'.format(
|
||||||
round(response_time * 1000))
|
round(response_time * 1000))
|
||||||
|
|
||||||
preventing_oom_message = '{}' \
|
preventing_oom_message = '\nImplement a corrective action:' \
|
||||||
'\nImplement a corrective action:\n ' \
|
'\n Send {} to the victim; {}'.format(
|
||||||
'Send {} to the victim; {}'.format(
|
sig_dict[signal], send_result)
|
||||||
victim_info, sig_dict[signal], send_result)
|
|
||||||
|
|
||||||
key = 'Send {} to {}'.format(
|
key = 'Send {} to {}'.format(
|
||||||
sig_dict[signal], name)
|
sig_dict[signal], name)
|
||||||
@ -953,7 +1016,6 @@ def find_victim_and_send_signal(signal):
|
|||||||
round(response_time * 1000))
|
round(response_time * 1000))
|
||||||
key = 'ProcessLookupError (the victim died in the search process): '
|
key = 'ProcessLookupError (the victim died in the search process): '
|
||||||
|
|
||||||
print(mem_info)
|
|
||||||
print(preventing_oom_message)
|
print(preventing_oom_message)
|
||||||
|
|
||||||
update_stat_dict_and_print(key)
|
update_stat_dict_and_print(key)
|
||||||
@ -961,7 +1023,6 @@ def find_victim_and_send_signal(signal):
|
|||||||
else:
|
else:
|
||||||
|
|
||||||
response_time = time() - time0
|
response_time = time() - time0
|
||||||
print(mem_info)
|
|
||||||
victim_badness_is_too_small = 'victim badness {} < min_b' \
|
victim_badness_is_too_small = 'victim badness {} < min_b' \
|
||||||
'adness {}; nothing to do; response time: {} ms'.format(
|
'adness {}; nothing to do; response time: {} ms'.format(
|
||||||
victim_badness,
|
victim_badness,
|
||||||
@ -1095,10 +1156,31 @@ def calculate_percent(arg_key):
|
|||||||
|
|
||||||
##########################################################################
|
##########################################################################
|
||||||
|
|
||||||
# Try to lock all memory
|
|
||||||
|
|
||||||
|
if len(argv) == 1:
|
||||||
|
if os.path.exists('./nohang.conf'):
|
||||||
|
config = cd = os.getcwd() + '/nohang.conf'
|
||||||
|
else:
|
||||||
|
config = '/etc/nohang/nohang.conf'
|
||||||
|
|
||||||
mlockall()
|
elif len(argv) == 2:
|
||||||
|
if argv[1] == '--help' or argv[1] == '-h':
|
||||||
|
errprint(help_mess)
|
||||||
|
exit(1)
|
||||||
|
else:
|
||||||
|
errprint('Invalid CLI input')
|
||||||
|
exit(1)
|
||||||
|
|
||||||
|
elif len(argv) > 3:
|
||||||
|
errprint('Invalid CLI input')
|
||||||
|
exit(1)
|
||||||
|
|
||||||
|
else:
|
||||||
|
if argv[1] == '--config' or argv[1] == '-c':
|
||||||
|
config = argv[2]
|
||||||
|
else:
|
||||||
|
errprint('Invalid option: {}'.format(argv[1]))
|
||||||
|
exit(1)
|
||||||
|
|
||||||
|
|
||||||
##########################################################################
|
##########################################################################
|
||||||
@ -1621,9 +1703,19 @@ warn_time_now = 0
|
|||||||
warn_time_delta = 1000
|
warn_time_delta = 1000
|
||||||
warn_timer = 0
|
warn_timer = 0
|
||||||
|
|
||||||
|
|
||||||
|
##########################################################################
|
||||||
|
|
||||||
|
# Try to lock all memory
|
||||||
|
|
||||||
|
mlockall()
|
||||||
|
|
||||||
|
##########################################################################
|
||||||
|
|
||||||
|
|
||||||
if print_proc_table:
|
if print_proc_table:
|
||||||
print()
|
print()
|
||||||
fattest()
|
find_victim()
|
||||||
print()
|
print()
|
||||||
|
|
||||||
print('Monitoring started!')
|
print('Monitoring started!')
|
||||||
@ -1664,14 +1756,14 @@ while True:
|
|||||||
if avg10 >= sigkill_psi and time() - psi_t0 >= psi_avg10_sleep_time:
|
if avg10 >= sigkill_psi and time() - psi_t0 >= psi_avg10_sleep_time:
|
||||||
time0 = time()
|
time0 = time()
|
||||||
mem_info = 'avg ({}) > sigkill_psi ({})'.format(avg10, sigkill_psi)
|
mem_info = 'avg ({}) > sigkill_psi ({})'.format(avg10, sigkill_psi)
|
||||||
find_victim_and_send_signal(SIGKILL)
|
implement_corrective_action(SIGKILL)
|
||||||
psi_t0 = time()
|
psi_t0 = time()
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if avg10 >= sigterm_psi and time() - psi_t0 >= psi_avg10_sleep_time:
|
if avg10 >= sigterm_psi and time() - psi_t0 >= psi_avg10_sleep_time:
|
||||||
time0 = time()
|
time0 = time()
|
||||||
mem_info = 'avg ({}) > sigterm_psi ({})'.format(avg10, sigterm_psi)
|
mem_info = 'avg ({}) > sigterm_psi ({})'.format(avg10, sigterm_psi)
|
||||||
find_victim_and_send_signal(SIGTERM)
|
implement_corrective_action(SIGTERM)
|
||||||
psi_t0 = time()
|
psi_t0 = time()
|
||||||
continue
|
continue
|
||||||
|
|
||||||
@ -1780,12 +1872,11 @@ while True:
|
|||||||
# далее пошла проверка превышения порогов
|
# далее пошла проверка превышения порогов
|
||||||
|
|
||||||
# MEM SWAP KILL
|
# MEM SWAP KILL
|
||||||
if mem_available <= mem_min_sigkill_kb and \
|
if (mem_available <= mem_min_sigkill_kb and
|
||||||
swap_free <= swap_min_sigkill_kb:
|
swap_free <= swap_min_sigkill_kb):
|
||||||
time0 = time()
|
time0 = time()
|
||||||
|
|
||||||
mem_info = '{}\nMemory status that r' \
|
mem_info = '{}\nMemory status that requires corrective actions:' \
|
||||||
'equires corrective actions:' \
|
|
||||||
'\n MemAvailable [{} MiB, {} %] <= mem_min_sig' \
|
'\n MemAvailable [{} MiB, {} %] <= mem_min_sig' \
|
||||||
'kill [{} MiB, {} %]\n SwapFree [{} MiB, {} %] <= swa' \
|
'kill [{} MiB, {} %]\n SwapFree [{} MiB, {} %] <= swa' \
|
||||||
'p_min_sigkill [{} MiB, {} %]'.format(
|
'p_min_sigkill [{} MiB, {} %]'.format(
|
||||||
@ -1799,7 +1890,7 @@ while True:
|
|||||||
kib_to_mib(swap_min_sigkill_kb),
|
kib_to_mib(swap_min_sigkill_kb),
|
||||||
swap_sigkill_pc)
|
swap_sigkill_pc)
|
||||||
|
|
||||||
find_victim_and_send_signal(SIGKILL)
|
implement_corrective_action(SIGKILL)
|
||||||
psi_t0 = time()
|
psi_t0 = time()
|
||||||
continue
|
continue
|
||||||
|
|
||||||
@ -1807,8 +1898,7 @@ while True:
|
|||||||
if mem_used_zram >= zram_max_sigkill_kb:
|
if mem_used_zram >= zram_max_sigkill_kb:
|
||||||
time0 = time()
|
time0 = time()
|
||||||
|
|
||||||
mem_info = '{}\nMemory statu' \
|
mem_info = '{}\nMemory status that requires corrective actions:' \
|
||||||
's that requires corrective actions:' \
|
|
||||||
'\n MemUsedZram [{} MiB, {} %] >= zram_max_sig' \
|
'\n MemUsedZram [{} MiB, {} %] >= zram_max_sig' \
|
||||||
'kill [{} MiB, {} %]'.format(
|
'kill [{} MiB, {} %]'.format(
|
||||||
HR,
|
HR,
|
||||||
@ -1817,7 +1907,7 @@ while True:
|
|||||||
kib_to_mib(zram_max_sigkill_kb),
|
kib_to_mib(zram_max_sigkill_kb),
|
||||||
percent(zram_max_sigkill_kb / mem_total))
|
percent(zram_max_sigkill_kb / mem_total))
|
||||||
|
|
||||||
find_victim_and_send_signal(SIGKILL)
|
implement_corrective_action(SIGKILL)
|
||||||
psi_t0 = time()
|
psi_t0 = time()
|
||||||
continue
|
continue
|
||||||
|
|
||||||
@ -1827,8 +1917,7 @@ while True:
|
|||||||
|
|
||||||
time0 = time()
|
time0 = time()
|
||||||
|
|
||||||
mem_info = '{}\nMemory status tha' \
|
mem_info = '{}\nMemory status that requires corrective actions:' \
|
||||||
't requires corrective actions:' \
|
|
||||||
'\n MemAvailable [{} MiB, {} %] <= mem_min_sig' \
|
'\n MemAvailable [{} MiB, {} %] <= mem_min_sig' \
|
||||||
'term [{} MiB, {} %]\n SwapFree [{} MiB, {} %] <= swa' \
|
'term [{} MiB, {} %]\n SwapFree [{} MiB, {} %] <= swa' \
|
||||||
'p_min_sigterm [{} MiB, {} %]'.format(
|
'p_min_sigterm [{} MiB, {} %]'.format(
|
||||||
@ -1844,7 +1933,9 @@ while True:
|
|||||||
kib_to_mib(swap_min_sigterm_kb),
|
kib_to_mib(swap_min_sigterm_kb),
|
||||||
swap_sigterm_pc)
|
swap_sigterm_pc)
|
||||||
|
|
||||||
find_victim_and_send_signal(SIGTERM)
|
print(mem_info)
|
||||||
|
|
||||||
|
implement_corrective_action(SIGTERM)
|
||||||
psi_t0 = time()
|
psi_t0 = time()
|
||||||
continue
|
continue
|
||||||
|
|
||||||
@ -1852,8 +1943,7 @@ while True:
|
|||||||
if mem_used_zram >= zram_max_sigterm_kb:
|
if mem_used_zram >= zram_max_sigterm_kb:
|
||||||
time0 = time()
|
time0 = time()
|
||||||
|
|
||||||
mem_info = '{}\nMemory status that r' \
|
mem_info = '{}\nMemory status that requires corrective actions:' \
|
||||||
'equires corrective actions:' \
|
|
||||||
'\n MemUsedZram [{} MiB, {} %] >= ' \
|
'\n MemUsedZram [{} MiB, {} %] >= ' \
|
||||||
'zram_max_sigterm [{} M, {} %]'.format(
|
'zram_max_sigterm [{} M, {} %]'.format(
|
||||||
HR,
|
HR,
|
||||||
@ -1862,9 +1952,8 @@ while True:
|
|||||||
kib_to_mib(zram_max_sigterm_kb),
|
kib_to_mib(zram_max_sigterm_kb),
|
||||||
percent(zram_max_sigterm_kb / mem_total))
|
percent(zram_max_sigterm_kb / mem_total))
|
||||||
|
|
||||||
find_victim_and_send_signal(SIGTERM)
|
implement_corrective_action(SIGTERM)
|
||||||
|
|
||||||
# сделать одно время для обоих уровней.
|
|
||||||
psi_t0 = time()
|
psi_t0 = time()
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user