3258 lines
96 KiB
Python
Executable File
3258 lines
96 KiB
Python
Executable File
#!/usr/bin/env python3
|
||
"""A daemon that prevents OOM in Linux systems."""
|
||
|
||
import os
|
||
from ctypes import CDLL
|
||
from time import sleep, time
|
||
from operator import itemgetter
|
||
from sys import stdout, stderr, argv, exit
|
||
from re import search
|
||
from sre_constants import error as invalid_re
|
||
from signal import signal, SIGKILL, SIGTERM, SIGINT, SIGQUIT, SIGHUP
|
||
|
||
|
||
##########################################################################
|
||
|
||
# define functions
|
||
|
||
|
||
def get_swap_threshold_tuple(string):
    """Parse a swap threshold written as a percentage or in mebibytes.

    string: config value ending with '%' (percentage) or 'M' (MiB).

    Returns (percentage, True) for a '%' value and (KiB, False) for an 'M'
    value. Any other form is reported on stderr and ends the program with
    exit status 1.
    """
    number = string[:-1]

    if string.endswith('%'):
        if string_to_float_convert_test(number) is None:
            errprint('somewhere swap unit is not float_%')
            exit(1)

        percentage = float(number.strip())
        if percentage < 0 or percentage > 100:
            errprint('invalid value, must be from the range[0; 100] %')
            exit(1)

        return percentage, True

    if string.endswith('M'):
        if string_to_float_convert_test(number) is None:
            errprint('somewhere swap unit is not float_M')
            exit(1)

        # MiB -> KiB
        kibibytes = float(number.strip()) * 1024
        if kibibytes < 0:
            errprint('invalid unit in config (negative value)')
            exit(1)

        return kibibytes, False

    errprint(
        'Invalid config file. There are invalid units somewhere\nExit')
    exit(1)
|
||
|
||
|
||
def find_cgroup_indexes():
    """Locate the cgroup v1 and v2 lines in /proc/self/cgroup.

    Returns (cgroup_v1_index, cgroup_v2_index): the zero-based number of the
    last line containing ':name=' and of the last line starting with '0::'.
    An index is None when no such line exists.
    """
    v1_position = None
    v2_position = None

    with open('/proc/self/cgroup') as cgroup_file:
        for position, entry in enumerate(cgroup_file):
            if ':name=' in entry:
                v1_position = position
            if entry.startswith('0::'):
                v2_position = position

    return v1_position, v2_position
|
||
|
||
|
||
def pid_to_rss(pid):
    """Return the second /proc/[pid]/statm field multiplied by SC_PAGESIZE.

    Returns None if the file is missing, the process is gone or the line
    has no second field.
    """
    statm_path = '/proc/{}/statm'.format(pid)
    try:
        fields = rline1(statm_path).split(' ')
        return int(fields[1]) * SC_PAGESIZE
    except (IndexError, FileNotFoundError, ProcessLookupError):
        return None
|
||
|
||
|
||
def pid_to_vm_size(pid):
    """Return the first /proc/[pid]/statm field multiplied by SC_PAGESIZE.

    Returns None if the file is missing or the process is gone.
    """
    statm_path = '/proc/{}/statm'.format(pid)
    try:
        first_field = rline1(statm_path).partition(' ')[0]
        return int(first_field) * SC_PAGESIZE
    except (IndexError, FileNotFoundError, ProcessLookupError):
        return None
|
||
|
||
|
||
def signal_handler(signum, frame):
    """Log the received signal, print the statistics and exit.

    Every signal in sig_list is first re-routed to signal_handler_inner, so
    signals arriving while this handler runs are only logged.
    """
    for signal_number in sig_list:
        signal(signal_number, signal_handler_inner)

    signal_name = sig_dict[signum]
    log('Signal handler called with the {} signal '.format(signal_name))
    update_stat_dict_and_print(None)
    log('Exit')
    exit()
|
||
|
||
|
||
def signal_handler_inner(signum, frame):
    """Log a signal received during shutdown without acting on it."""
    signal_name = sig_dict[signum]
    log('Signal handler called with the {} signal (ignored) '.format(
        signal_name))
|
||
|
||
|
||
def exe(cmd):
    """Run a shell command and return the os.system() status.

    oom_score_adj of this process is set to self_oom_score_adj_max before
    the command starts and back to self_oom_score_adj_min afterwards. The
    command, its exit status and its duration are logged.
    """
    log('Execute the command: {}'.format(cmd))
    started_at = time()

    write_self_oom_score_adj(self_oom_score_adj_max)
    status = os.system(cmd)
    write_self_oom_score_adj(self_oom_score_adj_min)

    elapsed = time() - started_at
    log('Exit status: {}; exe duration: {} sec'.format(
        status, round(elapsed, 3)))
    return status
|
||
|
||
|
||
def write(path, string):
    """Overwrite the file at path with string."""
    with open(path, 'w') as target:
        target.write(string)
|
||
|
||
|
||
def write_self_oom_score_adj(new_value):
    """Write new_value to /proc/self/oom_score_adj when `root` is truthy."""
    if not root:
        return
    write('/proc/self/oom_score_adj', new_value)
|
||
|
||
|
||
def valid_re(reg_exp):
    """Check that reg_exp compiles; log the error and exit(1) if not."""
    try:
        # Searching an empty string is enough to force compilation.
        search(reg_exp, '')
    except invalid_re:
        log('Invalid config: invalid regexp: {}'.format(reg_exp))
        exit(1)
|
||
|
||
|
||
def func_print_proc_table():
    """Run find_victim() with the process table enabled, then exit."""
    find_victim(True)
    exit()
|
||
|
||
|
||
def log(*msg):
    """Print msg to stdout and, if separate_log is set, pass it to info().

    An OSError from either output is swallowed after a 10 ms pause.
    """
    try:
        print(*msg)
    except OSError:
        sleep(0.01)

    if not separate_log:
        return

    try:
        info(*msg)
    except OSError:
        sleep(0.01)
|
||
|
||
|
||
def print_version():
    """Print the version read from /etc/nohang/version and exit.

    'Nohang unknown version' is printed when the file is missing or empty.
    """
    try:
        version = rline1('/etc/nohang/version')
    except FileNotFoundError:
        version = None

    print('Nohang unknown version' if version is None
          else 'Nohang ' + version)
    exit()
|
||
|
||
|
||
def pid_to_cgroup_v1(pid):
    """Return the cgroup v1 path of a process, or '' if unavailable.

    The path is taken from the line of /proc/[pid]/cgroup whose number is
    cgroup_v1_index: everything after the first '/', with '/' put back in
    front and the trailing newline removed.
    """
    found = ''
    try:
        with open('/proc/' + pid + '/cgroup') as cgroup_file:
            for position, entry in enumerate(cgroup_file):
                if position == cgroup_v1_index:
                    found = '/' + entry.partition('/')[2][:-1]
    except FileNotFoundError:
        return ''
    return found
|
||
|
||
|
||
def pid_to_cgroup_v2(pid):
    """Return the cgroup v2 path of a process, or '' if unavailable.

    The path is the line of /proc/[pid]/cgroup whose number is
    cgroup_v2_index, without its first three characters and without the
    trailing newline.
    """
    found = ''
    try:
        with open('/proc/' + pid + '/cgroup') as cgroup_file:
            for position, entry in enumerate(cgroup_file):
                if position == cgroup_v2_index:
                    found = entry[3:-1]
    except FileNotFoundError:
        return ''
    return found
|
||
|
||
|
||
def pid_to_starttime(pid):
    """Return the starttime field of /proc/[pid]/stat divided by SC_CLK_TCK.

    The field is located after the last ')' so that a process name holding
    spaces or parentheses cannot shift it. FileNotFoundError is not handled
    here and propagates to the caller.
    """
    stat_path = '/proc/' + pid + '/stat'

    try:
        stat_text = rline1(stat_path)
        ticks = stat_text.rpartition(')')[2].split(' ')[20]
    except UnicodeDecodeError:
        with open(stat_path, 'rb') as stat_file:
            stat_text = stat_file.read().decode('utf-8', 'ignore')
        ticks = stat_text.rpartition(')')[2].split(' ')[20]

    return float(ticks) / SC_CLK_TCK
|
||
|
||
|
||
def get_victim_id(pid):
    """Return '<starttime>_pid<pid>' for a process, or '' if it is gone.

    starttime is the raw field taken from /proc/[pid]/stat.
    """
    try:
        after_name = rline1('/proc/' + pid + '/stat').rpartition(')')[2]
        starttime = after_name.split(' ')[20]
        return starttime + '_pid' + pid
    except (FileNotFoundError, ProcessLookupError):
        return ''
|
||
|
||
|
||
def pid_to_state(pid):
    """Return the one-letter state from /proc/[pid]/stat.

    Errors such as FileNotFoundError are not handled here; the caller is
    expected to deal with them.
    """
    stat_line = rline1('/proc/' + pid + '/stat')
    # After the last ')' comes a space and then the state letter.
    after_name = stat_line.rpartition(')')[2]
    return after_name[1]
|
||
|
||
|
||
def pid_to_name(pid):
    """Return the content of /proc/[pid]/comm without its last character.

    Bytes that are not valid UTF-8 are dropped. Returns '' if the process
    does not exist.
    """
    try:
        with open('/proc/' + pid + '/comm', 'rb') as comm_file:
            raw = comm_file.read()
    except (FileNotFoundError, ProcessLookupError):
        return ''
    return raw.decode('utf-8', 'ignore')[:-1]
|
||
|
||
|
||
def pid_to_ppid(pid):
    """Return the parent PID of a process as a string.

    The value is the second tab-separated field of the line of
    /proc/[pid]/status whose number is ppid_index. Returns '' if the
    process is gone and None if the file has no such line.
    """
    status_path = '/proc/' + pid + '/status'

    try:
        try:
            with open(status_path) as f:
                for n, line in enumerate(f):
                    # Compare by value: `is` on ints relies on CPython's
                    # small-integer cache.
                    if n == ppid_index:
                        return line.split('\t')[1].strip()
        except UnicodeDecodeError:
            # The Name line may hold bytes that are not valid text;
            # re-read as bytes and drop what cannot be decoded.
            with open(status_path, 'rb') as f:
                f_list = f.read().decode('utf-8', 'ignore').split('\n')
            for n, line in enumerate(f_list):
                if n == ppid_index:
                    return line.split('\t')[1].strip()
    except (FileNotFoundError, ProcessLookupError):
        return ''

    return None
|
||
|
||
|
||
def pid_to_ancestry(pid, max_ancestry_depth=1):
    """Describe the ancestors of a process for the victim report.

    pid: str pid of the process
    max_ancestry_depth: 1 gives a single PPID line, 0 gives '', any other
        value gives an 'Ancestry' line with at most that many ancestors,
        stopping early once PID 1 is reached.

    Returns the text, starting with a newline unless it is empty.
    """
    if max_ancestry_depth == 1:
        parent = pid_to_ppid(pid)
        parent_name = pid_to_name(parent)
        return '\n PPID: {} ({})'.format(parent, parent_name)

    if max_ancestry_depth == 0:
        return ''

    chain = []
    current = pid
    for _ in range(max_ancestry_depth):
        parent = pid_to_ppid(current)
        parent_name = pid_to_name(parent)
        chain.append('PID {} ({})'.format(parent, parent_name))
        if parent == '1':
            break
        current = parent

    return '\n Ancestry: ' + ' <= '.join(chain)
|
||
|
||
|
||
def pid_to_cmdline(pid):
    """
    Get process cmdline by pid.

    pid: str pid of required process
    returns string cmdline with NUL separators replaced by spaces and
    trailing whitespace removed; '' if the file does not exist
    """
    try:
        # Arguments are arbitrary bytes, so read in binary mode and drop
        # undecodable bytes instead of raising UnicodeDecodeError.
        with open('/proc/' + pid + '/cmdline', 'rb') as f:
            raw = f.read()
    except FileNotFoundError:
        return ''
    return raw.decode('utf-8', 'ignore').replace('\x00', ' ').rstrip()
|
||
|
||
|
||
def pid_to_environ(pid):
    """
    Get process environ by pid.

    pid: str pid of required process
    returns string environ with NUL separators replaced by spaces and
    trailing whitespace removed; '' if the file does not exist
    """
    try:
        # Environment values are arbitrary bytes, so read in binary mode
        # and drop undecodable bytes instead of raising UnicodeDecodeError.
        with open('/proc/' + pid + '/environ', 'rb') as f:
            raw = f.read()
    except FileNotFoundError:
        return ''
    return raw.decode('utf-8', 'ignore').replace('\x00', ' ').rstrip()
|
||
|
||
|
||
def pid_to_realpath(pid):
    """Return os.path.realpath() of /proc/[pid]/exe.

    Returns '' if resolving the link raises FileNotFoundError.
    """
    exe_link = '/proc/' + pid + '/exe'
    try:
        resolved = os.path.realpath(exe_link)
    except FileNotFoundError:
        resolved = ''
    return resolved
|
||
|
||
|
||
def pid_to_uid(pid):
    """Return the effective UID of a process as a string.

    The value is the third tab-separated field of the line of
    /proc/[pid]/status whose number is uid_index. Returns '' if the file
    does not exist and None if it has no such line.
    """
    status_path = '/proc/' + pid + '/status'

    try:
        with open(status_path) as f:
            for n, line in enumerate(f):
                # Compare by value: `is` on ints relies on CPython's
                # small-integer cache.
                if n == uid_index:
                    return line.split('\t')[2]
    except UnicodeDecodeError:
        # The Name line may hold bytes that are not valid text; re-read as
        # bytes and drop what cannot be decoded.
        with open(status_path, 'rb') as f:
            f_list = f.read().decode('utf-8', 'ignore').split('\n')
        return f_list[uid_index].split('\t')[2]
    except FileNotFoundError:
        return ''

    return None
|
||
|
||
|
||
def pid_to_badness(pid):
    """Find the badness of a process and adjust it per the config.

    badness starts as /proc/[pid]/oom_score. With decrease_oom_score_adj
    set, a positive oom_score_adj above oom_score_adj_max is capped to that
    maximum. Each enabled regex category (name, cgroup v1/v2, realpath,
    cmdline, environ, uid) then adds the adjustment of every matching rule.
    With forbid_negative_badness set, the result is clamped at 0.

    Returns (badness, oom_score), or (None, None) if the process is gone.
    """

    def matched_adjustment(text, rules):
        """Sum rule[0] over the rules whose pattern rule[1] is in text."""
        return sum(
            int(rule[0]) for rule in rules
            if search(rule[1], text) is not None)

    try:
        oom_score = int(rline1('/proc/' + pid + '/oom_score'))
        badness = oom_score

        if decrease_oom_score_adj:
            oom_score_adj = int(rline1('/proc/' + pid + '/oom_score_adj'))
            if badness > oom_score_adj_max and oom_score_adj > 0:
                badness = badness - oom_score_adj + oom_score_adj_max

        # The property is read only when its category is enabled.
        if regex_matching:
            badness += matched_adjustment(
                pid_to_name(pid), badness_adj_re_name_list)

        if re_match_cgroup_v1:
            badness += matched_adjustment(
                pid_to_cgroup_v1(pid), badness_adj_re_cgroup_v1_list)

        if re_match_cgroup_v2:
            badness += matched_adjustment(
                pid_to_cgroup_v2(pid), badness_adj_re_cgroup_v2_list)

        if re_match_realpath:
            badness += matched_adjustment(
                pid_to_realpath(pid), badness_adj_re_realpath_list)

        if re_match_cmdline:
            badness += matched_adjustment(
                pid_to_cmdline(pid), badness_adj_re_cmdline_list)

        if re_match_environ:
            badness += matched_adjustment(
                pid_to_environ(pid), badness_adj_re_environ_list)

        if re_match_uid:
            badness += matched_adjustment(
                pid_to_uid(pid), badness_adj_re_uid_list)

        if forbid_negative_badness and badness < 0:
            badness = 0

        return badness, oom_score

    except (FileNotFoundError, ProcessLookupError):
        return None, None
|
||
|
||
|
||
def pid_to_status(pid):
    """Read the main fields of /proc/[pid]/status.

    Lines are picked by number, using the module-level state_index,
    ppid_index, uid_index, vm_size_index, vm_rss_index and vm_swap_index;
    the name is taken from line 0. Reading stops at the VmSwap line.

    Returns (name, state, ppid, uid, vm_size, vm_rss, vm_swap) with the
    three sizes converted by kib_to_mib(), or None if the process is gone
    or a size cannot be parsed. On undecodable text the result of
    pid_to_status_unicode() is returned.
    """
    try:
        with open('/proc/' + pid + '/status') as f:

            for n, line in enumerate(f):

                # Line numbers are compared by value; `is` on ints relies
                # on CPython's small-integer cache.
                if n == 0:
                    name = line.split('\t')[1][:-1]

                if n == state_index:
                    state = line.split('\t')[1][0]
                    continue

                if n == ppid_index:
                    ppid = line.split('\t')[1][:-1]
                    continue

                if n == uid_index:
                    uid = line.split('\t')[2]
                    continue

                # [:-4] strips the trailing ' kB\n'.
                if n == vm_size_index:
                    vm_size = kib_to_mib(int(line.split('\t')[1][:-4]))
                    continue

                if n == vm_rss_index:
                    vm_rss = kib_to_mib(int(line.split('\t')[1][:-4]))
                    continue

                if n == vm_swap_index:
                    vm_swap = kib_to_mib(int(line.split('\t')[1][:-4]))
                    break

        return name, state, ppid, uid, vm_size, vm_rss, vm_swap

    except UnicodeDecodeError:
        return pid_to_status_unicode(pid)

    except (FileNotFoundError, ProcessLookupError, ValueError):
        return None
|
||
|
||
|
||
def pid_to_status_unicode(pid):
    """Read the main fields of /proc/[pid]/status, tolerating bad bytes.

    The file is read as bytes and decoded as UTF-8 with undecodable bytes
    dropped. Lines are picked by number, using the module-level
    state_index, ppid_index, uid_index, vm_size_index, vm_rss_index and
    vm_swap_index; the name is taken from line 0.

    Returns (name, state, ppid, uid, vm_size, vm_rss, vm_swap) with the
    three sizes converted by kib_to_mib(), or None if the process is gone
    or a size cannot be parsed.
    """
    try:
        with open('/proc/' + pid + '/status', 'rb') as f:
            f_list = f.read().decode('utf-8', 'ignore').split('\n')

        for i, line in enumerate(f_list):

            # Line numbers are compared by value; `is` on ints relies on
            # CPython's small-integer cache.
            if i == 0:
                name = line.split('\t')[1]

            if i == state_index:
                state = line.split('\t')[1][0]

            if i == ppid_index:
                ppid = line.split('\t')[1]

            if i == uid_index:
                uid = line.split('\t')[2]

            # [:-3] strips the trailing ' kB' (newlines are already gone).
            if i == vm_size_index:
                vm_size = kib_to_mib(int(line.split('\t')[1][:-3]))

            if i == vm_rss_index:
                vm_rss = kib_to_mib(int(line.split('\t')[1][:-3]))

            if i == vm_swap_index:
                vm_swap = kib_to_mib(int(line.split('\t')[1][:-3]))

        return name, state, ppid, uid, vm_size, vm_rss, vm_swap

    except (FileNotFoundError, ProcessLookupError, ValueError):
        return None
|
||
|
||
|
||
def uptime():
    """Return the first number of /proc/uptime as a float."""
    first_field = rline1('/proc/uptime').split(' ')[0]
    return float(first_field)
|
||
|
||
|
||
def errprint(*text):
    """Print the given values to stderr and flush immediately."""
    print(*text, flush=True, file=stderr)
|
||
|
||
|
||
def mlockall():
    """Lock all memory to prevent swapping nohang process.

    mlockall(2) is first called with MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT
    and, if that fails, again without MCL_ONFAULT. A warning is logged when
    both calls fail.
    """
    MCL_CURRENT = 1
    MCL_FUTURE = 2
    MCL_ONFAULT = 4

    libc = CDLL('libc.so.6', use_errno=True)

    if libc.mlockall(MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT) == 0:
        # All memory locked with MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT
        return

    if libc.mlockall(MCL_CURRENT | MCL_FUTURE) != 0:
        log('WARNING: cannot lock all memory')
    # otherwise: all memory locked with MCL_CURRENT | MCL_FUTURE
|
||
|
||
|
||
def update_stat_dict_and_print(key):
    """Count an event in stat_dict and log the totals.

    key: name of the event to count, or None to only print the totals.

    The totals are logged only when print_total_stat is set.
    """
    if key is not None:
        stat_dict[key] = stat_dict[key] + 1 if key in stat_dict else 1

    if not print_total_stat:
        return

    header = 'Total stat (what happened in the last {}):'.format(
        format_time(time() - start_time))
    rows = ['\n {}: {}'.format(event, stat_dict[event])
            for event in stat_dict]
    log(header + ''.join(rows))
|
||
|
||
|
||
def find_psi_metrics_value(psi_path, psi_metrics):
    """Read one PSI average from a pressure file.

    psi_path: path of the pressure file
    psi_metrics: 'some_avg10', 'some_avg60', 'some_avg300', 'full_avg10',
        'full_avg60' or 'full_avg300'

    Returns the value as a float, or None when psi_support is not set or
    the metric name is unknown.
    """
    if not psi_support:
        return None

    # Position of each avgN=... item among the space-separated fields.
    field_of_window = {'avg10': 1, 'avg60': 2, 'avg300': 3}

    kind, _, window = psi_metrics.partition('_')
    if window not in field_of_window:
        return None
    field = field_of_window[window]

    if kind == 'some':
        # The "some" values are on the first line.
        return float(rline1(psi_path).split(' ')[field].split('=')[1])

    if kind == 'full':
        # The "full" values are on the second line.
        with open(psi_path) as psi_file:
            psi_lines = psi_file.readlines()
        return float(psi_lines[1].split(' ')[field].split('=')[1])

    return None
|
||
|
||
|
||
def check_mem_and_swap():
    """Find mem_available, swap_total and swap_free in /proc/meminfo.

    MemAvailable is expected on line 2 (zero-based); the swap lines are
    located by the module-level swap_total_index and swap_free_index.
    Reading stops at the SwapFree line.

    Returns (mem_available, swap_total, swap_free) as ints, as written in
    the file.
    """
    with open('/proc/meminfo') as f:
        for n, line in enumerate(f):
            # Line numbers are compared by value; `is` on ints relies on
            # CPython's small-integer cache. [:-4] strips ' kB\n'.
            if n == 2:
                mem_available = int(line.split(':')[1][:-4])
                continue
            if n == swap_total_index:
                swap_total = int(line.split(':')[1][:-4])
                continue
            if n == swap_free_index:
                swap_free = int(line.split(':')[1][:-4])
                break
    return mem_available, swap_total, swap_free
|
||
|
||
|
||
def check_zram():
    """Find MemUsedZram.

    Sums zram_stat() over every /sys/block entry starting with 'zram' and
    returns (mem_used_total + disksize * ZRAM_DISKSIZE_FACTOR) / 1024.
    """
    # ZRAM_DISKSIZE_FACTOR = deltaMemAvailable / disksize.
    # It means that setting zram disksize = 1 GiB decreases available
    # memory by 0.0042 GiB. Found experimentally; requires clarification
    # with different kernels and architectures. On small disks (up to a
    # gigabyte) it can be more, up to 0.0045.
    # The creator of the zram module claims that the factor should be
    # 0.001 ("zram uses about 0.1% of the size of the disk"
    # - https://www.kernel.org/doc/Documentation/blockdev/zram.txt),
    # but this statement contradicts the experimental data.
    ZRAM_DISKSIZE_FACTOR = 0.0042

    total_disksize = 0
    total_mem_used = 0

    for device in os.listdir('/sys/block'):
        if not device.startswith('zram'):
            continue
        device_stat = zram_stat(device)
        total_disksize += int(device_stat[0])
        total_mem_used += int(device_stat[1])

    return (total_mem_used + total_disksize * ZRAM_DISKSIZE_FACTOR) / 1024.0
|
||
|
||
|
||
def format_time(t):
    """Format a duration in seconds as 'H h M min S sec'.

    Hours are omitted below one hour and minutes below one minute.
    Fractions of a second are truncated.
    """
    seconds = int(t)

    if seconds < 60:
        return '{} sec'.format(seconds)

    if seconds < 3600:
        minutes, rest = divmod(seconds, 60)
        return '{} min {} sec'.format(minutes, rest)

    hours, remainder = divmod(seconds, 3600)
    minutes, rest = divmod(remainder, 60)
    return '{} h {} min {} sec'.format(hours, minutes, rest)
|
||
|
||
|
||
def string_to_float_convert_test(string):
    """Return string converted to float, or None if it is not a number."""
    try:
        converted = float(string)
    except ValueError:
        converted = None
    return converted
|
||
|
||
|
||
def string_to_int_convert_test(string):
    """Return string converted to int, or None if it is not an integer."""
    try:
        converted = int(string)
    except ValueError:
        converted = None
    return converted
|
||
|
||
|
||
def conf_parse_string(param):
    """
    Get string parameters from the config dict.

    param: config_dict key
    returns config_dict[param].strip()

    A missing key is reported on stderr and ends the program with exit
    status 1.
    """
    if param not in config_dict:
        errprint('All the necessary parameters must be in the config')
        errprint('There is no "{}" parameter in the config'.format(param))
        exit(1)

    return config_dict[param].strip()
|
||
|
||
|
||
def conf_parse_bool(param):
    """
    Get bool parameters from the config_dict.

    param: config_dict key
    returns bool

    Only the exact strings 'True' and 'False' are accepted. A missing key
    or any other value is reported on stderr and ends the program with
    exit status 1.
    """
    if param not in config_dict:
        errprint('All the necessary parameters must be in the config')
        errprint('There is no "{}" parameter in the config'.format(param))
        exit(1)

    raw_value = config_dict[param]

    if raw_value == 'True':
        return True

    if raw_value == 'False':
        return False

    errprint('Invalid value of the "{}" parameter.'.format(param))
    errprint('Valid values are True and False.')
    errprint('Exit')
    exit(1)
|
||
|
||
|
||
def rline1(path):
    """Read the first line of the file at path, without its newline.

    Returns None for an empty file. If the file cannot be decoded as text,
    the first 999 bytes are decoded as UTF-8 with undecodable bytes
    dropped, and the part before the first newline is returned.
    """
    try:
        with open(path) as f:
            for line in f:
                # Strip only a real newline: a first line with no
                # terminator must not lose its last character.
                return line[:-1] if line.endswith('\n') else line
    except UnicodeDecodeError:
        with open(path, 'rb') as f:
            return f.read(999).decode('utf-8', 'ignore').partition('\n')[0]
    return None
|
||
|
||
|
||
def kib_to_mib(num):
    """Convert a KiB value to MiB, rounded to an integer."""
    mebibytes = num / 1024.0
    return round(mebibytes)
|
||
|
||
|
||
def percent(num):
    """Convert a fraction to a percentage rounded to one decimal place."""
    as_percentage = num * 100
    return round(as_percentage, 1)
|
||
|
||
|
||
def just_percent_mem(num):
    """Format a fraction as a percentage, right-aligned to 4 characters."""
    text = str(round(num * 100, 1))
    return text.rjust(4, ' ')
|
||
|
||
|
||
def just_percent_swap(num):
    """Format a fraction as a percentage, right-aligned to 5 characters."""
    text = str(round(num * 100, 1))
    return text.rjust(5, ' ')
|
||
|
||
|
||
def human(num, lenth):
    """Convert a KiB value to whole MiB, right-aligned to lenth chars."""
    mebibytes = round(num / 1024)
    return str(mebibytes).rjust(lenth, ' ')
|
||
|
||
|
||
def zram_stat(zram_id):
    """
    Get zram state.

    zram_id: str zram block-device id
    returns (disksize, mem_used_total) as strings holding byte counts;
    ('0', '0') if the device has no disksize file or its disksize is 0
    """
    try:
        disksize = rline1('/sys/block/' + zram_id + '/disksize')
    except FileNotFoundError:
        return '0', '0'

    # rline1() returns the line without its newline, so compare with '0'
    # (the old comparison against the list ['0\n'] could never be true).
    if disksize == '0':
        return '0', '0'

    try:
        # mm_stat columns are padded with runs of spaces; split() with no
        # argument discards the empty items.
        mm_stat_list = rline1('/sys/block/' + zram_id + '/mm_stat').split()
        mem_used_total = mm_stat_list[2]
    except FileNotFoundError:
        # No mm_stat file: read the separate mem_used_total file.
        mem_used_total = rline1('/sys/block/' + zram_id + '/mem_used_total')

    return disksize, mem_used_total  # BYTES, str
|
||
|
||
|
||
def send_notify_warn():
    """
    Warn the user about low memory (implements low memory warnings).

    Runs warning_exe when check_warning_exe is set; otherwise sends a
    notification with the module-level MemAvailable and SwapFree values in
    percent.
    """
    log('Warning threshold exceeded')

    if check_warning_exe:
        exe(warning_exe)
        return

    mem_percent = round(mem_available / mem_total * 100)
    # + 0.1 avoids division by zero when there is no swap
    swap_percent = round(swap_free / (swap_total + 0.1) * 100)

    send_notification(
        'Low memory',
        'MemAvail: {}%\nSwapFree: {}%'.format(mem_percent, swap_percent))
|
||
|
||
|
||
def send_notify(threshold, name, pid):
    """
    Notify about OOM prevention.

    threshold: key for notify_sig_dict
    name: str process name
    pid: str process pid
    """
    # wait for memory release after corrective action
    # may be useful if free memory was about 0 immediately after
    # corrective action
    sleep(0.05)

    # symbol '&' can break notifications in some themes,
    # therefore it is replaced by '*'
    safe_name = name.replace('&', '*')

    message = '<b>{}</b> [{}] <b>{}</b>'.format(
        notify_sig_dict[threshold], pid, safe_name)

    send_notification('Freeze prevention', message)
|
||
|
||
|
||
def send_notify_etc(pid, name, command):
    """
    Notify about OOM prevention by a custom command.

    command: str command that will be executed
    name: str process name
    pid: str process pid
    """
    # '&' is replaced by '*' in both the name and the command
    safe_name = name.replace('&', '*')
    safe_command = command.replace('&', '*')

    message = (
        '<b>Victim is</b> [{}] <b>{}</b>\nExecute the command:\n<b>{}</b>'
    ).format(pid, safe_name, safe_command)

    send_notification('Freeze prevention', message)
|
||
|
||
|
||
def send_notification(title, body):
    """Hand a notification over to the notify helper.

    The title and body, joined by sixteen '#' characters, are written to a
    cache file in /dev/shm named after self_uid and the current time. The
    helper at notify_helper_path is then started in the background with the
    same uid and time. If the cache file cannot be written, the error is
    logged and None is returned.
    """
    separator = '#' * 16
    stamp = time()

    cache_path = '/dev/shm/nohang_notify_cache_uid{}_time{}'.format(
        str(self_uid), stamp)
    payload = '{}{}{}'.format(title, separator, body)

    try:
        with open(cache_path, 'w') as cache_file:
            cache_file.write(payload)
        os.chmod(cache_path, 0o600)
    except OSError:
        log('OSError while send notification '
            '(No space left on device: /dev/shm)')
        return None

    exe('{} --uid {} --time {} &'.format(
        notify_helper_path, self_uid, stamp))
|
||
|
||
|
||
def get_pid_list():
    """
    Return the entries of /proc for which /proc/<entry>/exe exists.

    Per the original note this is meant to leave out kernel threads and
    zombies.
    """
    return [
        entry for entry in os.listdir('/proc')
        if os.path.exists('/proc/' + entry + '/exe')
    ]
|
||
|
||
|
||
def get_non_decimal_pids():
    """Return module-level pid_list entries not starting with a digit."""
    return [entry for entry in pid_list if not entry[0].isdecimal()]
|
||
|
||
|
||
def find_victim(_print_proc_table):
    """
    Find the process with the highest badness.

    _print_proc_table: if true, also log a table with one row per process;
        its last column is chosen by the module-level extra_table_info.

    Return (pid, victim_badness, victim_name, victim_id).
    """
    ft1 = time()

    # Keep numeric /proc entries only, and never consider this process or
    # PID 1. The list is filtered locally: get_non_decimal_pids() reads the
    # module-level pid_list, not the list built here.
    pid_list = [
        pid for pid in get_pid_list()
        if pid[0].isdecimal() and pid != self_pid and pid != '1'
    ]

    pid_badness_list = []

    if _print_proc_table:

        extra_table_title = {
            'None': '',
            'cgroup_v1': 'CGroup_v1',
            'cgroup_v2': 'CGroup_v2',
            'cmdline': 'cmdline',
            'environ': 'environ',
            'realpath': 'realpath',
            'All': '[CGroup] [CmdLine] [RealPath]',
        }.get(extra_table_info, '')

        # Function that produces the last column; None means empty column.
        extra_table_getter = {
            'cgroup_v1': pid_to_cgroup_v1,
            'cgroup_v2': pid_to_cgroup_v2,
            'cmdline': pid_to_cmdline,
            'environ': pid_to_environ,
            'realpath': pid_to_realpath,
            'All': lambda p: '[CG: {}] [CL: {}] [RP: {}]'.format(
                pid_to_cgroup_v1(p),
                pid_to_cmdline(p),
                pid_to_realpath(p)
            ),
        }.get(extra_table_info)

        hr = '#' * 115

        log(hr)
        log('# PID PPID badness oom_score oom_score_adj e'
            'UID S VmSize VmRSS VmSwap Name {}'.format(
                extra_table_title))
        log('#------- ------- ------- --------- ------------- -------'
            '--- - ------ ----- ------ --------------- --------')

    for pid in pid_list:

        badness = pid_to_badness(pid)[0]

        # The process vanished while it was being examined.
        if badness is None:
            continue

        if _print_proc_table:

            try:
                oom_score = rline1('/proc/' + pid + '/oom_score')
                oom_score_adj = rline1('/proc/' + pid + '/oom_score_adj')
            except FileNotFoundError:
                continue

            # Read the status once and reuse it.
            status = pid_to_status(pid)
            if status is None:
                continue
            name, state, ppid, uid, vm_size, vm_rss, vm_swap = status

            if extra_table_getter is None:
                extra_table_line = ''
            else:
                extra_table_line = extra_table_getter(pid)

            log('#{} {} {} {} {} {} {} {} {} {} {} {}'.format(
                pid.rjust(7),
                ppid.rjust(7),
                str(badness).rjust(7),
                oom_score.rjust(9),
                oom_score_adj.rjust(13),
                uid.rjust(10),
                state,
                str(vm_size).rjust(6),
                str(vm_rss).rjust(5),
                str(vm_swap).rjust(6),
                name.ljust(15),
                extra_table_line
            )
            )

        pid_badness_list.append((pid, badness))

    real_proc_num = len(pid_badness_list)

    # Take the (pid, badness) tuple with the highest badness; on ties the
    # stable sort keeps the process that was listed first.
    pid_tuple_list = sorted(
        pid_badness_list,
        key=itemgetter(1),
        reverse=True
    )[0]

    pid = pid_tuple_list[0]
    victim_id = get_victim_id(pid)

    # Get maximum 'badness' value
    victim_badness = pid_tuple_list[1]
    victim_name = pid_to_name(pid)

    if _print_proc_table:
        log(hr)

    log('Found {} processes with existing /proc/[pid]/exe'.format(
        real_proc_num))

    log(
        'Process with highest badness (found in {} ms):\n PID: {}, Na'
        'me: {}, badness: {}'.format(
            round((time() - ft1) * 1000),
            pid,
            victim_name,
            victim_badness
        )
    )

    return pid, victim_badness, victim_name, victim_id
|
||
|
||
|
||
def find_victim_info(pid, victim_badness, name):
    """
    Collect a multi-line report about the chosen victim.

    pid: str pid of the victim
    victim_badness: badness value to show in the report
    name: str process name to show in the report

    Fields are read from /proc/[pid]/status by line number (module-level
    *_index values), together with oom_score, oom_score_adj, cgroups,
    realpath, lifetime, ancestry and, if print_victim_cmdline is set, the
    command line.

    Returns the report string, or None if the process disappeared or its
    status could not be parsed; that event is logged and counted.
    """
    status0 = time()

    def victim_died(reason):
        """Log and count the loss of the victim during the search."""
        message = 'The victim died in the search process: ' + reason
        log(message)
        update_stat_dict_and_print(message)

    status_path = '/proc/' + pid + '/status'

    try:
        # Read the whole status file first. The undecodable-text fallback
        # now sits inside the outer try, so errors raised while re-reading
        # are handled like any other.
        try:
            with open(status_path) as f:
                status_lines = f.read().split('\n')
        except UnicodeDecodeError:
            with open(status_path, 'rb') as f:
                status_lines = f.read().decode(
                    'utf-8', 'ignore').split('\n')

        def mib_at(index):
            """Return the 'N kB' value on the given line, in MiB."""
            # [:-3] strips the trailing ' kB'.
            return kib_to_mib(
                int(status_lines[index].split('\t')[1][:-3]))

        state = status_lines[state_index].split('\t')[1].rstrip()
        uid = status_lines[uid_index].split('\t')[2]
        vm_size = mib_at(vm_size_index)
        vm_rss = mib_at(vm_rss_index)

        if detailed_rss:
            anon_rss = mib_at(anon_index)
            file_rss = mib_at(file_index)
            shmem_rss = mib_at(shmem_index)

        vm_swap = mib_at(vm_swap_index)

        if print_victim_cmdline:
            cmdline = pid_to_cmdline(pid)
        oom_score = rline1('/proc/' + pid + '/oom_score')
        oom_score_adj = rline1('/proc/' + pid + '/oom_score_adj')

    except FileNotFoundError:
        victim_died('FileNotFoundError')
        return None
    except ProcessLookupError:
        victim_died('ProcessLookupError')
        return None
    except IndexError:
        victim_died('IndexError')
        return None
    except ValueError:
        victim_died('ValueError')
        return None

    len_vm = len(str(vm_size))

    try:
        realpath = os.path.realpath('/proc/' + pid + '/exe')
        victim_lifetime = format_time(uptime() - pid_to_starttime(pid))
        victim_cgroup_v1 = pid_to_cgroup_v1(pid)
        victim_cgroup_v2 = pid_to_cgroup_v2(pid)
    except FileNotFoundError:
        victim_died('FileNotFoundError')
        return None

    ancestry = pid_to_ancestry(pid, max_ancestry_depth)

    if print_victim_cmdline:
        c1 = '\n Cmdline: '
    else:
        cmdline = ''
        c1 = ''

    if detailed_rss:
        detailed_rss_info = ' (' \
            'Anon: {} MiB, ' \
            'File: {} MiB, ' \
            'Shmem: {} MiB)'.format(
                anon_rss,
                file_rss,
                shmem_rss)
    else:
        detailed_rss_info = ''

    victim_info = 'Victim information (found in {} ms):' \
        '\n Name: {}' \
        '\n State: {}' \
        '\n PID: {}' \
        '{}' \
        '\n EUID: {}' \
        '\n badness: {}, ' \
        'oom_score: {}, ' \
        'oom_score_adj: {}' \
        '\n VmSize: {} MiB' \
        '\n VmRSS: {} MiB {}' \
        '\n VmSwap: {} MiB' \
        '\n CGroup_v1: {}' \
        '\n CGroup_v2: {}' \
        '\n Realpath: {}' \
        '{}{}' \
        '\n Lifetime: {}'.format(
            round((time() - status0) * 1000),
            name,
            state,
            pid,
            ancestry,
            uid,
            victim_badness,
            oom_score,
            oom_score_adj,
            vm_size,
            str(vm_rss).rjust(len_vm),
            detailed_rss_info,
            str(vm_swap).rjust(len_vm),
            victim_cgroup_v1,
            victim_cgroup_v2,
            realpath,
            c1, cmdline,
            victim_lifetime)

    return victim_info
|
||
|
||
|
||
def check_mem_swap_ex():
    """Check MemAvailable and SwapFree against the configured thresholds.

    Returns a 7-tuple:
        (threshold, mem_info, mem_available, swap_min_sigkill_kb,
         swap_min_sigterm_kb, swap_free, swap_total)
    where threshold is SIGKILL, SIGTERM, 'WARN' or None, and mem_info is
    the report text for SIGKILL/SIGTERM and None otherwise.
    """
    mem_available, swap_total, swap_free = check_mem_and_swap()

    # A swap threshold set in percent is relative to the current SwapTotal;
    # an absolute one is read from swap_kb_dict.
    swap_min_sigkill_kb = (
        swap_total * swap_min_sigkill_percent / 100.0
        if swap_kill_is_percent
        else swap_kb_dict['swap_min_sigkill_kb'])

    swap_min_sigterm_kb = (
        swap_total * swap_min_sigterm_percent / 100.0
        if swap_term_is_percent
        else swap_kb_dict['swap_min_sigterm_kb'])

    swap_min_warnings_kb = (
        swap_total * swap_min_warnings_percent / 100.0
        if swap_warn_is_percent
        else swap_kb_dict['swap_min_warnings_kb'])

    # Percent shown in the report; '-' when the threshold is not below
    # SwapTotal. The 0.1 addend avoids division by zero without swap.
    swap_sigkill_pc = (
        percent(swap_min_sigkill_kb / (swap_total + 0.1))
        if swap_total > swap_min_sigkill_kb
        else '-')

    swap_sigterm_pc = (
        percent(swap_min_sigterm_kb / (swap_total + 0.1))
        if swap_total > swap_min_sigterm_kb
        else '-')

    # Values appended to every result, whatever the verdict is.
    levels = (mem_available, swap_min_sigkill_kb, swap_min_sigterm_kb,
              swap_free, swap_total)

    hard_exceeded = (mem_available <= mem_min_sigkill_kb and
                     swap_free <= swap_min_sigkill_kb)
    if hard_exceeded:
        report = (
            'Memory status that requires corrective actions '
            '(hard threshold exceeded):'
            '\n MemAvailable [{} MiB, {} %] <= mem_min_sigkill [{} MiB, {} %]'
            '\n SwapFree [{} MiB, {} %] <= swap_min_sigkill [{} MiB, {} %]'
        ).format(
            kib_to_mib(mem_available),
            percent(mem_available / mem_total),
            kib_to_mib(mem_min_sigkill_kb),
            percent(mem_min_sigkill_kb / mem_total),
            kib_to_mib(swap_free),
            percent(swap_free / (swap_total + 0.1)),
            kib_to_mib(swap_min_sigkill_kb),
            swap_sigkill_pc)
        return (SIGKILL, report) + levels

    soft_exceeded = (mem_available <= mem_min_sigterm_kb and
                     swap_free <= swap_min_sigterm_kb)
    if soft_exceeded:
        report = (
            'Memory status that requires corrective actions '
            '(soft threshold exceeded):'
            '\n MemAvailable [{} MiB, {} %] <= mem_min_sigterm [{} MiB, {} %]'
            '\n SwapFree [{} MiB, {} %] <= swap_min_sigterm [{} MiB, {} %]'
        ).format(
            kib_to_mib(mem_available),
            percent(mem_available / mem_total),
            kib_to_mib(mem_min_sigterm_kb),
            round(mem_min_sigterm_percent, 1),
            kib_to_mib(swap_free),
            percent(swap_free / (swap_total + 0.1)),
            kib_to_mib(swap_min_sigterm_kb),
            swap_sigterm_pc)
        return (SIGTERM, report) + levels

    if (gui_low_memory_warnings and
            mem_available <= mem_min_warnings_kb and
            swap_free <= swap_min_warnings_kb + 0.1):
        return ('WARN', None) + levels

    return (None, None) + levels
|
||
|
||
|
||
def check_zram_ex():
    """Check the memory used by zram against the configured limits.

    Returns a 3-tuple (threshold, mem_info, mem_used_zram) where threshold
    is SIGKILL, SIGTERM, 'WARN' or None, and mem_info is the report text
    for SIGKILL/SIGTERM and None otherwise.
    """
    mem_used_zram = check_zram()

    if mem_used_zram >= zram_max_sigkill_kb:

        mem_info = 'Memory status that requir' \
            'es corrective actions (hard threshold exceeded):' \
            '\n MemUsedZram [{} MiB, {} %] >= zram_max_sig' \
            'kill [{} MiB, {} %]'.format(
                kib_to_mib(mem_used_zram),
                percent(mem_used_zram / mem_total),
                kib_to_mib(zram_max_sigkill_kb),
                percent(zram_max_sigkill_kb / mem_total))

        return SIGKILL, mem_info, mem_used_zram

    if mem_used_zram >= zram_max_sigterm_kb:

        # The limit is printed via kib_to_mib(), so the unit label is MiB,
        # the same as in the hard-threshold report above.
        mem_info = 'Memory status that require' \
            's corrective actions (soft threshold exceeded):' \
            '\n MemUsedZram [{} MiB, {} %] >= zra' \
            'm_max_sigterm [{} MiB, {} %]'.format(
                kib_to_mib(mem_used_zram),
                percent(mem_used_zram / mem_total),
                kib_to_mib(zram_max_sigterm_kb),
                percent(zram_max_sigterm_kb / mem_total))

        return SIGTERM, mem_info, mem_used_zram

    if gui_low_memory_warnings and mem_used_zram >= zram_max_warnings_kb:
        return 'WARN', None, mem_used_zram

    return None, None, mem_used_zram
|
||
|
||
|
||
def check_psi_ex(psi_t0, psi_kill_exceeded_timer, psi_term_exceeded_timer, x0):
    """Check the PSI metric against the SIGKILL/SIGTERM/warning thresholds.

    psi_t0 is compared with the current time to decide whether
    psi_post_action_delay has passed; x0 is the timestamp of the previous
    call. The two *_exceeded_timer values accumulate the time during which
    the metric stayed at or above the matching threshold and are reset to 0
    once it drops below.

    Returns a 6-tuple:
        (threshold, mem_info, psi_t0, psi_kill_exceeded_timer,
         psi_term_exceeded_timer, x0)
    where threshold is SIGKILL, SIGTERM, 'WARN' or None.
    """
    # Time elapsed since the previous call; x0 is then moved to "now".
    elapsed = time() - x0
    x0 = time()

    psi_avg_value = find_psi_metrics_value(psi_path, psi_metrics)

    psi_post_action_delay_timer = time() - psi_t0
    psi_post_action_delay_exceeded = (
        psi_post_action_delay_timer >= psi_post_action_delay)

    sigkill_psi_exceeded = psi_avg_value >= sigkill_psi_threshold
    if sigkill_psi_exceeded:
        psi_kill_exceeded_timer += elapsed
    else:
        psi_kill_exceeded_timer = 0

    if psi_debug:
        log('psi_post_action_delay_timer: {}'.format(
            round(psi_post_action_delay_timer, 3)))
        log('psi_post_action_delay_exceeded: {}\nsigkill_psi_exceeded'
            ': {}\npsi_kill_exceeded_timer: {}'.format(
                psi_post_action_delay_exceeded,
                sigkill_psi_exceeded,
                round(psi_kill_exceeded_timer, 1)))

    if (psi_post_action_delay_exceeded and
            psi_kill_exceeded_timer >= psi_excess_duration):

        report = (
            'PSI avg ({}) > sigkill_psi_threshold ({})\n'
            'PSI avg exceeded psi_excess_duration (value'
            ' = {} sec) for {} seconds'
        ).format(
            psi_avg_value,
            sigkill_psi_threshold,
            psi_excess_duration,
            round(psi_kill_exceeded_timer, 1))

        # psi_t0 is deliberately not reset here: per the original author's
        # note the timer must be reset after the corrective action has been
        # applied (victim died during the search, or a signal was sent).
        return (SIGKILL, report, psi_t0, psi_kill_exceeded_timer,
                psi_term_exceeded_timer, x0)

    sigterm_psi_exceeded = psi_avg_value >= sigterm_psi_threshold
    if sigterm_psi_exceeded:
        psi_term_exceeded_timer += elapsed
    else:
        psi_term_exceeded_timer = 0

    if psi_debug:
        log('sigterm_psi_exceeded: {}\n'
            'psi_term_exceeded_timer: {}\n'.format(
                sigterm_psi_exceeded,
                round(psi_term_exceeded_timer, 1)))

    if (psi_post_action_delay_exceeded and
            psi_term_exceeded_timer >= psi_excess_duration):

        report = (
            'PSI avg ({}) > sigterm_psi_threshold ({})\n'
            'PSI avg exceeded psi_excess_duration (value'
            ' = {} sec) for {} seconds'
        ).format(
            psi_avg_value,
            sigterm_psi_threshold,
            psi_excess_duration,
            round(psi_term_exceeded_timer, 1))

        return (SIGTERM, report, psi_t0, psi_kill_exceeded_timer,
                psi_term_exceeded_timer, x0)

    if gui_low_memory_warnings and psi_avg_value >= psi_avg_warnings:
        return ('WARN', None, psi_t0, psi_kill_exceeded_timer,
                psi_term_exceeded_timer, x0)

    return (None, None, psi_t0, psi_kill_exceeded_timer,
            psi_term_exceeded_timer, x0)
|
||
|
||
|
||
def is_victim_alive(pid):
    """Report the victim's state by probing its /proc entries.

    Returns:
        1 - alive (/proc/<pid>/exe exists)
        2 - dying: releasing memory or a zombie (only /proc/<pid>/statm
            exists)
        0 - completely gone
    """
    proc_dir = '/proc/{}'.format(pid)

    if os.path.exists(proc_dir + '/exe'):
        return 1

    if os.path.exists(proc_dir + '/statm'):
        return 2

    return 0
|
||
|
||
|
||
def implement_corrective_action(
        threshold,
        mem_info_list,
        psi_t0,
        # This is simply the time of the last corrective action; ideally
        # the time at which any action finished.
        psi_kill_exceeded_timer,
        psi_term_exceeded_timer,
        x0, psi_threshold, zram_threshold, zram_info, psi_info):
    """
    Find victim with highest badness and send SIGTERM/SIGKILL

    Steps, in order:
      1. For a SIGTERM threshold, return early (after sleeping over_sleep)
         if less than min_delay_after_sigterm has passed since the last
         handled action.
      2. Log mem_info_list, find the victim, then re-check memory/swap
         (and zram/PSI when CHECK_ZRAM/CHECK_PSI are set); return if no
         SIGTERM/SIGKILL threshold is exceeded any more.
      3. If the victim's badness is at least min_badness, either run a
         matching soft-action command (SIGTERM only) or send the signal,
         wait for the victim's reaction and log the result.
      4. Sleep over_sleep.

    threshold: SIGTERM or SIGKILL (compared by identity).
    mem_info_list: messages logged before the victim search.
    psi_t0, psi_kill_exceeded_timer, psi_term_exceeded_timer, x0: PSI timer
        state forwarded to check_psi_ex() when CHECK_PSI is set.
    psi_threshold, zram_threshold, zram_info, psi_info: results of earlier
        checks, used as-is when the matching re-check is disabled.

    Returns psi_t0 (reset to the current time when an action is attempted).

    NOTE(review): block nesting was reconstructed from a listing that had
    lost its indentation - confirm against the original file.
    """

    # TODO: this function is tangled. Untangle it, move parts into separate
    # functions and make its structure simple and clear.

    time0 = time()  # start of the corrective action, to measure its duration

    # For the SIGTERM threshold: leave the function if
    # min_delay_after_sigterm has not passed yet, sleeping for over_sleep.
    # If the hard threshold is exceeded, go on.
    if threshold is SIGTERM:

        dt = time() - actions_time_dict['action_handled'][0]

        if dt < min_delay_after_sigterm:
            log('min_delay_after_sigterm IS NOT EXCEEDED ({} < {})'.format(
                round(dt, 3), min_delay_after_sigterm))

            if print_sleep_periods:
                log('Sleep {} sec [in implement_corrective_action()]'.format(
                    over_sleep))

            sleep(over_sleep)

            return psi_t0  # the delay between actions has not expired
        else:
            log('min_delay_after_sigterm IS EXCEEDED, it is time to action')

    # The bare string below is an inert design note left by the author (it
    # is an expression statement, not a docstring). It says: on entry check
    # the right to send SIGTERM; SIGKILL is always allowed because execution
    # continues only after the victim's memory has been fully released.
    """

    При заходе в фцию проверяем права на сигтерм. Права на сигкилл у нас
    всегда есть
    (потому что идем дальше только после полн освободж памяти после
    смерти жертвы)

    actions_time_dict[action_handled] = time()
    actions_time_dict[veto] = True

    actions_time_dict['action_handled'] = [time(), victim_id]


    """

    for i in mem_info_list:
        log(i)

    # find the victim and its badness
    pid, victim_badness, name, victim_id = find_victim(print_proc_table)
    # sleep(0.1)

    log('Recheck memory levels...')

    # re-check the thresholds: they may have changed during the victim search
    (masf_threshold, masf_info, mem_available, swap_min_sigkill_kb,
     swap_min_sigterm_kb, swap_free, swap_total) = check_mem_swap_ex()

    if CHECK_ZRAM:
        zram_threshold, zram_info, mem_used_zram = check_zram_ex()

    if CHECK_PSI:
        (psi_threshold, psi_info, psi_t0, psi_kill_exceeded_timer,
         psi_term_exceeded_timer, x0) = check_psi_ex(
            psi_t0, psi_kill_exceeded_timer, psi_term_exceeded_timer, x0)

    # The strongest verdict among the three checks wins; mem_info_list is
    # rebuilt from every check that reported SIGTERM or SIGKILL.
    if masf_threshold is SIGKILL or zram_threshold is SIGKILL or psi_threshold is SIGKILL:

        new_threshold = SIGKILL
        mem_info_list = []

        if masf_threshold is SIGKILL or masf_threshold is SIGTERM:
            mem_info_list.append(masf_info)

        if zram_threshold is SIGKILL or zram_threshold is SIGTERM:
            mem_info_list.append(zram_info)

        if psi_threshold is SIGKILL or psi_threshold is SIGTERM:
            mem_info_list.append(psi_info)

    elif masf_threshold is SIGTERM or zram_threshold is SIGTERM or psi_threshold is SIGTERM:

        new_threshold = SIGTERM
        mem_info_list = []

        if masf_threshold is SIGKILL or masf_threshold is SIGTERM:
            mem_info_list.append(masf_info)

        if zram_threshold is SIGKILL or zram_threshold is SIGTERM:
            mem_info_list.append(zram_info)

        if psi_threshold is SIGKILL or psi_threshold is SIGTERM:
            mem_info_list.append(psi_info)

    else:
        log('Thresholds is not exceeded now')
        return psi_t0

    # print the thresholds
    for i in mem_info_list:
        log(i)

    # this may be redundant
    if new_threshold is None or new_threshold == 'WARN':
        log('Thresholds is not exceeded now')
        return psi_t0

    threshold = new_threshold

    if victim_badness >= min_badness:

        psi_t0 = time()  # not a great idea

        if print_victim_info:
            victim_info = find_victim_info(pid, victim_badness, name)
            log(victim_info)

        # kill the victim if it doesn't respond to SIGTERM WITHIN THE
        # CONFIGURED TIME

        # override the signal for old victims
        if threshold is SIGTERM:

            if victim_id in victim_dict:

                dt = time() - victim_dict[victim_id]

                if dt > max_post_sigterm_victim_lifetime:
                    print('max_post_sigterm_victim_lifetime exceeded: the '
                          'victim will get SIGKILL')
                    threshold = SIGKILL

        # matching with re to customize corrective actions
        soft_match = False

        if soft_actions and threshold is SIGTERM:
            name = pid_to_name(pid)
            cgroup_v1 = pid_to_cgroup_v1(pid)
            service = ''
            cgroup_v1_tail = cgroup_v1.rpartition('/')[2]
            if cgroup_v1_tail.endswith('.service'):
                service = cgroup_v1_tail
            # each entry is a (unit, regexp, command) tuple; the first
            # matching entry is used
            for i in soft_actions_list:
                unit = i[0]
                if unit == 'name':
                    u = name
                else:
                    u = cgroup_v1
                regexp = i[1]
                command = i[2]
                if search(regexp, u) is not None:
                    log("Regexp '{}' matches with {} '{}'".format(
                        regexp, unit, u))
                    soft_match = True
                    break

        if soft_match:  # OVERRIDE OF THE SOFT CORRECTIVE ACTION

            # todo: make new func
            m = check_mem_and_swap()
            ma = int(m[0]) / 1024.0
            sf = int(m[2]) / 1024.0
            log('Memory status before implementing a corrective act'
                'ion:\n MemAvailable'
                ': {} MiB, SwapFree: {} MiB'.format(
                    round(ma, 1), round(sf, 1)
                )
                )

            cmd = command.replace(
                '$PID',
                pid).replace(
                '$NAME',
                pid_to_name(pid)).replace(
                '$SERVICE',
                service)

            exit_status = exe(cmd)

            exit_status = str(exit_status)

            response_time = time() - time0

            # TODO: as with the default action, the victim's existence, its
            # reaction to the action and the time of its death on success
            # should be checked here, and the action timestamps updated.

            etc_info = 'Implement a corrective act' \
                'ion:\n Run the command: {}' \
                '\n Exit status: {}; total response ' \
                'time: {} ms'.format(
                    cmd,
                    exit_status,
                    round(response_time * 1000))

            log(etc_info)

            key = "Run the command '{}'".format(cmd)
            update_stat_dict_and_print(key)

            if gui_notifications:
                send_notify_etc(
                    pid,
                    name,
                    command.replace('$PID', pid).replace(
                        '$NAME', pid_to_name(pid)))

        else:

            # ordinary action: send a signal

            # TODO: rework this part and get rid of the polling loop.

            try:  # TODO: the try block should contain only kill(); the rest belongs below, outside of it

                os.kill(int(pid), threshold)

                a_dict[threshold] = time()
                v_dict[victim_id] = time()

                kill_timestamp = time()
                response_time = kill_timestamp - time0

                # Poll the victim every 2 ms until is_victim_alive()
                # returns 2 or 0.02 sec have passed.
                while True:
                    victim_alive = is_victim_alive(pid)
                    dt = time() - kill_timestamp
                    if victim_alive == 2 or dt > 0.02:
                        # print(dt)
                        break
                    sleep(0.002)

                if dt > 0.02:
                    log('Timer (value = 0.02 sec) expired; victim does not respond on action in 0.02 sec')

                    actions_time_dict['action_handled'] = [
                        time(), get_victim_id(pid)]

                    if victim_id not in victim_dict:  # not sure what is right here
                        victim_dict.update({victim_id: time()})

                    # log('actions_time_dict', actions_time_dict)
                    # log('victim_dict', victim_dict)

                else:
                    log('Process exited (VmRSS = 0) in {} sec'.format(
                        round(dt, 5)))

                if threshold is SIGKILL or victim_alive == 2:
                    # the victim is dying from SIGKILL; wait until it is
                    # completely gone

                    while True:
                        sleep(0.002)
                        rss = pid_to_rss(pid)
                        if rss is None:  # the process has disappeared
                            break
                    t1 = time()
                    kill_duration = t1 - kill_timestamp
                    log('The victim died in {} sec'.format(
                        round(kill_duration, 3)))

                mem_available, swap_total, swap_free = check_mem_and_swap()

                ma_mib = int(mem_available) / 1024.0
                sf_mib = int(swap_free) / 1024.0
                log('Memory status after implementing a corrective act'
                    'ion:\n MemAvailable'
                    ': {} MiB, SwapFree: {} MiB'.format(
                        round(ma_mib, 1), round(sf_mib, 1)
                    )
                    )

                send_result = 'total response time: {} ms'.format(
                    round(response_time * 1000))

                preventing_oom_message = 'Implement a corrective action:' \
                    '\n Send {} to the victim; {}'.format(
                        sig_dict[threshold], send_result)

                key = 'Send {} to {}'.format(sig_dict[threshold], name)

                if threshold is SIGKILL and post_kill_exe != '':

                    cmd = post_kill_exe.replace('$PID', pid).replace(
                        '$NAME', pid_to_name(pid))

                    log('Execute post_kill_exe')

                    exe(cmd)

                if gui_notifications:
                    send_notify(threshold, name, pid)

            except FileNotFoundError:
                response_time = time() - time0
                send_result = 'no such process; response time: {} ms'.format(
                    round(response_time * 1000))
                key = 'The victim died in the search process: FileNotFoundError'
            except ProcessLookupError:
                response_time = time() - time0
                send_result = 'no such process; response time: {} ms'.format(
                    round(response_time * 1000))
                key = 'The victim died in the search process: ProcessLookupError'

            # preventing_oom_message is unbound when one of the handlers
            # above ran; in that case nothing is logged here.
            try:
                log(preventing_oom_message)

            except UnboundLocalError:  # what a shame
                preventing_oom_message = key

            update_stat_dict_and_print(key)

    # nothing to do: the victim's badness is too small
    else:

        # maybe move this part to the top with an if

        response_time = time() - time0
        victim_badness_is_too_small = 'victim badness {} < min_b' \
            'adness {}; nothing to do; response time: {} ms'.format(
                victim_badness,
                min_badness,
                round(response_time * 1000))

        log(victim_badness_is_too_small)

        # update stat_dict
        key = 'victim badness < min_badness'
        update_stat_dict_and_print(key)

        # Author's note: a long sleep would fit here, and maybe the counters
        # should be adjusted. On the other hand, a process with a bigger
        # badness may suddenly appear. Output spam should also be minimised.
        sleep(over_sleep)

    # TODO: update the time not on every kill but only on the kill of a
    # victim that did not respond to the soft action. Conclusion: the victim
    # id should be attached to the action time.

    print('##################################################################')

    sleep(over_sleep)  # sleep if the victim's badness is small

    # What to do with psi_t0 if the victim's badness is small? Nothing,
    # because no corrective action took place. The daemon may use 10% CPU
    # in that case; a separate parameter could be introduced for it.

    return psi_t0
|
||
|
||
|
||
def sleep_after_check_mem():
    """Sleep for a period that depends on the rates and available memory.

    With stable_sleep set the pause is always min_sleep. Otherwise the
    distance from the current levels (mem_available, swap_free and, with
    CHECK_ZRAM, mem_used_zram) to the nearest threshold is divided by the
    matching rate_* value, and the result is clamped to
    [min_sleep, max_sleep].

    Returns None.
    """

    if stable_sleep:

        if print_sleep_periods:
            log('Sleep {} sec'.format(min_sleep))

        # Flushing may fail with OSError; that must not stop the daemon.
        # The other exit path of this function guards the call the same way.
        try:
            stdout.flush()
        except OSError:
            pass

        sleep(min_sleep)
        return None

    # Distance to whichever threshold (SIGTERM or SIGKILL) is higher,
    # never negative.
    mem_point = mem_available - max(mem_min_sigkill_kb, mem_min_sigterm_kb)
    if mem_point < 0:
        mem_point = 0

    swap_point = swap_free - max(swap_min_sigkill_kb, swap_min_sigterm_kb)
    if swap_point < 0:
        swap_point = 0

    t_mem = mem_point / rate_mem
    t_swap = swap_point / rate_swap
    t_mem_swap = t_mem + t_swap

    if CHECK_ZRAM:
        t_zram = (mem_total * 0.8 - mem_used_zram) / rate_zram
        if t_zram < 0:
            t_zram = 0
        z = ', t_zram={}'.format(round(t_zram, 2))
        # the shorter of the two estimates wins
        t = min(t_mem_swap, t_mem + t_zram)
    else:
        z = ''
        t = t_mem_swap

    if t > max_sleep:
        t = max_sleep
    elif t < min_sleep:
        t = min_sleep

    if print_sleep_periods:
        log(
            'Sleep {} sec (t_mem={}, t_swap={}{})'.format(
                round(t, 2), round(t_mem, 2), round(t_swap, 2), z)
        )

    try:
        stdout.flush()
    except OSError:
        pass

    sleep(t)
|
||
|
||
|
||
def calculate_percent(arg_key):
    """Parse a memory threshold from config_dict.

    The value must end with '%' (percentage of mem_total, range [0; 100])
    or with 'M' (absolute value in MiB, from 0 up to mem_total).

    arg_key: str key for config_dict
    returns tuple (mem_min_kb, mem_min_mb, mem_min_percent)

    The process exits with status 1 if the key is missing or its value is
    invalid.
    """

    if arg_key not in config_dict:
        log('{} not in config\nExit'.format(arg_key))
        exit(1)

    mem_min = config_dict[arg_key]

    if mem_min.endswith('%'):
        # drop the percent sign, then run the 'float test'
        mem_min_percent = string_to_float_convert_test(mem_min[:-1].strip())
        if mem_min_percent is None:
            errprint('Invalid {} value, not float\nExit'.format(arg_key))
            exit(1)

        if mem_min_percent < 0 or mem_min_percent > 100:
            errprint(
                '{}, as percents value, out of ran'
                'ge [0; 100]\nExit'.format(arg_key))
            exit(1)

        # the percentage is valid and can be translated into KiB
        mem_min_kb = mem_min_percent / 100 * mem_total
        mem_min_mb = round(mem_min_kb / 1024)

    elif mem_min.endswith('M'):
        mem_min_mb = string_to_float_convert_test(mem_min[:-1].strip())
        if mem_min_mb is None:
            errprint('Invalid {} value, not float\nExit'.format(arg_key))
            exit(1)

        mem_min_kb = mem_min_mb * 1024

        # Reject negative values, as the '%' branch above and
        # get_swap_threshold_tuple() do.
        if mem_min_kb < 0:
            errprint('{} value can not be negative\nExit'.format(arg_key))
            exit(1)

        if mem_min_kb > mem_total:
            errprint(
                '{} value can not be greater then MemT'
                'otal ({} MiB)\nExit'.format(
                    arg_key, round(
                        mem_total / 1024)))
            exit(1)

        mem_min_percent = mem_min_kb / mem_total * 100

    else:
        log('Invalid {} units in config.\n Exit'.format(arg_key))
        exit(1)

    return mem_min_kb, mem_min_mb, mem_min_percent
|
||
|
||
|
||
##########################################################################
|
||
|
||
|
||
# victim_id -> timestamp; filled in by implement_corrective_action()
victim_dict = {}
victim_id = None

# 'action_handled' -> [timestamp of the last handled action, victim_id]
actions_time_dict = {'action_handled': [time(), victim_id]}
# print(actions_time_dict)


# (victim_id : {SIGKILL: ts, SIGTERM: ts}}
v_dict = {}


# {SIGTERM: timestamp, SIGKILL: timestamp, 'last_action_ts': ts}
# All three keys start with the same timestamp.
a_dict = dict.fromkeys(('last_action_ts', SIGTERM, SIGKILL), time())


# print(a_dict)


start_time = time()
|
||
|
||
|
||
help_mess = """usage: nohang [-h] [-v] [-t] [-p] [-c CONFIG]
|
||
|
||
optional arguments:
|
||
-h, --help show this help message and exit
|
||
-v, --version print version
|
||
-p, --print-proc-table
|
||
print table of processes with their badness values
|
||
-c CONFIG, --config CONFIG
|
||
path to the config file, default values:
|
||
./nohang.conf, /etc/nohang/nohang.conf"""
|
||
|
||
|
||
# System configuration values (os.sysconf accepts the name directly)
SC_CLK_TCK = os.sysconf('SC_CLK_TCK')

SC_PAGESIZE = os.sysconf('SC_PAGESIZE')

conf_err_mess = 'Invalid config. Exit.'

sig_list = [SIGTERM, SIGINT, SIGQUIT, SIGHUP]

# signal -> printable signal name
sig_dict = {
    sig: sig.name for sig in (SIGKILL, SIGINT, SIGQUIT, SIGHUP, SIGTERM)
}

self_pid = str(os.getpid())

self_uid = os.geteuid()

root = self_uid == 0


# prefer the helper from the current directory over the installed one
notify_helper_path = (
    './nohang_notify_helper'
    if os.path.exists('./nohang_notify_helper')
    else '/usr/sbin/nohang_notify_helper')


# will store corrective actions stat
stat_dict = {}


separate_log = False  # will be overwritten after parse config
|
||
|
||
|
||
cgroup_v1_index, cgroup_v2_index = find_cgroup_indexes()


# oom_score_adj values for the daemon itself; the minimum is applied now
self_oom_score_adj_min, self_oom_score_adj_max = '-600', '-6'


write_self_oom_score_adj(self_oom_score_adj_min)


pid_list = get_pid_list()


print_proc_table_flag = False

# Command line handling. Without an explicit --config the file from the
# current directory is preferred over /etc/nohang/nohang.conf.
if len(argv) == 1:
    config = (os.getcwd() + '/nohang.conf'
              if os.path.exists('./nohang.conf')
              else '/etc/nohang/nohang.conf')

elif len(argv) == 2:
    if argv[1] in ('--help', '-h'):
        print(help_mess)
        exit()
    elif argv[1] in ('--version', '-v'):
        print_version()
    elif argv[1] in ('--print-proc-table', '-p'):
        print_proc_table_flag = True
        config = (os.getcwd() + '/nohang.conf'
                  if os.path.exists('./nohang.conf')
                  else '/etc/nohang/nohang.conf')
    else:
        errprint('Unknown option: {}'.format(argv[1]))
        exit(1)

elif len(argv) == 3:
    if argv[1] in ('--config', '-c'):
        config = argv[2]
    else:
        errprint('Unknown option: {}'.format(argv[1]))
        exit(1)

else:
    errprint('Invalid CLI input: too many options')
    exit(1)
|
||
|
||
|
||
# find mem_total
# find positions of SwapFree and SwapTotal in /proc/meminfo

with open('/proc/meminfo') as f:
    mem_list = f.readlines()

mem_list_names = []
for s in mem_list:
    mem_list_names.append(s.split(':')[0])

if mem_list_names[2] != 'MemAvailable':
    errprint('WARNING: Your Linux kernel is too old, Linux 3.14+ required')
    # exit(1)

# Both positions are looked up by name instead of assuming that SwapFree
# directly follows SwapTotal.
swap_total_index = mem_list_names.index('SwapTotal')
swap_free_index = mem_list_names.index('SwapFree')

# First line looks like 'MemTotal:  <number> kB'; take the number itself
# rather than cutting a fixed-width suffix.
mem_total = int(mem_list[0].split(':')[1].split()[0])

# Get names from /proc/*/status to be able to get VmRSS and VmSwap values

with open('/proc/self/status') as file:
    status_list = file.readlines()

status_names = []
for s in status_list:
    status_names.append(s.split(':')[0])

ppid_index = status_names.index('PPid')
vm_size_index = status_names.index('VmSize')
vm_rss_index = status_names.index('VmRSS')
vm_swap_index = status_names.index('VmSwap')
uid_index = status_names.index('Uid')
state_index = status_names.index('State')


# The RssAnon/RssFile/RssShmem fields are optional: when any of them is
# missing from /proc/self/status the detailed RSS report is disabled.
try:
    anon_index = status_names.index('RssAnon')
    file_index = status_names.index('RssFile')
    shmem_index = status_names.index('RssShmem')
    detailed_rss = True
    # print(detailed_rss, 'detailed_rss')
except ValueError:
    detailed_rss = False
    # print('It is not Linux 4.5+')


log('Config: ' + config)
|
||
|
||
|
||
##########################################################################
|
||
|
||
# parsing the config with obtaining the parameters dictionary

# conf_parameters_dict
# conf_restart_dict

# NOTE(review): block nesting below was reconstructed from a listing that
# had lost its indentation - confirm against the original file.

# dictionary with config options
config_dict = dict()

# Lists of (badness_adj, reg_exp) string pairs, one list per matched
# process attribute; filled from the @BADNESS_ADJ_RE_* lines.
badness_adj_re_name_list = []
badness_adj_re_cmdline_list = []
badness_adj_re_environ_list = []
badness_adj_re_uid_list = []
badness_adj_re_cgroup_v1_list = []
badness_adj_re_cgroup_v2_list = []
badness_adj_re_realpath_list = []

# List of (unit, reg_exp, command) tuples, unit being 'name' or
# 'cgroup_v1'; filled from the @SOFT_ACTION_RE_* lines.
soft_actions_list = []

# separator for optional parameters (that starts with @)
opt_separator = '///'

# stupid conf parsing, need refactoring
try:
    with open(config) as f:

        for line in f:

            # Lines starting with '#', a newline, a tab or a space are
            # skipped by the key=value branch below.
            a = line.startswith('#')
            b = line.startswith('\n')
            c = line.startswith('\t')
            d = line.startswith(' ')

            etc = line.startswith('@SOFT_ACTION_RE_NAME')
            etc2 = line.startswith('@SOFT_ACTION_RE_CGROUP_V1')

            # Plain 'key = value' line. Note that @BADNESS_ADJ_RE_* lines
            # are not excluded by this condition, so they are stored in
            # config_dict as well (and are subject to the duplication
            # check) before being handled further below.
            if not a and not b and not c and not d and not etc and not etc2:
                a = line.partition('=')

                key = a[0].strip()
                value = a[2].strip()

                if key not in config_dict:
                    config_dict[key] = value
                else:
                    log('ERROR: config key duplication: {}'.format(key))
                    exit(1)

            # '@SOFT_ACTION_RE_NAME <regexp> /// <command>'
            if etc:

                a = line.partition('@SOFT_ACTION_RE_NAME')[
                    2].partition(opt_separator)

                a1 = 'name'

                a2 = a[0].strip()
                valid_re(a2)

                a3 = a[2].strip()

                zzz = (a1, a2, a3)

                soft_actions_list.append(zzz)

            # '@SOFT_ACTION_RE_CGROUP_V1 <regexp> /// <command>'
            if etc2:

                a = line.partition('@SOFT_ACTION_RE_CGROUP_V1')[
                    2].partition(opt_separator)

                a1 = 'cgroup_v1'

                a2 = a[0].strip()
                valid_re(a2)

                a3 = a[2].strip()

                zzz = (a1, a2, a3)

                soft_actions_list.append(zzz)

            # '@BADNESS_ADJ_RE_<KIND> <badness_adj> /// <regexp>' lines:
            # the regexp is checked with valid_re() and the pair is stored
            # as strings in the list for that kind.
            if line.startswith('@BADNESS_ADJ_RE_NAME'):
                a = line.partition('@BADNESS_ADJ_RE_NAME')[2].strip(
                    ' \n').partition(opt_separator)
                badness_adj = a[0].strip(' ')
                reg_exp = a[2].strip(' ')
                valid_re(reg_exp)
                badness_adj_re_name_list.append((badness_adj, reg_exp))

            if line.startswith('@BADNESS_ADJ_RE_CMDLINE'):
                a = line.partition('@BADNESS_ADJ_RE_CMDLINE')[2].strip(
                    ' \n').partition(opt_separator)
                badness_adj = a[0].strip(' ')
                reg_exp = a[2].strip(' ')
                valid_re(reg_exp)
                badness_adj_re_cmdline_list.append((badness_adj, reg_exp))

            if line.startswith('@BADNESS_ADJ_RE_UID'):
                a = line.partition('@BADNESS_ADJ_RE_UID')[2].strip(
                    ' \n').partition(opt_separator)
                badness_adj = a[0].strip(' ')
                reg_exp = a[2].strip(' ')
                valid_re(reg_exp)
                badness_adj_re_uid_list.append((badness_adj, reg_exp))

            if line.startswith('@BADNESS_ADJ_RE_CGROUP_V1'):
                a = line.partition('@BADNESS_ADJ_RE_CGROUP_V1')[2].strip(
                    ' \n').partition(opt_separator)
                badness_adj = a[0].strip(' ')
                reg_exp = a[2].strip(' ')
                valid_re(reg_exp)
                badness_adj_re_cgroup_v1_list.append((badness_adj, reg_exp))

            if line.startswith('@BADNESS_ADJ_RE_CGROUP_V2'):
                a = line.partition('@BADNESS_ADJ_RE_CGROUP_V2')[2].strip(
                    ' \n').partition(opt_separator)
                badness_adj = a[0].strip(' ')
                reg_exp = a[2].strip(' ')
                valid_re(reg_exp)
                badness_adj_re_cgroup_v2_list.append((badness_adj, reg_exp))

            if line.startswith('@BADNESS_ADJ_RE_REALPATH'):
                a = line.partition('@BADNESS_ADJ_RE_REALPATH')[2].strip(
                    ' \n').partition(opt_separator)
                badness_adj = a[0].strip(' ')
                reg_exp = a[2].strip(' ')
                valid_re(reg_exp)
                badness_adj_re_realpath_list.append((badness_adj, reg_exp))

            if line.startswith('@BADNESS_ADJ_RE_ENVIRON'):
                a = line.partition('@BADNESS_ADJ_RE_ENVIRON')[2].strip(
                    ' \n').partition(opt_separator)
                badness_adj = a[0].strip(' ')
                reg_exp = a[2].strip(' ')
                valid_re(reg_exp)
                badness_adj_re_environ_list.append((badness_adj, reg_exp))


# Any failure to read the config is fatal: report it and exit with status 1.
except PermissionError:
    errprint('PermissionError', conf_err_mess)
    exit(1)
except UnicodeDecodeError:
    errprint('UnicodeDecodeError', conf_err_mess)
    exit(1)
except IsADirectoryError:
    errprint('IsADirectoryError', conf_err_mess)
    exit(1)
except IndexError:
    errprint('IndexError', conf_err_mess)
    exit(1)
except FileNotFoundError:
    errprint('FileNotFoundError', conf_err_mess)
    exit(1)
|
||
|
||
|
||
# Each flag is True when at least one rule of that kind was read from the
# config.
regex_matching = badness_adj_re_name_list != []


re_match_cmdline = badness_adj_re_cmdline_list != []


re_match_uid = badness_adj_re_uid_list != []


re_match_environ = badness_adj_re_environ_list != []


re_match_realpath = badness_adj_re_realpath_list != []


re_match_cgroup_v1 = badness_adj_re_cgroup_v1_list != []

re_match_cgroup_v2 = badness_adj_re_cgroup_v2_list != []


# print(badness_adj_re_name_list)
# print(badness_adj_re_cmdline_list)
# print(badness_adj_re_uid_list)
# print(badness_adj_re_environ_list)
# print(badness_adj_re_realpath_list)
# print(badness_adj_re_cgroup_v1_list)
# print(badness_adj_re_cgroup_v2_list)

# print(soft_actions_list)

soft_actions = soft_actions_list != []

# print('soft_actions:', soft_actions)
|
||
|
||
##########################################################################
|
||
|
||
|
||
# extracting parameters from the dictionary
|
||
# check for all necessary parameters
|
||
# validation of all parameters
|
||
# Boolean options. Each one is read with conf_parse_bool() using the option
# name as the key; the call order is kept as in the original script.
psi_debug = conf_parse_bool('psi_debug')
print_total_stat = conf_parse_bool('print_total_stat')
print_proc_table = conf_parse_bool('print_proc_table')
forbid_negative_badness = conf_parse_bool('forbid_negative_badness')
print_victim_info = conf_parse_bool('print_victim_info')
print_victim_cmdline = conf_parse_bool('print_victim_cmdline')
print_config = conf_parse_bool('print_config')
print_mem_check_results = conf_parse_bool('print_mem_check_results')
print_sleep_periods = conf_parse_bool('print_sleep_periods')
gui_low_memory_warnings = conf_parse_bool('gui_low_memory_warnings')
gui_notifications = conf_parse_bool('gui_notifications')
decrease_oom_score_adj = conf_parse_bool('decrease_oom_score_adj')
ignore_psi = conf_parse_bool('ignore_psi')
ignore_zram = conf_parse_bool('ignore_zram')
|
||
|
||
|
||
# Memory and zram thresholds. calculate_percent() returns a 3-tuple that is
# unpacked into *_kb, *_mb and *_percent names.
# NOTE(review): units are inferred from the variable names only -- confirm
# against calculate_percent().
mem_min_sigterm_kb, mem_min_sigterm_mb, mem_min_sigterm_percent = (
    calculate_percent('mem_min_sigterm'))

mem_min_sigkill_kb, mem_min_sigkill_mb, mem_min_sigkill_percent = (
    calculate_percent('mem_min_sigkill'))

zram_max_sigterm_kb, zram_max_sigterm_mb, zram_max_sigterm_percent = (
    calculate_percent('zram_max_sigterm'))

zram_max_sigkill_kb, zram_max_sigkill_mb, zram_max_sigkill_percent = (
    calculate_percent('zram_max_sigkill'))

mem_min_warnings_kb, mem_min_warnings_mb, mem_min_warnings_percent = (
    calculate_percent('mem_min_warnings'))

zram_max_warnings_kb, zram_max_warnings_mb, zram_max_warnings_percent = (
    calculate_percent('zram_max_warnings'))
|
||
|
||
|
||
def _parse_rate(name):
    """Return config option *name* as a float greater than zero.

    Prints an error and exits with status 1 when the option is missing from
    config_dict, when string_to_float_convert_test() returns None for it,
    or when the value is not strictly positive.
    """
    if name not in config_dict:
        errprint('{} not in config\nExit'.format(name))
        exit(1)

    value = string_to_float_convert_test(config_dict[name])
    if value is None:
        errprint('Invalid {} value, not float\nExit'.format(name))
        exit(1)

    if value <= 0:
        errprint('{} MUST be > 0\nExit'.format(name))
        exit(1)

    return value


rate_mem = _parse_rate('rate_mem')
rate_swap = _parse_rate('rate_swap')
rate_zram = _parse_rate('rate_zram')
|
||
|
||
|
||
# The swap thresholds are kept as raw config strings here; they are only
# checked for presence at this point.
if 'swap_min_sigterm' not in config_dict:
    errprint('swap_min_sigterm not in config\nExit')
    exit(1)
swap_min_sigterm = config_dict['swap_min_sigterm']

if 'swap_min_sigkill' not in config_dict:
    errprint('swap_min_sigkill not in config\nExit')
    exit(1)
swap_min_sigkill = config_dict['swap_min_sigkill']
|
||
|
||
|
||
# min_delay_after_sigterm: required, must convert to float and be >= 0.
if 'min_delay_after_sigterm' not in config_dict:
    errprint('min_delay_after_sigterm not in config\nExit')
    exit(1)

min_delay_after_sigterm = string_to_float_convert_test(
    config_dict['min_delay_after_sigterm'])

if min_delay_after_sigterm is None:
    errprint('Invalid min_delay_after_sigterm value, not float\nExit')
    exit(1)

# Zero is accepted, so the message says "non-negative" rather than
# "positive" (the original text was also misspelled).
if min_delay_after_sigterm < 0:
    errprint('min_delay_after_sigterm must be non-negative number\nExit')
    exit(1)
|
||
|
||
|
||
# psi_post_action_delay: required, must convert to float; negative values
# are rejected.
if 'psi_post_action_delay' not in config_dict:
    errprint('psi_post_action_delay not in config\nExit')
    exit(1)

psi_post_action_delay = string_to_float_convert_test(
    config_dict['psi_post_action_delay'])

if psi_post_action_delay is None:
    errprint('Invalid psi_post_action_delay value, not float\nExit')
    exit(1)

if psi_post_action_delay < 0:
    errprint('psi_post_action_delay must be positive\nExit')
    exit(1)
|
||
|
||
|
||
def _parse_psi_percent(name):
    """Return config option *name* as a float accepted for [0; 100].

    Prints an error and exits with status 1 when the option is missing from
    config_dict, when string_to_float_convert_test() returns None for it,
    or when the value is below 0 or above 100.
    """
    if name not in config_dict:
        errprint('{} not in config\nExit'.format(name))
        exit(1)

    value = string_to_float_convert_test(config_dict[name])
    if value is None:
        errprint('Invalid {} value, not float\nExit'.format(name))
        exit(1)

    if value < 0 or value > 100:
        errprint('{} must be in the range [0; 100]\nExit'.format(name))
        exit(1)

    return value


sigkill_psi_threshold = _parse_psi_percent('sigkill_psi_threshold')
sigterm_psi_threshold = _parse_psi_percent('sigterm_psi_threshold')
psi_avg_warnings = _parse_psi_percent('psi_avg_warnings')
|
||
|
||
|
||
# min_badness: required, must convert to int and lie in [0; 1000].
if 'min_badness' not in config_dict:
    errprint('min_badness not in config\nExit')
    exit(1)

min_badness = string_to_int_convert_test(config_dict['min_badness'])

if min_badness is None:
    errprint('Invalid min_badness value, not integer\nExit')
    exit(1)

if min_badness < 0 or min_badness > 1000:
    # Message spelling fixed ("Invalud" -> "Invalid").
    errprint('Invalid min_badness value\nExit')
    exit(1)
|
||
|
||
|
||
# oom_score_adj_max: required, must convert to int and lie in [0; 1000].
if 'oom_score_adj_max' not in config_dict:
    errprint('oom_score_adj_max not in config\nExit')
    exit(1)

oom_score_adj_max = string_to_int_convert_test(
    config_dict['oom_score_adj_max'])

if oom_score_adj_max is None:
    errprint('Invalid oom_score_adj_max value, not integer\nExit')
    exit(1)

if oom_score_adj_max < 0 or oom_score_adj_max > 1000:
    errprint('Invalid oom_score_adj_max value\nExit')
    exit(1)
|
||
|
||
|
||
# min_time_between_warnings: required, must convert to float; values below
# 1 or above 300 are rejected.
if 'min_time_between_warnings' not in config_dict:
    errprint('min_time_between_warnings not in config\nExit')
    exit(1)

min_time_between_warnings = string_to_float_convert_test(
    config_dict['min_time_between_warnings'])

if min_time_between_warnings is None:
    errprint('Invalid min_time_between_warnings value, not float\nExit')
    exit(1)

if min_time_between_warnings < 1 or min_time_between_warnings > 300:
    errprint('min_time_between_warnings value out of range '
             '[1; 300]\nExit')
    exit(1)
|
||
|
||
|
||
# swap_min_warnings is kept as the raw config string; only its presence is
# checked here.
if 'swap_min_warnings' not in config_dict:
    errprint('swap_min_warnings not in config\nExit')
    exit(1)
swap_min_warnings = config_dict['swap_min_warnings']
|
||
|
||
|
||
# max_ancestry_depth: required, must convert to int and be >= 1.
if 'max_ancestry_depth' not in config_dict:
    errprint('max_ancestry_depth is not in config\nExit')
    exit(1)

max_ancestry_depth = string_to_int_convert_test(
    config_dict['max_ancestry_depth'])

# The original tested min_badness here, so a non-integer value slipped
# through and failed later on the `< 1` comparison with None.
if max_ancestry_depth is None:
    errprint('Invalid max_ancestry_depth value, not integer\nExit')
    exit(1)

if max_ancestry_depth < 1:
    # Message spelling fixed ("Invalud" -> "Invalid").
    errprint('Invalid max_ancestry_depth value\nExit')
    exit(1)
|
||
|
||
|
||
# max_post_sigterm_victim_lifetime: required, must convert to float;
# negative values are rejected.
if 'max_post_sigterm_victim_lifetime' not in config_dict:
    errprint('max_post_sigterm_victim_lifetime is not in config\nExit')
    exit(1)

max_post_sigterm_victim_lifetime = string_to_float_convert_test(
    config_dict['max_post_sigterm_victim_lifetime'])

if max_post_sigterm_victim_lifetime is None:
    errprint('Invalid max_post_sigterm_victim_lifetime '
             'value, not float\nExit')
    exit(1)

if max_post_sigterm_victim_lifetime < 0:
    errprint('max_post_sigterm_victim_lifetime must be '
             'non-negative number\nExit')
    exit(1)
|
||
|
||
|
||
# Plain string options: taken from the config verbatim, presence is the only
# requirement checked here.
if 'post_kill_exe' not in config_dict:
    errprint('post_kill_exe is not in config\nExit')
    exit(1)
post_kill_exe = config_dict['post_kill_exe']

if 'psi_path' not in config_dict:
    errprint('psi_path is not in config\nExit')
    exit(1)
psi_path = config_dict['psi_path']

if 'psi_metrics' not in config_dict:
    errprint('psi_metrics is not in config\nExit')
    exit(1)
psi_metrics = config_dict['psi_metrics']
|
||
|
||
|
||
# warning_exe is required; check_warning_exe is True when it is a non-empty
# string.
if 'warning_exe' not in config_dict:
    errprint('warning_exe is not in config\nExit')
    exit(1)

warning_exe = config_dict['warning_exe']
check_warning_exe = warning_exe != ''
|
||
|
||
|
||
# extra_table_info is required and must be one of a fixed set of keywords.
if 'extra_table_info' not in config_dict:
    errprint('Invalid config: extra_table_info is not in config\nExit')
    exit(1)

extra_table_info = config_dict['extra_table_info']

if extra_table_info not in (
        'None',
        'cgroup_v1',
        'cgroup_v2',
        'cmdline',
        'environ',
        'realpath',
        'All'):
    errprint('Invalid config: invalid extra_table_info value\nExit')
    exit(1)
|
||
|
||
|
||
separate_log = conf_parse_bool('separate_log')

if separate_log:

    # logging is imported only when a separate log file is requested.
    import logging
    from logging import basicConfig, info

    log_dir = '/var/log/nohang'
    logfile = log_dir + '/nohang.log'

    # Create the log directory; an already existing directory is fine, a
    # permission problem is only reported.
    try:
        os.mkdir(log_dir)
    except FileExistsError:
        pass
    except PermissionError:
        print('ERROR: can not create log dir')

    # Try to open the log file for appending; failures are only reported.
    try:
        with open(logfile, 'a') as f:
            pass
    except PermissionError:
        print('ERROR: log PermissionError')
    except FileNotFoundError:
        print('ERROR: log FileNotFoundError')

    # Route INFO-level records to the log file with a timestamp prefix.
    try:
        basicConfig(
            format="%(asctime)s: %(message)s",
            level=logging.INFO,
            filename=logfile)
    except FileNotFoundError:
        errprint('ERROR: FileNotFoundError: {}'.format(logfile))
    except PermissionError:
        errprint('ERROR: Permission denied: {}'.format(logfile))
|
||
|
||
|
||
def _parse_non_negative_float(name):
    """Return config option *name* as a float; negative values are rejected.

    Prints an error and exits with status 1 when the option is missing from
    config_dict, when string_to_float_convert_test() returns None for it,
    or when the value is below zero.
    """
    if name not in config_dict:
        errprint('{} is not in config\nExit'.format(name))
        exit(1)

    value = string_to_float_convert_test(config_dict[name])
    if value is None:
        errprint('Invalid {} value, not float\nExit'.format(name))
        exit(1)

    if value < 0:
        errprint('{} must be non-negative number\nExit'.format(name))
        exit(1)

    return value


min_mem_report_interval = _parse_non_negative_float(
    'min_mem_report_interval')
psi_excess_duration = _parse_non_negative_float('psi_excess_duration')
|
||
|
||
|
||
def _parse_positive_float(name):
    """Return config option *name* as a float greater than zero.

    Prints an error and exits with status 1 when the option is missing from
    config_dict, when string_to_float_convert_test() returns None for it,
    or when the value is zero or negative.
    """
    if name not in config_dict:
        errprint('{} is not in config\nExit'.format(name))
        exit(1)

    value = string_to_float_convert_test(config_dict[name])
    if value is None:
        errprint('Invalid {} value, not float\nExit'.format(name))
        exit(1)

    if value <= 0:
        errprint('{} must be positive number\nExit'.format(name))
        exit(1)

    return value


max_sleep = _parse_positive_float('max_sleep')
min_sleep = _parse_positive_float('min_sleep')
over_sleep = _parse_positive_float('over_sleep')
|
||
|
||
|
||
# Cross-check the three sleep periods: over_sleep <= min_sleep <= max_sleep.
# The original messages stated each constraint the wrong way round; the
# conditions themselves are unchanged.
if max_sleep < min_sleep:
    errprint(
        'min_sleep value must not exceed max_sleep value.\nExit'
    )
    exit(1)

if min_sleep < over_sleep:
    errprint(
        'over_sleep value must not exceed min_sleep value.\nExit'
    )
    exit(1)

# stable_sleep is True when max_sleep and min_sleep are equal.
stable_sleep = max_sleep == min_sleep
|
||
|
||
|
||
# When the process-table flag is set, warn if not running as root and then
# call func_print_proc_table().
if print_proc_table_flag:

    if not root:
        log(
            'WARNING: effective UID != 0; euid={}; processes with other '
            'euids will be invisible for nohang'.format(self_uid)
        )

    func_print_proc_table()
|
||
|
||
|
||
##########################################################################
|
||
|
||
|
||
# psi_support is True when the configured psi_path exists on this system.
psi_support = os.path.exists(psi_path)
|
||
|
||
|
||
##########################################################################
|
||
|
||
# Get KiB levels if it's possible.
|
||
|
||
# Each tuple is (value, is_percent) as returned by
# get_swap_threshold_tuple(): a percentage when the flag is True, otherwise
# an absolute value in KiB.
swap_min_sigterm_tuple = get_swap_threshold_tuple(swap_min_sigterm)
swap_min_sigkill_tuple = get_swap_threshold_tuple(swap_min_sigkill)
swap_min_warnings_tuple = get_swap_threshold_tuple(swap_min_warnings)

# Absolute (KiB) thresholds are additionally collected in swap_kb_dict.
swap_kb_dict = {}

swap_term_is_percent = swap_min_sigterm_tuple[1]
if not swap_term_is_percent:
    swap_min_sigterm_kb = swap_min_sigterm_tuple[0]
    swap_kb_dict['swap_min_sigterm_kb'] = swap_min_sigterm_kb
else:
    swap_min_sigterm_percent = swap_min_sigterm_tuple[0]

swap_kill_is_percent = swap_min_sigkill_tuple[1]
if not swap_kill_is_percent:
    swap_min_sigkill_kb = swap_min_sigkill_tuple[0]
    swap_kb_dict['swap_min_sigkill_kb'] = swap_min_sigkill_kb
else:
    swap_min_sigkill_percent = swap_min_sigkill_tuple[0]

swap_warn_is_percent = swap_min_warnings_tuple[1]
if not swap_warn_is_percent:
    swap_min_warnings_kb = swap_min_warnings_tuple[0]
    swap_kb_dict['swap_min_warnings_kb'] = swap_min_warnings_kb
else:
    swap_min_warnings_percent = swap_min_warnings_tuple[0]
|
||
|
||
|
||
##########################################################################
|
||
|
||
def _log_badness_adj_section(title, pairs):
    """Log one 'badness_adj by regexp' section of the config dump.

    *title* is the section header; *pairs* is a list of
    (badness_adj, regexp) tuples. An empty list is reported as '(not set)'.
    """
    log(title)
    if len(pairs) > 0:
        log(' regexp: badness_adj:')
        for pair in pairs:
            log(' {} {}'.format(pair[1], pair[0]))
    else:
        log(' (not set)')


# Dump the effective configuration to the log when print_config is set.
if print_config:

    log('#' * 79)

    log('0. Common zram settings')

    log(' ignore_zram: {}'.format(ignore_zram))

    log('1. Thresholds below which a signal should be sent to the victim')

    log(' mem_min_sigterm: {} MiB, {} %'.format(
        round(mem_min_sigterm_mb), round(mem_min_sigterm_percent, 1)))
    log(' mem_min_sigkill: {} MiB, {} %'.format(
        round(mem_min_sigkill_mb), round(mem_min_sigkill_percent, 1)))

    log(' swap_min_sigterm: {}'.format(swap_min_sigterm))
    log(' swap_min_sigkill: {}'.format(swap_min_sigkill))

    log(' zram_max_sigterm: {} MiB, {} %'.format(
        round(zram_max_sigterm_mb), round(zram_max_sigterm_percent, 1)))
    log(' zram_max_sigkill: {} MiB, {} %'.format(
        round(zram_max_sigkill_mb), round(zram_max_sigkill_percent, 1)))

    log('2. Response on PSI memory metrics')

    log(' ignore_psi: {}'.format(ignore_psi))
    log(' psi_path: {}'.format(psi_path))
    log(' psi_metrics: {}'.format(psi_metrics))
    log(' sigterm_psi_threshold: {}'.format(sigterm_psi_threshold))
    log(' sigkill_psi_threshold: {}'.format(sigkill_psi_threshold))
    log(' psi_excess_duration: {} sec'.format(psi_excess_duration))
    log(' psi_post_action_delay: {} sec'.format(psi_post_action_delay))

    log('3. The frequency of checking the amount of available memory')

    log(' rate_mem: {}'.format(rate_mem))
    log(' rate_swap: {}'.format(rate_swap))
    log(' rate_zram: {}'.format(rate_zram))
    log(' max_sleep: {} sec'.format(max_sleep))
    log(' min_sleep: {} sec'.format(min_sleep))
    log(' over_sleep: {} sec'.format(over_sleep))

    log('4. The prevention of killing innocent victims')

    log(' min_badness: {}'.format(min_badness))
    log(' min_delay_after_sigterm: {} sec'.format(min_delay_after_sigterm))
    log(' decrease_oom_score_adj: {}'.format(decrease_oom_score_adj))
    log(' oom_score_adj_max: {}'.format(oom_score_adj_max))

    log('5. Impact on the badness of processes')

    # Every subsection tests and iterates its own list. The original code
    # iterated the cgroup_v1 list in 5.3 and tested the cgroup_v2 list in
    # 5.4-5.7, so those sections could print the wrong or no patterns.
    _log_badness_adj_section(
        '5.1. Matching process names with RE patterns',
        badness_adj_re_name_list)
    _log_badness_adj_section(
        '5.2. Matching CGroup_v1-line with RE patterns',
        badness_adj_re_cgroup_v1_list)
    _log_badness_adj_section(
        '5.3. Matching CGroup_v2-line with RE patterns',
        badness_adj_re_cgroup_v2_list)
    _log_badness_adj_section(
        '5.4. Matching eUIDs with RE patterns',
        badness_adj_re_uid_list)
    _log_badness_adj_section(
        '5.5. Matching realpath with RE patterns',
        badness_adj_re_realpath_list)
    _log_badness_adj_section(
        '5.6. Matching cmdlines with RE patterns',
        badness_adj_re_cmdline_list)
    _log_badness_adj_section(
        '5.7. Matching environ with RE patterns',
        badness_adj_re_environ_list)

    log('6. Customize corrective actions.')

    if len(soft_actions_list) > 0:
        log(' Match by: regexp: command: ')
        for i in soft_actions_list:
            log(' {} {} {}'.format(i[0], i[1], i[2]))
    else:
        log(' (not set)')

    log('7. GUI notifications')

    log(' gui_notifications: {}'.format(gui_notifications))
    log(' gui_low_memory_warnings: {}'.format(gui_low_memory_warnings))
    log(' warning_exe: {}'.format(warning_exe))
    log(' mem_min_warnings: {} MiB, {} %'.format(
        round(mem_min_warnings_mb), round(mem_min_warnings_percent, 1)))
    log(' swap_min_warnings: {}'.format(swap_min_warnings))
    log(' zram_max_warnings: {} MiB, {} %'.format(
        round(zram_max_warnings_mb), round(zram_max_warnings_percent, 1)))
    log(' psi_avg_warnings: {}'.format(psi_avg_warnings))
    log(' min_time_between_warnings: {}'.format(min_time_between_warnings))

    log('8. Verbosity')

    log(' print_config: {}'.format(print_config))
    log(' print_mem_check_results: {}'.format(print_mem_check_results))
    log(' min_mem_report_interval: {}'.format(min_mem_report_interval))
    log(' print_sleep_periods: {}'.format(print_sleep_periods))
    log(' print_total_stat: {}'.format(print_total_stat))
    log(' print_proc_table: {}'.format(print_proc_table))
    log(' extra_table_info: {}'.format(extra_table_info))
    log(' print_victim_info: {}'.format(print_victim_info))
    log(' print_victim_cmdline: {}'.format(print_victim_cmdline))
    log(' max_ancestry_depth: {}'.format(max_ancestry_depth))
    log(' separate_log: {}'.format(separate_log))
    log(' psi_debug: {}'.format(psi_debug))

    log('9. Misc')

    log(' max_post_sigterm_victim_lifetime: {} sec'.format(
        max_post_sigterm_victim_lifetime))
    log(' post_kill_exe: {}'.format(post_kill_exe))
    log(' forbid_negative_badness: {}'.format(
        forbid_negative_badness))

    log('#' * 79)
|
||
|
||
|
||
##########################################################################
|
||
|
||
|
||
# for calculating the column width when printing mem and zram
|
||
mem_len = len(str(round(mem_total / 1024.0)))

# Words used for each signal; the dict is only created when GUI
# notifications are enabled.
if gui_notifications:
    notify_sig_dict = {
        SIGKILL: 'Killing',
        SIGTERM: 'Terminating',
    }

# convert rates from MiB/s to KiB/s
rate_mem *= 1024
rate_swap *= 1024
rate_zram *= 1024

# Initial state of the low-memory warning timer.
warn_time_now = 0
warn_time_delta = 1000
warn_timer = 0
|
||
|
||
|
||
##########################################################################
|
||
|
||
|
||
# Warn when not running with effective UID 0.
if not root:
    log(
        'WARNING: effective UID != 0; euid={}; processes with other '
        'euids will be invisible for nohang'.format(self_uid)
    )

# Try to lock all memory
mlockall()
|
||
|
||
##########################################################################
|
||
|
||
|
||
# print_self_rss()
|
||
|
||
# Stays empty unless the main loop overwrites it with a PSI value.
psi_avg_string = ''

mem_used_zram = 0

# State used only for the periodic memory report.
if print_mem_check_results:

    # to find delta mem
    wt2 = 0
    new_mem = 0

    # init mem report interval
    report0 = 0

# handle signals
for i in sig_list:
    signal(i, signal_handler)

x0 = time()
delta0 = 0

threshold = None
mem_info = None

# PSI is checked only when the PSI file exists and PSI is not ignored.
CHECK_PSI = bool(psi_support and not ignore_psi)

psi_kill_exceeded_timer = 0
psi_term_exceeded_timer = 0
psi_t0 = time()

# Defaults that remain in place when the zram or PSI checks are disabled.
psi_threshold = None
zram_threshold = None
zram_info = None
psi_info = None

CHECK_ZRAM = not ignore_zram

log('Monitoring has started!')

stdout.flush()
|
||
|
||
|
||
##########################################################################
|
||
|
||
|
||
# Main monitoring loop: sample memory, swap, zram and PSI, optionally log a
# report, act on SIGKILL/SIGTERM thresholds, optionally send a low-memory
# warning, then sleep.
while True:

    (masf_threshold, masf_info, mem_available, swap_min_sigkill_kb,
     swap_min_sigterm_kb, swap_free, swap_total) = check_mem_swap_ex()

    if CHECK_ZRAM:
        zram_threshold, zram_info, mem_used_zram = check_zram_ex()

    if CHECK_PSI:
        (psi_threshold, psi_info, psi_t0, psi_kill_exceeded_timer,
         psi_term_exceeded_timer, x0) = check_psi_ex(
            psi_t0, psi_kill_exceeded_timer, psi_term_exceeded_timer, x0)

    if print_mem_check_results:

        if CHECK_PSI:
            psi_avg_value = find_psi_metrics_value(psi_path, psi_metrics)
            psi_post_action_delay_exceeded = (
                time() - psi_t0 >= psi_post_action_delay)
            psi_avg_string = 'PSI avg: {} | '.format(
                str(psi_avg_value).rjust(6))

        wt1 = time()

        # Change of available memory plus free swap since the last report.
        delta = (mem_available + swap_free) - new_mem

        t_cycle = wt1 - wt2

        report_delta = wt1 - report0

        # Report only if enough time has passed since the previous report.
        mem_report = report_delta >= min_mem_report_interval
        if mem_report:
            new_mem = mem_available + swap_free
            report0 = wt1

        wt2 = time()

        if mem_report:

            speed = delta / 1024.0 / report_delta
            speed_info = ' | dMem: {} M/s'.format(
                str(round(speed)).rjust(5))

            # Calculate 'swap-column' width
            swap_len = len(str(round(swap_total / 1024.0)))

            # The MemAvail column is always printed. SwapFree is skipped
            # only when there is neither swap nor zram usage; MemUsedZram
            # is added in every case not covered by the first two layouts.
            report_line = '{}MemAvail: {} M, {} %'.format(
                psi_avg_string,
                human(mem_available, mem_len),
                just_percent_mem(mem_available / mem_total))

            no_zram = mem_used_zram == 0

            if not (swap_total == 0 and no_zram):
                report_line += ' | SwapFree: {} M, {} %'.format(
                    human(swap_free, swap_len),
                    just_percent_swap(swap_free / (swap_total + 0.1)))

                if not (swap_total > 0 and no_zram):
                    report_line += ' | MemUsedZram: {} M, {} %'.format(
                        human(mem_used_zram, mem_len),
                        just_percent_mem(mem_used_zram / mem_total))

            log(report_line + speed_info)

    levels = (masf_threshold, zram_threshold, psi_threshold)

    # SIGKILL takes precedence over SIGTERM.
    if any(level is SIGKILL for level in levels):
        action_signal = SIGKILL
    elif any(level is SIGTERM for level in levels):
        action_signal = SIGTERM
    else:
        action_signal = None

    if action_signal is not None:

        threshold = action_signal

        # Collect the info entries of the checks that produced one.
        mem_info_list = [
            info_item
            for info_item in (masf_info, zram_info, psi_info)
            if info_item is not None
        ]

        psi_t0 = implement_corrective_action(
            threshold,
            mem_info_list,
            psi_t0,
            psi_kill_exceeded_timer,
            psi_term_exceeded_timer,
            x0, psi_threshold, zram_threshold, zram_info, psi_info)
        continue

    if gui_low_memory_warnings:

        if any(level == 'WARN' for level in levels):

            warn_time_delta = time() - warn_time_now
            warn_time_now = time()
            warn_timer += warn_time_delta

            if warn_timer > min_time_between_warnings:
                send_notify_warn()
                warn_timer = 0

    sleep_after_check_mem()
|