nohang/nohang
Alexey Avramov e0dbf486d1 fix output
2019-06-23 01:40:51 +09:00

3258 lines
96 KiB
Python
Executable File
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""A daemon that prevents OOM in Linux systems."""
import os
from ctypes import CDLL
from time import sleep, time
from operator import itemgetter
from sys import stdout, stderr, argv, exit
from re import search
from sre_constants import error as invalid_re
from signal import signal, SIGKILL, SIGTERM, SIGINT, SIGQUIT, SIGHUP
##########################################################################
# define functions
def get_swap_threshold_tuple(string):
    """
    Parse a swap threshold taken from the config.
    string: config value ending with '%' (percent of swap) or 'M' (MiB)
    returns (value, True) with value in percent for the '%' form,
            (value, False) with value in KiB for the 'M' form
    Prints an error to stderr and exits with status 1 if the unit is
    unknown or the number is not a finite value in the allowed range.
    """
    if string.endswith('%'):
        value = string_to_float_convert_test(string[:-1])
        if value is None:
            errprint('somewhere swap unit is not float_%')
            exit(1)
        # A chained comparison is False for NaN, so 'nan%' is rejected
        # together with the values outside of the range.
        if not 0 <= value <= 100:
            errprint('invalid value, must be from the range[0; 100] %')
            exit(1)
        return value, True

    if string.endswith('M'):
        mib = string_to_float_convert_test(string[:-1])
        if mib is None:
            errprint('somewhere swap unit is not float_M')
            exit(1)
        value = mib * 1024  # MiB -> KiB
        # Rejects negative numbers and also NaN and infinity.
        if not 0 <= value < float('inf'):
            errprint('invalid unit in config (negative value)')
            exit(1)
        return value, False

    errprint(
        'Invalid config file. There are invalid units somewhere\nExit')
    exit(1)
def find_cgroup_indexes():
    """
    Find cgroup-line positions in the /proc/self/cgroup file.
    returns (cgroup_v1_index, cgroup_v2_index): zero-based number of the
            last line containing ':name=' and of the last line starting
            with '0::'; an index is None if there is no such line
    """
    v1_line_no = None
    v2_line_no = None
    with open('/proc/self/cgroup') as cgroup_file:
        cgroup_lines = cgroup_file.readlines()
    for line_no, text in enumerate(cgroup_lines):
        if ':name=' in text:
            v1_line_no = line_no
        if text.startswith('0::'):
            v2_line_no = line_no
    return v1_line_no, v2_line_no
def pid_to_rss(pid):
    """
    Get the second field of /proc/[pid]/statm multiplied by SC_PAGESIZE.
    pid: pid of the process (formatted into the /proc path)
    returns int, or None if the file is gone or has no second field
    """
    statm_path = '/proc/{}/statm'.format(pid)
    try:
        statm_fields = rline1(statm_path).split(' ')
        return int(statm_fields[1]) * SC_PAGESIZE
    except (IndexError, FileNotFoundError, ProcessLookupError):
        return None
def pid_to_vm_size(pid):
    """
    Get the first field of /proc/[pid]/statm multiplied by SC_PAGESIZE.
    pid: pid of the process (formatted into the /proc path)
    returns int, or None if the process has disappeared
    """
    statm_path = '/proc/{}/statm'.format(pid)
    try:
        first_field = rline1(statm_path).partition(' ')[0]
        return int(first_field) * SC_PAGESIZE
    except (IndexError, FileNotFoundError, ProcessLookupError):
        return None
def signal_handler(signum, frame):
    """
    Handle a signal: log it, print the statistics and exit.
    signum: number of the received signal (key of sig_dict)
    frame: current stack frame (unused)
    """
    # Signals received while shutting down are only logged.
    for sig in sig_list:
        signal(sig, signal_handler_inner)
    sig_name = sig_dict[signum]
    log('Signal handler called with the {} signal '.format(sig_name))
    update_stat_dict_and_print(None)
    log('Exit')
    exit()
def signal_handler_inner(signum, frame):
    """
    Log a signal without acting on it.
    signum: number of the received signal (key of sig_dict)
    frame: current stack frame (unused)
    """
    sig_name = sig_dict[signum]
    log('Signal handler called with the {} signal (ignored) '.format(
        sig_name))
def exe(cmd):
    """
    Run a shell command with os.system() and log its status and duration.
    cmd: str command line
    returns the value returned by os.system()
    """
    log('Execute the command: {}'.format(cmd))
    started = time()
    # oom_score_adj is set to the max value for the time of the call and
    # set to the min value right after it.
    write_self_oom_score_adj(self_oom_score_adj_max)
    status = os.system(cmd)
    write_self_oom_score_adj(self_oom_score_adj_min)
    elapsed = time() - started
    log('Exit status: {}; exe duration: {} sec'.format(
        status, round(elapsed, 3)))
    return status
def write(path, string):
    """
    Write a string to a file, replacing the previous content.
    path: path to the file
    string: str to write
    """
    with open(path, 'w') as out:
        out.write(string)
def write_self_oom_score_adj(new_value):
    """
    Write new_value to /proc/self/oom_score_adj if the root flag is set.
    new_value: str value to write
    """
    if not root:
        return
    write('/proc/self/oom_score_adj', new_value)
def valid_re(reg_exp):
    """
    Validate regular expression.
    reg_exp: str pattern from the config
    Logs the error and exits with status 1 if the pattern is invalid.
    """
    try:
        # searching in an empty string is enough to compile the pattern
        search(reg_exp, '')
    except invalid_re:
        message = 'Invalid config: invalid regexp: {}'.format(reg_exp)
        log(message)
        exit(1)
def func_print_proc_table():
    """
    Print the process table via find_victim() and exit.
    """
    find_victim(True)
    exit()
def log(*msg):
    """
    Print msg to stdout and, if separate_log is set, pass it to info().
    An OSError raised by either output is swallowed after a 0.01 sec
    sleep.
    """
    try:
        print(*msg)
    except OSError:
        sleep(0.01)

    if not separate_log:
        return

    try:
        info(*msg)
    except OSError:
        sleep(0.01)
def print_version():
    """
    Print the version read from /etc/nohang/version and exit.
    """
    try:
        version = rline1('/etc/nohang/version')
    except FileNotFoundError:
        version = None

    if version is None:
        banner = 'Nohang unknown version'
    else:
        banner = 'Nohang ' + version
    print(banner)
    exit()
def pid_to_cgroup_v1(pid):
    """
    Get the cgroup path from the line number cgroup_v1_index of
    /proc/[pid]/cgroup.
    pid: str pid of required process
    returns str: '/' + the part of the line after its first '/', without
            the trailing newline; '' if there is no such line or the
            process has disappeared
    """
    try:
        with open('/proc/' + pid + '/cgroup') as f:
            for index, line in enumerate(f):
                if index == cgroup_v1_index:
                    return '/' + line.partition('/')[2][:-1]
    except (FileNotFoundError, ProcessLookupError):
        # the process may die before or during the reading
        pass
    return ''
def pid_to_cgroup_v2(pid):
    """
    Get the cgroup path from the line number cgroup_v2_index of
    /proc/[pid]/cgroup.
    pid: str pid of required process
    returns str: the line without its first 3 characters (the '0::'
            prefix looked for by find_cgroup_indexes()) and without the
            trailing newline; '' if there is no such line or the process
            has disappeared
    """
    try:
        with open('/proc/' + pid + '/cgroup') as f:
            for index, line in enumerate(f):
                if index == cgroup_v2_index:
                    return line[3:-1]
    except (FileNotFoundError, ProcessLookupError):
        # the process may die before or during the reading
        pass
    return ''
def pid_to_starttime(pid):
    """
    Get the process start time from /proc/[pid]/stat divided by
    SC_CLK_TCK.
    pid: str pid of required process
    returns float
    FileNotFoundError is NOT handled here: the caller must handle it.
    """
    stat_path = '/proc/' + pid + '/stat'
    try:
        stat_text = rline1(stat_path)
    except UnicodeDecodeError:
        with open(stat_path, 'rb') as f:
            stat_text = f.read().decode('utf-8', 'ignore')
    # Fields are counted after the last ')' because the process name in
    # parentheses may contain spaces.
    ticks = stat_text.rpartition(')')[2].split(' ')[20]
    return float(ticks) / SC_CLK_TCK
def get_victim_id(pid):
    """
    Build the victim id: start time field of /proc/[pid]/stat + '_pid'
    + pid.
    pid: str pid of required process
    returns str, '' if the process has disappeared
    """
    try:
        stat_tail = rline1('/proc/' + pid + '/stat').rpartition(')')[2]
    except (FileNotFoundError, ProcessLookupError):
        return ''
    return stat_tail.split(' ')[20] + '_pid' + pid
def pid_to_state(pid):
    """
    Get the state character from /proc/[pid]/stat.
    pid: str pid of required process
    returns the character that follows the last ')' and one separator
    FileNotFoundError is NOT handled here: the caller must handle it.
    """
    after_name = rline1('/proc/' + pid + '/stat').rpartition(')')[2]
    return after_name[1]
def pid_to_name(pid):
    """
    Get the process name from /proc/[pid]/comm.
    pid: str pid of required process
    returns str name without the last character (newline); bytes that
            are not valid UTF-8 are dropped; '' if the process has
            disappeared
    """
    try:
        with open('/proc/' + pid + '/comm', 'rb') as comm_file:
            raw_name = comm_file.read()
    except (FileNotFoundError, ProcessLookupError):
        return ''
    return raw_name.decode('utf-8', 'ignore')[:-1]
def pid_to_ppid(pid):
    """
    Get the parent pid from the line number ppid_index of
    /proc/[pid]/status.
    pid: str pid of required process
    returns str ppid; '' if the process has disappeared; None if the
            file has no line with such number
    """
    try:
        # The file is read as bytes and decoded with 'ignore', so that a
        # process name with invalid UTF-8 needs no separate fallback.
        with open('/proc/' + pid + '/status', 'rb') as f:
            for n, raw_line in enumerate(f):
                if n == ppid_index:
                    line = raw_line.decode('utf-8', 'ignore')
                    return line.split('\t')[1].strip()
    except (FileNotFoundError, ProcessLookupError):
        return ''
def pid_to_ancestry(pid, max_ancestry_depth=1):
    """
    Describe the ancestors of a process for the victim report.
    pid: str pid of required process
    max_ancestry_depth: 0 - return '', 1 - return only the PPID line,
                        more - return the chain of ancestors
    returns str that starts with a newline (or '')
    """
    if max_ancestry_depth == 1:
        # pid_to_ppid() may return None, pid_to_name() needs a str
        ppid = pid_to_ppid(pid) or ''
        pname = pid_to_name(ppid)
        return '\n PPID: {} ({})'.format(ppid, pname)

    if max_ancestry_depth == 0:
        return ''

    ancestors = []
    for _ in range(max_ancestry_depth):
        ppid = pid_to_ppid(pid)
        if not ppid:
            # the parent is unknown: there is nothing to walk further
            break
        ancestors.append('PID {} ({})'.format(ppid, pid_to_name(ppid)))
        if ppid == '1':
            break
        pid = ppid

    return '\n Ancestry: ' + ' <= '.join(ancestors)
def pid_to_cmdline(pid):
    """
    Get process cmdline by pid.
    pid: str pid of required process
    returns string cmdline: NUL separators are replaced by spaces and
            trailing whitespace is removed; bytes that are not valid
            UTF-8 are dropped; '' if the process has disappeared
    """
    try:
        # read as bytes: arguments are not guaranteed to be valid UTF-8
        with open('/proc/' + pid + '/cmdline', 'rb') as f:
            raw = f.read()
    except (FileNotFoundError, ProcessLookupError):
        return ''
    return raw.decode('utf-8', 'ignore').replace('\x00', ' ').rstrip()
def pid_to_environ(pid):
    """
    Get process environ by pid.
    pid: str pid of required process
    returns string environ: NUL separators are replaced by spaces and
            trailing whitespace is removed; bytes that are not valid
            UTF-8 are dropped; '' if the file cannot be read
    """
    try:
        # read as bytes: the environment is not guaranteed to be UTF-8
        with open('/proc/' + pid + '/environ', 'rb') as f:
            raw = f.read()
    except (FileNotFoundError, ProcessLookupError, PermissionError):
        # PermissionError: the environ of a foreign process is not
        # readable without privileges
        return ''
    return raw.decode('utf-8', 'ignore').replace('\x00', ' ').rstrip()
def pid_to_realpath(pid):
    """
    Resolve the /proc/[pid]/exe link with os.path.realpath().
    pid: str pid of required process
    returns str path, '' on FileNotFoundError
    """
    exe_link = '/proc/' + pid + '/exe'
    try:
        resolved = os.path.realpath(exe_link)
    except FileNotFoundError:
        resolved = ''
    return resolved
def pid_to_uid(pid):
    """
    Get euid: the third tab-separated field of the line number uid_index
    of /proc/[pid]/status.
    pid: str pid of required process
    returns str euid; '' if the process has disappeared; None if the
            file has no line with such number
    """
    try:
        # The file is read as bytes and decoded with 'ignore', so that a
        # process name with invalid UTF-8 needs no separate fallback.
        with open('/proc/' + pid + '/status', 'rb') as f:
            for n, raw_line in enumerate(f):
                if n == uid_index:
                    line = raw_line.decode('utf-8', 'ignore')
                    return line.split('\t')[2]
    except (FileNotFoundError, ProcessLookupError):
        return ''
def pid_to_badness(pid):
    """
    Find badness of a process: its oom_score modified by the config
    rules.
    pid: str pid of required process
    returns (badness, oom_score), (None, None) if the process has
            disappeared
    """
    try:
        oom_score = int(rline1('/proc/' + pid + '/oom_score'))
        badness = oom_score

        if decrease_oom_score_adj:
            oom_score_adj = int(rline1('/proc/' + pid + '/oom_score_adj'))
            if badness > oom_score_adj_max and oom_score_adj > 0:
                badness = badness - oom_score_adj + oom_score_adj_max

        # Every enabled kind of matching adds the adjustments of its
        # matched rules. Rule lists are touched only under their flags.
        if regex_matching:
            badness += _regex_badness_adj(
                pid_to_name(pid), badness_adj_re_name_list)
        if re_match_cgroup_v1:
            badness += _regex_badness_adj(
                pid_to_cgroup_v1(pid), badness_adj_re_cgroup_v1_list)
        if re_match_cgroup_v2:
            badness += _regex_badness_adj(
                pid_to_cgroup_v2(pid), badness_adj_re_cgroup_v2_list)
        if re_match_realpath:
            badness += _regex_badness_adj(
                pid_to_realpath(pid), badness_adj_re_realpath_list)
        if re_match_cmdline:
            badness += _regex_badness_adj(
                pid_to_cmdline(pid), badness_adj_re_cmdline_list)
        if re_match_environ:
            badness += _regex_badness_adj(
                pid_to_environ(pid), badness_adj_re_environ_list)
        if re_match_uid:
            badness += _regex_badness_adj(
                pid_to_uid(pid), badness_adj_re_uid_list)

        if forbid_negative_badness and badness < 0:
            badness = 0

        return badness, oom_score

    except (FileNotFoundError, ProcessLookupError):
        return None, None


def _regex_badness_adj(text, rules):
    """
    Sum the adjustments of the rules whose regexp is found in text.
    text: str to search in
    rules: items with the adjustment at index 0 and the regexp at
           index 1
    returns int
    """
    return sum(
        int(rule[0]) for rule in rules if search(rule[1], text) is not None)
def pid_to_status(pid):
    """
    Parse /proc/[pid]/status. Lines are found by their numbers (the
    *_index values), not by their names.
    pid: str pid of required process
    returns (name, state, ppid, uid, vm_size, vm_rss, vm_swap), where
            the vm_* values are converted by kib_to_mib(); None if the
            process has disappeared or a value is not a number
    """
    try:
        with open('/proc/' + pid + '/status') as f:
            for n, line in enumerate(f):
                # '==' is used: 'is' compares identity, not value
                if n == 0:
                    name = line.split('\t')[1][:-1]
                if n == state_index:
                    state = line.split('\t')[1][0]
                    continue
                if n == ppid_index:
                    ppid = line.split('\t')[1][:-1]
                    continue
                if n == uid_index:
                    uid = line.split('\t')[2]
                    continue
                # [:-4] cuts ' kB\n'
                if n == vm_size_index:
                    vm_size = kib_to_mib(int(line.split('\t')[1][:-4]))
                    continue
                if n == vm_rss_index:
                    vm_rss = kib_to_mib(int(line.split('\t')[1][:-4]))
                    continue
                if n == vm_swap_index:
                    vm_swap = kib_to_mib(int(line.split('\t')[1][:-4]))
                    break
        return name, state, ppid, uid, vm_size, vm_rss, vm_swap
    except UnicodeDecodeError:
        return pid_to_status_unicode(pid)
    except (FileNotFoundError, ProcessLookupError, ValueError):
        return None
def pid_to_status_unicode(pid):
    """
    Parse /proc/[pid]/status, dropping the bytes that are not valid
    UTF-8. Lines are found by their numbers (the *_index values).
    pid: str pid of required process
    returns (name, state, ppid, uid, vm_size, vm_rss, vm_swap), where
            the vm_* values are converted by kib_to_mib(); None if the
            process has disappeared or a value is not a number
    """
    try:
        with open('/proc/' + pid + '/status', 'rb') as f:
            f_list = f.read().decode('utf-8', 'ignore').split('\n')
        for i, line in enumerate(f_list):
            # '==' is used: 'is' compares identity, not value
            if i == 0:
                name = line.split('\t')[1]
            if i == state_index:
                state = line.split('\t')[1][0]
            if i == ppid_index:
                ppid = line.split('\t')[1]
            if i == uid_index:
                uid = line.split('\t')[2]
            # [:-3] cuts ' kB' (newlines are removed by the split above)
            if i == vm_size_index:
                vm_size = kib_to_mib(int(line.split('\t')[1][:-3]))
            if i == vm_rss_index:
                vm_rss = kib_to_mib(int(line.split('\t')[1][:-3]))
            if i == vm_swap_index:
                vm_swap = kib_to_mib(int(line.split('\t')[1][:-3]))
        return name, state, ppid, uid, vm_size, vm_rss, vm_swap
    except (FileNotFoundError, ProcessLookupError, ValueError):
        return None
def uptime():
    """
    Get the first value of /proc/uptime.
    returns float
    """
    first_field = rline1('/proc/uptime').split(' ')[0]
    return float(first_field)
def errprint(*text):
    """
    Print the arguments to stderr and flush it immediately.
    """
    print(*text, flush=True, file=stderr)
def mlockall():
    """
    Lock all memory to prevent swapping nohang process.
    libc mlockall() is tried with MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT
    first and with MCL_CURRENT | MCL_FUTURE if the first call fails.
    A warning is logged if both calls fail.
    """
    MCL_CURRENT = 1
    MCL_FUTURE = 2
    MCL_ONFAULT = 4

    libc = CDLL('libc.so.6', use_errno=True)

    attempts = (
        MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT,
        MCL_CURRENT | MCL_FUTURE,
    )
    for flags in attempts:
        if libc.mlockall(flags) == 0:
            return

    log('WARNING: cannot lock all memory')
def update_stat_dict_and_print(key):
    """
    Count an event in stat_dict and log the total statistics.
    key: name of the event; None means "do not count anything"
    The statistics is logged only if print_total_stat is set.
    """
    if key is not None:
        stat_dict[key] = stat_dict.get(key, 0) + 1

    if not print_total_stat:
        return

    header = 'Total stat (what happened in the last {}):'.format(
        format_time(time() - start_time))
    rows = [
        '\n {}: {}'.format(event, stat_dict[event]) for event in stat_dict]
    log(header + ''.join(rows))
def find_psi_metrics_value(psi_path, psi_metrics):
    """
    Read one value from a PSI file.
    psi_path: path to the PSI file
    psi_metrics: 'some_avg10', 'some_avg60', 'some_avg300',
                 'full_avg10', 'full_avg60' or 'full_avg300'
    returns float; None if psi_support is not set or the name of the
            metrics is unknown
    """
    if not psi_support:
        return None

    # (name of the metrics, line number, number of the 'key=value' field)
    layout = (
        ('some_avg10', 0, 1),
        ('some_avg60', 0, 2),
        ('some_avg300', 0, 3),
        ('full_avg10', 1, 1),
        ('full_avg60', 1, 2),
        ('full_avg300', 1, 3),
    )

    for metrics_name, line_no, field_no in layout:
        if psi_metrics != metrics_name:
            continue
        if line_no == 0:
            psi_line = rline1(psi_path)
        else:
            with open(psi_path) as f:
                psi_line = f.readlines()[line_no]
        return float(psi_line.split(' ')[field_no].split('=')[1])

    return None
def check_mem_and_swap():
    """
    Find mem_available, swap_total, swap_free in /proc/meminfo.
    Lines are found by their numbers: the third line of the file and
    the lines number swap_total_index and swap_free_index.
    returns (mem_available, swap_total, swap_free): int values as they
            are written in the file
    """
    with open('/proc/meminfo') as f:
        for n, line in enumerate(f):
            # '==' is used: 'is' compares identity, not value.
            # [:-4] cuts ' kB\n'.
            if n == 2:
                mem_available = int(line.split(':')[1][:-4])
                continue
            if n == swap_total_index:
                swap_total = int(line.split(':')[1][:-4])
                continue
            if n == swap_free_index:
                swap_free = int(line.split(':')[1][:-4])
                break
    return mem_available, swap_total, swap_free
def check_zram():
    """
    Find MemUsedZram: the sum of mem_used_total of all zram devices plus
    the estimated overhead of their disksize, divided by 1024.
    returns float
    """
    # ZRAM_DISKSIZE_FACTOR = deltaMemAvailable / disksize.
    # It means that setting zram disksize = 1 GiB decreases the
    # available memory by 0.0042 GiB. Found experimentally, requires
    # clarification with different kernels and architectures. On small
    # disks (up to a gigabyte) it can be more, up to 0.0045.
    # The creator of the zram module claims that the factor should be
    # 0.001 ("zram uses about 0.1% of the size of the disk"
    # - https://www.kernel.org/doc/Documentation/blockdev/zram.txt),
    # but this statement contradicts the experimental data.
    ZRAM_DISKSIZE_FACTOR = 0.0042

    disksize_sum = 0
    mem_used_total_sum = 0

    zram_devices = [
        dev for dev in os.listdir('/sys/block') if dev.startswith('zram')]
    for dev in zram_devices:
        disksize, mem_used_total = zram_stat(dev)
        disksize_sum += int(disksize)
        mem_used_total_sum += int(mem_used_total)

    overhead = disksize_sum * ZRAM_DISKSIZE_FACTOR
    return (mem_used_total_sum + overhead) / 1024.0
def format_time(t):
    """
    Convert a number of seconds to a human readable string.
    t: number of seconds (the fractional part is dropped)
    returns 'S sec', 'M min S sec' or 'H h M min S sec'
    """
    total = int(t)

    if total < 60:
        return '{} sec'.format(total)

    if total < 3600:
        minutes, seconds = divmod(total, 60)
        return '{} min {} sec'.format(minutes, seconds)

    hours, rest = divmod(total, 3600)
    minutes, seconds = divmod(rest, 60)
    return '{} h {} min {} sec'.format(hours, minutes, seconds)
def string_to_float_convert_test(string):
    """
    Try to interpret a string value as a float.
    returns float, None if float() raises ValueError
    """
    try:
        converted = float(string)
    except ValueError:
        converted = None
    return converted
def string_to_int_convert_test(string):
    """
    Try to interpret a string value as an integer.
    returns int, None if int() raises ValueError
    """
    try:
        converted = int(string)
    except ValueError:
        converted = None
    return converted
def conf_parse_string(param):
    """
    Get string parameters from the config dict.
    param: config_dict key
    returns config_dict[param].strip()
    Prints an error and exits with status 1 if there is no such key.
    """
    if param not in config_dict:
        errprint('All the necessary parameters must be in the config')
        errprint('There is no "{}" parameter in the config'.format(param))
        exit(1)
    return config_dict[param].strip()
def conf_parse_bool(param):
    """
    Get bool parameters from the config_dict.
    param: config_dict key
    returns bool
    Prints an error and exits with status 1 if there is no such key or
    its value is neither 'True' nor 'False'.
    """
    if param not in config_dict:
        errprint('All the necessary parameters must be in the config')
        errprint('There is no "{}" parameter in the config'.format(param))
        exit(1)

    param_str = config_dict[param]
    if param_str == 'True':
        return True
    if param_str == 'False':
        return False

    errprint('Invalid value of the "{}" parameter.'.format(param))
    errprint('Valid values are True and False.')
    errprint('Exit')
    exit(1)
def rline1(path):
    """
    Read the first line from path.
    returns str line without the trailing newline; None if the file is
            empty
    If the file cannot be decoded, its first 999 bytes are decoded with
    the invalid bytes dropped, and the part before the first newline is
    returned.
    """
    try:
        with open(path) as f:
            for line in f:
                # The newline is cut only if it is there: the last line
                # of a file may have no newline.
                return line[:-1] if line.endswith('\n') else line
    except UnicodeDecodeError:
        with open(path, 'rb') as f:
            return f.read(999).decode('utf-8', 'ignore').partition('\n')[0]
def kib_to_mib(num):
    """
    Convert KiB values to MiB values.
    returns num / 1024 rounded to an integer
    """
    mib = num / 1024.0
    return round(mib)
def percent(num):
    """
    Interpret num as a fraction and convert it to percent.
    returns num * 100 rounded to one decimal place
    """
    as_percent = num * 100
    return round(as_percent, 1)
def just_percent_mem(num):
    """
    Convert a fraction to percent and right-justify it.
    returns str of at least 4 characters padded with spaces on the left
    """
    text = str(round(num * 100, 1))
    return '{:>4}'.format(text)
def just_percent_swap(num):
    """
    Convert a fraction to percent and right-justify it.
    returns str of at least 5 characters padded with spaces on the left
    """
    text = str(round(num * 100, 1))
    return '{:>5}'.format(text)
def human(num, lenth):
    """
    Convert KiB values to MiB values with right alignment.
    num: value in KiB
    lenth: minimal width of the result
    returns str padded with spaces on the left
    """
    mib_text = str(round(num / 1024))
    return mib_text.rjust(lenth, ' ')
def zram_stat(zram_id):
    """
    Get zram state.
    zram_id: str zram block-device id
    returns (disksize, mem_used_total): both are str read from sysfs
            (the original author notes that the values are in bytes);
            ('0', '0') if there is no disksize file or disksize is 0
    """
    try:
        disksize = rline1('/sys/block/' + zram_id + '/disksize')
    except FileNotFoundError:
        return '0', '0'

    # rline1() returns a str without the newline
    if disksize == '0':
        return '0', '0'

    try:
        # the fields of mm_stat are separated by runs of spaces
        mm_stat_list = rline1('/sys/block/' + zram_id + '/mm_stat').split()
        mem_used_total = mm_stat_list[2]
    except FileNotFoundError:
        # no mm_stat file: read the separate mem_used_total file
        mem_used_total = rline1('/sys/block/' + zram_id + '/mem_used_total')

    return disksize, mem_used_total
def send_notify_warn():
    """
    Implement low memory warnings: execute warning_exe if
    check_warning_exe is set, otherwise send a notification with the
    MemAvail and SwapFree percentages.
    """
    log('Warning threshold exceeded')

    if check_warning_exe:
        exe(warning_exe)
        return

    mem_pc = round(mem_available / mem_total * 100)
    # + 0.1 prevents division by zero if there is no swap
    swap_pc = round(swap_free / (swap_total + 0.1) * 100)
    send_notification(
        'Low memory',
        'MemAvail: {}%\nSwapFree: {}%'.format(mem_pc, swap_pc))
def send_notify(threshold, name, pid):
    """
    Notify about OOM prevention.
    threshold: key for notify_sig_dict
    name: str process name
    pid: str process pid
    """
    # Wait for memory release after the corrective action. It may be
    # useful if free memory was about 0 immediately after the action.
    sleep(0.05)

    action_text = notify_sig_dict[threshold]
    # symbol '&' can break notifications in some themes, therefore it is
    # replaced by '*'
    safe_name = name.replace('&', '*')
    message = '<b>{}</b> [{}] <b>{}</b>'.format(action_text, pid, safe_name)
    send_notification('Freeze prevention', message)
def send_notify_etc(pid, name, command):
    """
    Notify about OOM prevention by executing a command.
    pid: str process pid
    name: str process name
    command: str command that will be executed
    """
    # '&' is replaced by '*' in the name and in the command
    safe_name = name.replace('&', '*')
    safe_command = command.replace('&', '*')
    template = (
        '<b>Victim is</b> [{}] <b>{}</b>\n'
        'Execute the command:\n'
        '<b>{}</b>')
    send_notification(
        'Freeze prevention',
        template.format(pid, safe_name, safe_command))
def send_notification(title, body):
    """
    Pass a notification to the notify helper.
    title: str title of the notification
    body: str body of the notification
    The text is written to a cache file in /dev/shm, then the helper is
    started in the background with the uid and the time stamp that are
    parts of the file name. Nothing is started if the file cannot be
    written.
    """
    stamp = time()
    cache_path = '/dev/shm/nohang_notify_cache_uid{}_time{}'.format(
        str(self_uid), stamp)
    # title and body are separated by 16 '#' characters
    payload = '{}{}{}'.format(title, '#' * 16, body)

    try:
        with open(cache_path, 'w') as cache_file:
            cache_file.write(payload)
        os.chmod(cache_path, 0o600)
    except OSError:
        log('OSError while send notification '
            '(No space left on device: /dev/shm)')
        return None

    exe('{} --uid {} --time {} &'.format(
        notify_helper_path, self_uid, stamp))
def get_pid_list():
    """
    Find the entries of /proc with an existing exe link
    (the original author notes: pid list except kthreads and zombies).
    returns list of str
    """
    return [
        entry for entry in os.listdir('/proc')
        if os.path.exists('/proc/' + entry + '/exe')
    ]
def get_non_decimal_pids():
    """
    Find the items of the module-level pid_list whose first character is
    not a decimal digit.
    returns list of str
    """
    return [pid for pid in pid_list if not pid[0].isdecimal()]
def find_victim(_print_proc_table):
    """
    Find the process with highest badness.
    _print_proc_table: bool, log the table of all examined processes
    returns (pid, victim_badness, victim_name, victim_id)
    Own pid and pid 1 are never examined.
    """
    ft1 = time()

    pid_list = get_pid_list()
    pid_list.remove(self_pid)
    if '1' in pid_list:
        pid_list.remove('1')
    # Drop the non-numeric entries of /proc. The list that has just been
    # built is filtered itself.
    pid_list = [pid for pid in pid_list if pid[0].isdecimal()]

    pid_badness_list = []

    if _print_proc_table:
        extra_table_title = {
            'cgroup_v1': 'CGroup_v1',
            'cgroup_v2': 'CGroup_v2',
            'cmdline': 'cmdline',
            'environ': 'environ',
            'realpath': 'realpath',
            'All': '[CGroup] [CmdLine] [RealPath]',
        }.get(extra_table_info, '')

        hr = '#' * 115
        log(hr)
        log('# PID PPID badness oom_score oom_score_adj e'
            'UID S VmSize VmRSS VmSwap Name {}'.format(
                extra_table_title))
        log('#------- ------- ------- --------- ------------- -------'
            '--- - ------ ----- ------ --------------- --------')

    for pid in pid_list:

        badness = pid_to_badness(pid)[0]
        if badness is None:
            continue

        if _print_proc_table:
            try:
                oom_score = rline1('/proc/' + pid + '/oom_score')
                oom_score_adj = rline1('/proc/' + pid + '/oom_score_adj')
            except FileNotFoundError:
                continue

            # The status is read once: the process may die between two
            # reads, and the second read would return None.
            status = pid_to_status(pid)
            if status is None:
                continue
            name, state, ppid, uid, vm_size, vm_rss, vm_swap = status

            log('#{} {} {} {} {} {} {} {} {} {} {} {}'.format(
                pid.rjust(7),
                ppid.rjust(7),
                str(badness).rjust(7),
                oom_score.rjust(9),
                oom_score_adj.rjust(13),
                uid.rjust(10),
                state,
                str(vm_size).rjust(6),
                str(vm_rss).rjust(5),
                str(vm_swap).rjust(6),
                name.ljust(15),
                _extra_table_line(pid)
            ))

        pid_badness_list.append((pid, badness))

    real_proc_num = len(pid_badness_list)

    # The first item of the list sorted by badness in descending order
    # is the (pid, badness) tuple of the victim.
    pid_tuple_list = sorted(
        pid_badness_list,
        key=itemgetter(1),
        reverse=True
    )[0]

    pid = pid_tuple_list[0]
    victim_id = get_victim_id(pid)

    # Get maximum 'badness' value
    victim_badness = pid_tuple_list[1]
    victim_name = pid_to_name(pid)

    if _print_proc_table:
        log(hr)

    log('Found {} processes with existing /proc/[pid]/exe'.format(
        real_proc_num))

    log(
        'Process with highest badness (found in {} ms):\n PID: {}, Na'
        'me: {}, badness: {}'.format(
            round((time() - ft1) * 1000),
            pid,
            victim_name,
            victim_badness
        )
    )

    return pid, victim_badness, victim_name, victim_id


def _extra_table_line(pid):
    """
    Get the last column of the process table for a process according to
    extra_table_info.
    pid: str pid of required process
    returns str
    """
    if extra_table_info == 'cgroup_v1':
        return pid_to_cgroup_v1(pid)
    if extra_table_info == 'cgroup_v2':
        return pid_to_cgroup_v2(pid)
    if extra_table_info == 'cmdline':
        return pid_to_cmdline(pid)
    if extra_table_info == 'environ':
        return pid_to_environ(pid)
    if extra_table_info == 'realpath':
        return pid_to_realpath(pid)
    if extra_table_info == 'All':
        return '[CG: {}] [CL: {}] [RP: {}]'.format(
            pid_to_cgroup_v1(pid),
            pid_to_cmdline(pid),
            pid_to_realpath(pid)
        )
    return ''
def find_victim_info(pid, victim_badness, name):
    """
    Collect the information about the victim for the log.
    pid: str pid of the victim
    victim_badness: badness of the victim
    name: str name of the victim
    returns str with the description of the victim; None if the victim
            has disappeared while the information was being collected
            (this event is logged and counted in the statistics)
    """
    status0 = time()

    def victim_died(reason):
        """Log and count the death of the victim."""
        message = 'The victim died in the search process: ' + reason
        log(message)
        update_stat_dict_and_print(message)

    try:
        # The file is read as bytes and decoded with 'ignore', so that a
        # process name with invalid UTF-8 needs no separate code path.
        with open('/proc/' + pid + '/status', 'rb') as f:
            for n, raw_line in enumerate(f):
                line = raw_line.decode('utf-8', 'ignore')

                # Lines are found by their numbers; '==' is used because
                # 'is' compares identity, not value. [:-4] cuts ' kB\n'.
                if n == state_index:
                    state = line.split('\t')[1].rstrip()
                    continue
                if n == uid_index:
                    uid = line.split('\t')[2]
                    continue
                if n == vm_size_index:
                    vm_size = kib_to_mib(int(line.split('\t')[1][:-4]))
                    continue
                if n == vm_rss_index:
                    vm_rss = kib_to_mib(int(line.split('\t')[1][:-4]))
                    continue
                if detailed_rss:
                    if n == anon_index:
                        anon_rss = kib_to_mib(
                            int(line.split('\t')[1][:-4]))
                        continue
                    if n == file_index:
                        file_rss = kib_to_mib(
                            int(line.split('\t')[1][:-4]))
                        continue
                    if n == shmem_index:
                        shmem_rss = kib_to_mib(
                            int(line.split('\t')[1][:-4]))
                        continue
                if n == vm_swap_index:
                    vm_swap = kib_to_mib(int(line.split('\t')[1][:-4]))
                    break

        if print_victim_cmdline:
            cmdline = pid_to_cmdline(pid)
            c1 = '\n Cmdline: '
        else:
            cmdline = ''
            c1 = ''

        oom_score = rline1('/proc/' + pid + '/oom_score')
        oom_score_adj = rline1('/proc/' + pid + '/oom_score_adj')

    except FileNotFoundError:
        victim_died('FileNotFoundError')
        return None
    except ProcessLookupError:
        victim_died('ProcessLookupError')
        return None
    except IndexError:
        victim_died('IndexError')
        return None
    except ValueError:
        victim_died('ValueError')
        return None

    len_vm = len(str(vm_size))

    try:
        realpath = os.path.realpath('/proc/' + pid + '/exe')
        victim_lifetime = format_time(uptime() - pid_to_starttime(pid))
        victim_cgroup_v1 = pid_to_cgroup_v1(pid)
        victim_cgroup_v2 = pid_to_cgroup_v2(pid)
    except FileNotFoundError:
        victim_died('FileNotFoundError')
        return None
    except ProcessLookupError:
        victim_died('ProcessLookupError')
        return None

    ancestry = pid_to_ancestry(pid, max_ancestry_depth)

    if detailed_rss:
        detailed_rss_info = (
            ' ('
            'Anon: {} MiB, '
            'File: {} MiB, '
            'Shmem: {} MiB)'.format(
                anon_rss,
                file_rss,
                shmem_rss))
    else:
        detailed_rss_info = ''

    victim_info = (
        'Victim information (found in {} ms):'
        '\n Name: {}'
        '\n State: {}'
        '\n PID: {}'
        '{}'
        '\n EUID: {}'
        '\n badness: {}, '
        'oom_score: {}, '
        'oom_score_adj: {}'
        '\n VmSize: {} MiB'
        '\n VmRSS: {} MiB {}'
        '\n VmSwap: {} MiB'
        '\n CGroup_v1: {}'
        '\n CGroup_v2: {}'
        '\n Realpath: {}'
        '{}{}'
        '\n Lifetime: {}'.format(
            round((time() - status0) * 1000),
            name,
            state,
            pid,
            ancestry,
            uid,
            victim_badness,
            oom_score,
            oom_score_adj,
            vm_size,
            str(vm_rss).rjust(len_vm),
            detailed_rss_info,
            str(vm_swap).rjust(len_vm),
            victim_cgroup_v1,
            victim_cgroup_v2,
            realpath,
            c1, cmdline,
            victim_lifetime))

    return victim_info
def check_mem_swap_ex():
    """
    Check: is mem and swap threshold exceeded?
    returns (threshold, mem_info, mem_available, swap_min_sigkill_kb,
            swap_min_sigterm_kb, swap_free, swap_total), where
            threshold is SIGKILL, SIGTERM, 'WARN' or None and mem_info
            is the message for the log (None for 'WARN' and None)
    """
    mem_available, swap_total, swap_free = check_mem_and_swap()

    # Swap thresholds are set in the config either in percent of the
    # swap size or as absolute values.
    if swap_kill_is_percent:
        swap_min_sigkill_kb = swap_total * swap_min_sigkill_percent / 100.0
    else:
        swap_min_sigkill_kb = swap_kb_dict['swap_min_sigkill_kb']

    if swap_term_is_percent:
        swap_min_sigterm_kb = swap_total * swap_min_sigterm_percent / 100.0
    else:
        swap_min_sigterm_kb = swap_kb_dict['swap_min_sigterm_kb']

    if swap_warn_is_percent:
        swap_min_warnings_kb = swap_total * swap_min_warnings_percent / 100.0
    else:
        swap_min_warnings_kb = swap_kb_dict['swap_min_warnings_kb']

    # Thresholds in percent for the messages; '-' if a threshold is not
    # less than the swap size. + 0.1 prevents division by zero.
    swap_sigkill_pc = (
        percent(swap_min_sigkill_kb / (swap_total + 0.1))
        if swap_total > swap_min_sigkill_kb else '-')
    swap_sigterm_pc = (
        percent(swap_min_sigterm_kb / (swap_total + 0.1))
        if swap_total > swap_min_sigterm_kb else '-')

    # the common tail of every returned tuple
    levels = (mem_available, swap_min_sigkill_kb, swap_min_sigterm_kb,
              swap_free, swap_total)

    hard_exceeded = (mem_available <= mem_min_sigkill_kb and
                     swap_free <= swap_min_sigkill_kb)
    if hard_exceeded:
        mem_info = (
            'Memory status that requires corrective actions '
            '(hard threshold exceeded):'
            '\n MemAvailable [{} MiB, {} %] <= mem_min_sigkill '
            '[{} MiB, {} %]'
            '\n SwapFree [{} MiB, {} %] <= swap_min_sigkill '
            '[{} MiB, {} %]'.format(
                kib_to_mib(mem_available),
                percent(mem_available / mem_total),
                kib_to_mib(mem_min_sigkill_kb),
                percent(mem_min_sigkill_kb / mem_total),
                kib_to_mib(swap_free),
                percent(swap_free / (swap_total + 0.1)),
                kib_to_mib(swap_min_sigkill_kb),
                swap_sigkill_pc))
        return (SIGKILL, mem_info) + levels

    soft_exceeded = (mem_available <= mem_min_sigterm_kb and
                     swap_free <= swap_min_sigterm_kb)
    if soft_exceeded:
        mem_info = (
            'Memory status that requires corrective actions '
            '(soft threshold exceeded):'
            '\n MemAvailable [{} MiB, {} %] <= mem_min_sigterm '
            '[{} MiB, {} %]'
            '\n SwapFree [{} MiB, {} %] <= swap_min_sigterm '
            '[{} MiB, {} %]'.format(
                kib_to_mib(mem_available),
                percent(mem_available / mem_total),
                kib_to_mib(mem_min_sigterm_kb),
                round(mem_min_sigterm_percent, 1),
                kib_to_mib(swap_free),
                percent(swap_free / (swap_total + 0.1)),
                kib_to_mib(swap_min_sigterm_kb),
                swap_sigterm_pc))
        return (SIGTERM, mem_info) + levels

    if (gui_low_memory_warnings and
            mem_available <= mem_min_warnings_kb and
            swap_free <= swap_min_warnings_kb + 0.1):
        return ('WARN', None) + levels

    return (None, None) + levels
def check_zram_ex():
    """
    Check: is zram threshold exceeded?
    returns (threshold, mem_info, mem_used_zram), where threshold is
            SIGKILL, SIGTERM, 'WARN' or None and mem_info is the
            message for the log (None for 'WARN' and None)
    """
    mem_used_zram = check_zram()

    if mem_used_zram >= zram_max_sigkill_kb:
        mem_info = (
            'Memory status that requires corrective actions '
            '(hard threshold exceeded):'
            '\n MemUsedZram [{} MiB, {} %] >= zram_max_sigkill '
            '[{} MiB, {} %]'.format(
                kib_to_mib(mem_used_zram),
                percent(mem_used_zram / mem_total),
                kib_to_mib(zram_max_sigkill_kb),
                percent(zram_max_sigkill_kb / mem_total)))
        return SIGKILL, mem_info, mem_used_zram

    if mem_used_zram >= zram_max_sigterm_kb:
        # The unit of the threshold is MiB (it is converted by
        # kib_to_mib()), as in the message for the hard threshold.
        mem_info = (
            'Memory status that requires corrective actions '
            '(soft threshold exceeded):'
            '\n MemUsedZram [{} MiB, {} %] >= zram_max_sigterm '
            '[{} MiB, {} %]'.format(
                kib_to_mib(mem_used_zram),
                percent(mem_used_zram / mem_total),
                kib_to_mib(zram_max_sigterm_kb),
                percent(zram_max_sigterm_kb / mem_total)))
        return SIGTERM, mem_info, mem_used_zram

    if gui_low_memory_warnings and mem_used_zram >= zram_max_warnings_kb:
        return 'WARN', None, mem_used_zram

    return None, None, mem_used_zram
def check_psi_ex(psi_t0, psi_kill_exceeded_timer, psi_term_exceeded_timer, x0):
    """
    Check: is PSI threshold exceeded for long enough?
    psi_t0: time stamp compared with psi_post_action_delay
    psi_kill_exceeded_timer: how long the sigkill threshold is exceeded
    psi_term_exceeded_timer: how long the sigterm threshold is exceeded
    x0: time stamp of the previous check
    returns (threshold, mem_info, psi_t0, psi_kill_exceeded_timer,
            psi_term_exceeded_timer, x0), where threshold is SIGKILL,
            SIGTERM, 'WARN' or None, mem_info is the message for the
            log (None for 'WARN' and None), the timers and x0 are
            updated and psi_t0 is returned unchanged
    """
    delta0 = time() - x0
    x0 = time()

    psi_avg_value = find_psi_metrics_value(psi_path, psi_metrics)

    psi_post_action_delay_timer = time() - psi_t0
    psi_post_action_delay_exceeded = (
        psi_post_action_delay_timer >= psi_post_action_delay)

    # A timer grows while its threshold is exceeded and is reset to 0 as
    # soon as the value falls below the threshold.
    sigkill_psi_exceeded = psi_avg_value >= sigkill_psi_threshold
    if sigkill_psi_exceeded:
        psi_kill_exceeded_timer += delta0
    else:
        psi_kill_exceeded_timer = 0

    if psi_debug:
        log('psi_post_action_delay_timer: {}'.format(
            round(psi_post_action_delay_timer, 3)))
        log('psi_post_action_delay_exceeded: {}\n'
            'sigkill_psi_exceeded: {}\n'
            'psi_kill_exceeded_timer: {}'.format(
                psi_post_action_delay_exceeded,
                sigkill_psi_exceeded,
                round(psi_kill_exceeded_timer, 1)))

    if (psi_kill_exceeded_timer >= psi_excess_duration and
            psi_post_action_delay_exceeded):
        mem_info = (
            'PSI avg ({}) > sigkill_psi_threshold ({})\n'
            'PSI avg exceeded psi_excess_duration '
            '(value = {} sec) for {} seconds'.format(
                psi_avg_value,
                sigkill_psi_threshold,
                psi_excess_duration,
                round(psi_kill_exceeded_timer, 1)))
        # NOTE (translated from the original author): "psi_t0 = time()"
        # used to be here, and THAT WAS THE PROBLEM. The timer must be
        # reset right AFTER a corrective action is applied, not here:
        # either after any action or after a successful one. If the
        # victim died in the search process - reset. If a signal was
        # sent - reset.
        return (SIGKILL, mem_info, psi_t0, psi_kill_exceeded_timer,
                psi_term_exceeded_timer, x0)

    sigterm_psi_exceeded = psi_avg_value >= sigterm_psi_threshold
    if sigterm_psi_exceeded:
        psi_term_exceeded_timer += delta0
    else:
        psi_term_exceeded_timer = 0

    if psi_debug:
        log('sigterm_psi_exceeded: {}\n'
            'psi_term_exceeded_timer: {}\n'.format(
                sigterm_psi_exceeded,
                round(psi_term_exceeded_timer, 1)))

    if (psi_term_exceeded_timer >= psi_excess_duration and
            psi_post_action_delay_exceeded):
        mem_info = (
            'PSI avg ({}) > sigterm_psi_threshold ({})\n'
            'PSI avg exceeded psi_excess_duration '
            '(value = {} sec) for {} seconds'.format(
                psi_avg_value,
                sigterm_psi_threshold,
                psi_excess_duration,
                round(psi_term_exceeded_timer, 1)))
        return (SIGTERM, mem_info, psi_t0, psi_kill_exceeded_timer,
                psi_term_exceeded_timer, x0)

    if gui_low_memory_warnings and psi_avg_value >= psi_avg_warnings:
        return ('WARN', None, psi_t0, psi_kill_exceeded_timer,
                psi_term_exceeded_timer, x0)

    return (None, None, psi_t0, psi_kill_exceeded_timer,
            psi_term_exceeded_timer, x0)
def is_victim_alive(pid):
    """
    Report the victim's state as seen through /proc.

    Returns:
        1 - alive: /proc/<pid>/exe exists
        2 - dying (freeing memory, zombie): only /proc/<pid>/statm exists
        0 - completely gone: neither path exists
    """
    proc_dir = '/proc/{}'.format(pid)
    if os.path.exists(proc_dir + '/exe'):
        return 1
    return 2 if os.path.exists(proc_dir + '/statm') else 0
def implement_corrective_action(
        threshold,
        mem_info_list,
        psi_t0,
        # This is simply the time of the last corrective action. Ideally it
        # would be the time at which the action (any action) finished.
        psi_kill_exceeded_timer,
        psi_term_exceeded_timer,
        x0, psi_threshold, zram_threshold, zram_info, psi_info):
    """
    Find victim with highest badness and send SIGTERM/SIGKILL

    The thresholds are re-checked after the victim search, so the signal
    actually used may differ from the `threshold` passed in, and the function
    may return without acting if no threshold is exceeded any more.

    threshold: SIGTERM or SIGKILL, the level that triggered the call
    mem_info_list: messages describing the exceeded thresholds (logged first)
    psi_t0, psi_kill_exceeded_timer, psi_term_exceeded_timer, x0: PSI state
        forwarded to check_psi_ex() when CHECK_PSI is set
    psi_threshold, zram_threshold, zram_info, psi_info: initial values that
        are replaced by the re-check when CHECK_PSI / CHECK_ZRAM are set

    returns psi_t0: unchanged on the early exits, otherwise the value from
        the PSI re-check or time() once a victim with sufficient badness
        has been chosen
    """
    # TODO: this function is badly tangled. Untangle it: move parts into
    # separate functions, split it up, make its structure simple and clear.
    time0 = time()  # start of the corrective action; used to time the action
    # For a SIGTERM threshold: leave the function (after sleeping for
    # over_sleep) if min_delay_after_sigterm has not elapsed yet.
    # If the hard (SIGKILL) threshold is exceeded, carry on.
    if threshold is SIGTERM:
        dt = time() - actions_time_dict['action_handled'][0]
        if dt < min_delay_after_sigterm:
            log('min_delay_after_sigterm IS NOT EXCEEDED ({} < {})'.format(
                round(dt, 3), min_delay_after_sigterm))
            if print_sleep_periods:
                log('Sleep {} sec [in implement_corrective_action()]'.format(
                    over_sleep))
            sleep(over_sleep)
            return psi_t0  # the delay between actions has not expired yet
        else:
            log('min_delay_after_sigterm IS EXCEEDED, it is time to action')
    # NOTE: the bare string below is an unused developer note (a no-op
    # expression statement), kept verbatim. Roughly: "On entry we check the
    # right to send SIGTERM. We always have the right to send SIGKILL
    # (because we only proceed once memory has been fully released after the
    # victim's death)", followed by sketches of actions_time_dict updates.
    """
При заходе в фцию проверяем права на сигтерм. Права на сигкилл у нас
всегда есть
(потому что идем дальше только после полн освободж памяти после
смерти жертвы)
actions_time_dict[action_handled] = time()
actions_time_dict[veto] = True
actions_time_dict['action_handled'] = [time(), victim_id]
"""
    for i in mem_info_list:
        log(i)
    # find the victim and its badness
    pid, victim_badness, name, victim_id = find_victim(print_proc_table)
    # sleep(0.1)
    log('Recheck memory levels...')
    # re-check the thresholds: they may have changed during the victim search
    (masf_threshold, masf_info, mem_available, swap_min_sigkill_kb,
     swap_min_sigterm_kb, swap_free, swap_total) = check_mem_swap_ex()
    if CHECK_ZRAM:
        zram_threshold, zram_info, mem_used_zram = check_zram_ex()
    if CHECK_PSI:
        (psi_threshold, psi_info, psi_t0, psi_kill_exceeded_timer,
         psi_term_exceeded_timer, x0) = check_psi_ex(
            psi_t0, psi_kill_exceeded_timer, psi_term_exceeded_timer, x0)
    # The strongest exceeded level wins; mem_info_list is rebuilt from every
    # source that currently reports SIGKILL or SIGTERM.
    if masf_threshold is SIGKILL or zram_threshold is SIGKILL or psi_threshold is SIGKILL:
        new_threshold = SIGKILL
        mem_info_list = []
        if masf_threshold is SIGKILL or masf_threshold is SIGTERM:
            mem_info_list.append(masf_info)
        if zram_threshold is SIGKILL or zram_threshold is SIGTERM:
            mem_info_list.append(zram_info)
        if psi_threshold is SIGKILL or psi_threshold is SIGTERM:
            mem_info_list.append(psi_info)
    elif masf_threshold is SIGTERM or zram_threshold is SIGTERM or psi_threshold is SIGTERM:
        new_threshold = SIGTERM
        mem_info_list = []
        if masf_threshold is SIGKILL or masf_threshold is SIGTERM:
            mem_info_list.append(masf_info)
        if zram_threshold is SIGKILL or zram_threshold is SIGTERM:
            mem_info_list.append(zram_info)
        if psi_threshold is SIGKILL or psi_threshold is SIGTERM:
            mem_info_list.append(psi_info)
    else:
        log('Thresholds is not exceeded now')
        return psi_t0
    # print the thresholds
    for i in mem_info_list:
        log(i)
    # this check may be redundant
    if new_threshold is None or new_threshold == 'WARN':
        log('Thresholds is not exceeded now')
        return psi_t0
    threshold = new_threshold
    if victim_badness >= min_badness:
        psi_t0 = time()  # a so-so idea
        if print_victim_info:
            victim_info = find_victim_info(pid, victim_badness, name)
            log(victim_info)
        # kill the victim if it doesn't respond to SIGTERM WITHIN THE
        # CONFIGURED TIME:
        # escalate the signal for victims that were already signalled
        if threshold is SIGTERM:
            if victim_id in victim_dict:
                dt = time() - victim_dict[victim_id]
                if dt > max_post_sigterm_victim_lifetime:
                    print('max_post_sigterm_victim_lifetime exceeded: the '
                          'victim will get SIGKILL')
                    threshold = SIGKILL
        # matching with re to customize corrective actions
        soft_match = False
        if soft_actions and threshold is SIGTERM:
            name = pid_to_name(pid)
            cgroup_v1 = pid_to_cgroup_v1(pid)
            service = ''
            cgroup_v1_tail = cgroup_v1.rpartition('/')[2]
            if cgroup_v1_tail.endswith('.service'):
                service = cgroup_v1_tail
            # the first matching (unit, regexp, command) entry wins
            for i in soft_actions_list:
                unit = i[0]
                if unit == 'name':
                    u = name
                else:
                    u = cgroup_v1
                regexp = i[1]
                command = i[2]
                if search(regexp, u) is not None:
                    log("Regexp '{}' matches with {} '{}'".format(
                        regexp, unit, u))
                    soft_match = True
                    break
        if soft_match:  # OVERRIDE OF THE SOFT CORRECTIVE ACTION
            # todo: make new func
            m = check_mem_and_swap()
            ma = int(m[0]) / 1024.0
            sf = int(m[2]) / 1024.0
            log('Memory status before implementing a corrective act'
                'ion:\n MemAvailable'
                ': {} MiB, SwapFree: {} MiB'.format(
                    round(ma, 1), round(sf, 1)
                )
                )
            cmd = command.replace(
                '$PID',
                pid).replace(
                '$NAME',
                pid_to_name(pid)).replace(
                '$SERVICE',
                service)
            exit_status = exe(cmd)
            exit_status = str(exit_status)
            response_time = time() - time0
            # TODO: as with the default action, this should check that the
            # victim exists, how it reacted to the action, the time of its
            # death on success, and update the action timestamps
            etc_info = 'Implement a corrective act' \
                'ion:\n Run the command: {}' \
                '\n Exit status: {}; total response ' \
                'time: {} ms'.format(
                    cmd,
                    exit_status,
                    round(response_time * 1000))
            log(etc_info)
            key = "Run the command '{}'".format(cmd)
            update_stat_dict_and_print(key)
            if gui_notifications:
                send_notify_etc(
                    pid,
                    name,
                    command.replace('$PID', pid).replace(
                        '$NAME', pid_to_name(pid)))
        else:
            # regular action: send a signal
            # TODO: rework this part; the polling loop below should go.
            try:  # TODO: the try block is cluttered; only kill() belongs in it, the rest should follow outside
                os.kill(int(pid), threshold)
                a_dict[threshold] = time()
                v_dict[victim_id] = time()
                kill_timestamp = time()
                response_time = kill_timestamp - time0
                # poll the victim for up to 0.02 sec after the signal
                while True:
                    victim_alive = is_victim_alive(pid)
                    dt = time() - kill_timestamp
                    if victim_alive == 2 or dt > 0.02:
                        # print(dt)
                        break
                    sleep(0.002)
                if dt > 0.02:
                    log('Timer (value = 0.02 sec) expired; victim does not respond on action in 0.02 sec')
                    actions_time_dict['action_handled'] = [
                        time(), get_victim_id(pid)]
                    if victim_id not in victim_dict:  # not sure what is right here
                        victim_dict.update({victim_id: time()})
                    # log('actions_time_dict', actions_time_dict)
                    # log('victim_dict', victim_dict)
                else:
                    log('Process exited (VmRSS = 0) in {} sec'.format(
                        round(dt, 5)))
                if threshold is SIGKILL or victim_alive == 2:
                    # the victim is dying from SIGKILL; wait until it is
                    # completely gone
                    while True:
                        sleep(0.002)
                        rss = pid_to_rss(pid)
                        if rss is None:  # the process has disappeared
                            break
                    t1 = time()
                    kill_duration = t1 - kill_timestamp
                    log('The victim died in {} sec'.format(
                        round(kill_duration, 3)))
                    mem_available, swap_total, swap_free = check_mem_and_swap()
                    ma_mib = int(mem_available) / 1024.0
                    sf_mib = int(swap_free) / 1024.0
                    log('Memory status after implementing a corrective act'
                        'ion:\n MemAvailable'
                        ': {} MiB, SwapFree: {} MiB'.format(
                            round(ma_mib, 1), round(sf_mib, 1)
                        )
                        )
                send_result = 'total response time: {} ms'.format(
                    round(response_time * 1000))
                preventing_oom_message = 'Implement a corrective action:' \
                    '\n Send {} to the victim; {}'.format(
                        sig_dict[threshold], send_result)
                key = 'Send {} to {}'.format(sig_dict[threshold], name)
                if threshold is SIGKILL and post_kill_exe != '':
                    cmd = post_kill_exe.replace('$PID', pid).replace(
                        '$NAME', pid_to_name(pid))
                    log('Execute post_kill_exe')
                    exe(cmd)
                if gui_notifications:
                    send_notify(threshold, name, pid)
            except FileNotFoundError:
                response_time = time() - time0
                send_result = 'no such process; response time: {} ms'.format(
                    round(response_time * 1000))
                key = 'The victim died in the search process: FileNotFoundError'
            except ProcessLookupError:
                response_time = time() - time0
                send_result = 'no such process; response time: {} ms'.format(
                    round(response_time * 1000))
                key = 'The victim died in the search process: ProcessLookupError'
            # preventing_oom_message is only bound when the try body reached
            # its assignment; the except branches above leave it unset.
            try:
                log(preventing_oom_message)
            except UnboundLocalError:  # an embarrassing workaround
                preventing_oom_message = key
            update_stat_dict_and_print(key)
    # nothing to do: the victim's badness is too small
    else:
        # maybe move this part to the top behind an `if`
        response_time = time() - time0
        victim_badness_is_too_small = 'victim badness {} < min_b' \
            'adness {}; nothing to do; response time: {} ms'.format(
                victim_badness,
                min_badness,
                round(response_time * 1000))
        log(victim_badness_is_too_small)
        # update stat_dict
        key = 'victim badness < min_badness'
        update_stat_dict_and_print(key)
        # We should sleep properly here, and maybe adjust the counters.
        # On second thought: a process with a higher badness may appear at
        # any moment. Output spam also needs to be minimized.
        sleep(over_sleep)
    # TODO: update the time not on every kill, but only on the kill of a
    # victim that did not respond to the soft action.
    # Conclusion: store the victim id together with the action time.
    print('##################################################################')
    sleep(over_sleep)  # sleep if the victim's badness is small
    # What to do with psi_t0 if the victim's badness is small? Nothing,
    # because no corrective action was taken.
    # The daemon may use about 10% CPU in that case. A separate parameter
    # could be introduced for this, or not.
    return psi_t0
def sleep_after_check_mem():
    """Sleep until the next check; the pause depends on rates and available memory.

    With `stable_sleep` set the pause is always `min_sleep`. Otherwise it is
    the estimated time needed to consume the memory (plus swap, or plus zram
    when CHECK_ZRAM is set) that is still above the larger of the SIGTERM and
    SIGKILL thresholds, at the configured `rate_*` speeds. The result is
    clamped to the range given by `min_sleep` and `max_sleep`.

    Reads module-level state (mem_available, swap_free, mem_used_zram, the
    thresholds and rates); takes no arguments and returns None.
    """
    if stable_sleep:
        if print_sleep_periods:
            log('Sleep {} sec'.format(min_sleep))
        # Flushing is best-effort here too: a broken stdout must not stop
        # the daemon, exactly as on the adaptive path below.
        try:
            stdout.flush()
        except OSError:
            pass
        sleep(min_sleep)
        return None

    # Distance to the nearest (i.e. larger) threshold, never negative.
    mem_point = max(
        mem_available - max(mem_min_sigkill_kb, mem_min_sigterm_kb), 0)
    swap_point = max(
        swap_free - max(swap_min_sigkill_kb, swap_min_sigterm_kb), 0)

    t_mem = mem_point / rate_mem
    t_swap = swap_point / rate_swap
    t_mem_swap = t_mem + t_swap

    if CHECK_ZRAM:
        # zram is considered exhausted once it uses 80% of MemTotal
        t_zram = max((mem_total * 0.8 - mem_used_zram) / rate_zram, 0)
        z = ', t_zram={}'.format(round(t_zram, 2))
        t = min(t_mem_swap, t_mem + t_zram)
    else:
        z = ''
        t = t_mem_swap

    if t > max_sleep:
        t = max_sleep
    elif t < min_sleep:
        t = min_sleep

    if print_sleep_periods:
        log(
            'Sleep {} sec (t_mem={}, t_swap={}{})'.format(
                round(t, 2), round(t_mem, 2), round(t_swap, 2), z)
        )

    try:
        stdout.flush()
    except OSError:
        pass

    sleep(t)
def calculate_percent(arg_key):
    """
    Parse a memory threshold option from config_dict.

    The value must end with '%' (a share of MemTotal in the range [0; 100])
    or with 'M' (an absolute size in MiB, not greater than MemTotal).
    If the key is missing, the units are wrong, or the number is not a float
    or is out of range, an error is printed and the process exits with
    status 1.

    arg_key: str key for config_dict
    returns tuple (mem_min_kb, mem_min_mb, mem_min_percent); for '%' values
        mem_min_mb is rounded to an integer
    """
    if arg_key not in config_dict:
        errprint('{} not in config\nExit'.format(arg_key))
        exit(1)

    mem_min = config_dict[arg_key]

    if mem_min.endswith('%'):
        # drop the percent sign, then run the 'float test'
        mem_min_percent = string_to_float_convert_test(mem_min[:-1].strip())
        if mem_min_percent is None:
            errprint('Invalid {} value, not float\nExit'.format(arg_key))
            exit(1)
        if mem_min_percent < 0 or mem_min_percent > 100:
            errprint(
                '{}, as percents value, out of ran'
                'ge [0; 100]\nExit'.format(arg_key))
            exit(1)
        # the percentage is valid and can be translated into KiB
        mem_min_kb = mem_min_percent / 100 * mem_total
        mem_min_mb = round(mem_min_kb / 1024)

    elif mem_min.endswith('M'):
        mem_min_mb = string_to_float_convert_test(mem_min[:-1].strip())
        if mem_min_mb is None:
            errprint('Invalid {} value, not float\nExit'.format(arg_key))
            exit(1)
        mem_min_kb = mem_min_mb * 1024
        if mem_min_kb > mem_total:
            errprint(
                '{} value can not be greater then MemT'
                'otal ({} MiB)\nExit'.format(
                    arg_key, round(
                        mem_total / 1024)))
            exit(1)
        mem_min_percent = mem_min_kb / mem_total * 100

    else:
        errprint('Invalid {} units in config.\n Exit'.format(arg_key))
        exit(1)

    return mem_min_kb, mem_min_mb, mem_min_percent
##########################################################################
# victim id -> time at which the victim was recorded as not responding
victim_dict = {}
victim_id = None

# time and victim id of the last handled corrective action
actions_time_dict = {'action_handled': [time(), victim_id]}

# (victim_id : {SIGKILL: ts, SIGTERM: ts}}
v_dict = {}

# {SIGTERM: timestamp, SIGKILL: timestamp, 'last_action_ts': ts}
# all three entries start from one and the same timestamp
a_dict = dict.fromkeys(('last_action_ts', SIGTERM, SIGKILL), time())

start_time = time()
help_mess = """usage: nohang [-h] [-v] [-t] [-p] [-c CONFIG]
optional arguments:
-h, --help show this help message and exit
-v, --version print version
-p, --print-proc-table
print table of processes with their badness values
-c CONFIG, --config CONFIG
path to the config file, default values:
./nohang.conf, /etc/nohang/nohang.conf"""
# system constants
SC_CLK_TCK = os.sysconf(os.sysconf_names['SC_CLK_TCK'])
SC_PAGESIZE = os.sysconf(os.sysconf_names['SC_PAGESIZE'])

conf_err_mess = 'Invalid config. Exit.'

sig_list = [SIGTERM, SIGINT, SIGQUIT, SIGHUP]

# signal -> printable name
sig_dict = {
    SIGKILL: 'SIGKILL',
    SIGINT: 'SIGINT',
    SIGQUIT: 'SIGQUIT',
    SIGHUP: 'SIGHUP',
    SIGTERM: 'SIGTERM'
}

# identity of this process
self_pid = str(os.getpid())
self_uid = os.geteuid()
root = self_uid == 0

# prefer the helper from the current directory over the installed one
notify_helper_path = (
    './nohang_notify_helper'
    if os.path.exists('./nohang_notify_helper')
    else '/usr/sbin/nohang_notify_helper')

# will store corrective actions stat
stat_dict = {}

separate_log = False  # will be overwritten after parse config

cgroup_v1_index, cgroup_v2_index = find_cgroup_indexes()

self_oom_score_adj_min, self_oom_score_adj_max = '-600', '-6'
write_self_oom_score_adj(self_oom_score_adj_min)

pid_list = get_pid_list()

print_proc_table_flag = False
# Command line handling: choose the config path and the optional flags.
# Without an explicit --config, ./nohang.conf is preferred over the
# system-wide /etc/nohang/nohang.conf.
if len(argv) == 1:
    config = (os.getcwd() + '/nohang.conf'
              if os.path.exists('./nohang.conf')
              else '/etc/nohang/nohang.conf')
elif len(argv) == 2:
    if argv[1] in ('--help', '-h'):
        print(help_mess)
        exit()
    elif argv[1] in ('--version', '-v'):
        print_version()
    elif argv[1] in ('--print-proc-table', '-p'):
        print_proc_table_flag = True
        config = (os.getcwd() + '/nohang.conf'
                  if os.path.exists('./nohang.conf')
                  else '/etc/nohang/nohang.conf')
    else:
        errprint('Unknown option: {}'.format(argv[1]))
        exit(1)
elif len(argv) == 3:
    if argv[1] not in ('--config', '-c'):
        errprint('Unknown option: {}'.format(argv[1]))
        exit(1)
    config = argv[2]
else:
    errprint('Invalid CLI input: too many options')
    exit(1)
# Read /proc/meminfo once to get MemTotal and to locate the SwapTotal and
# SwapFree lines (SwapFree is taken to be the line right after SwapTotal).
with open('/proc/meminfo') as f:
    mem_list = f.readlines()

mem_list_names = [s.split(':')[0] for s in mem_list]

# the third line is expected to be MemAvailable
if mem_list_names[2] != 'MemAvailable':
    errprint('WARNING: Your Linux kernel is too old, Linux 3.14+ requied')
    # exit(1)

swap_total_index = mem_list_names.index('SwapTotal')
swap_free_index = swap_total_index + 1

# first line: the number between the colon and the last four characters
mem_total = int(mem_list[0].split(':')[1][:-4])
# Get the field names from /proc/self/status to know on which lines VmRSS,
# VmSwap and the other values are located.
with open('/proc/self/status') as file:
    status_list = file.readlines()

status_names = [s.split(':')[0] for s in status_list]

ppid_index = status_names.index('PPid')
vm_size_index = status_names.index('VmSize')
vm_rss_index = status_names.index('VmRSS')
vm_swap_index = status_names.index('VmSwap')
uid_index = status_names.index('Uid')
state_index = status_names.index('State')

# The detailed RSS fields are optional; without them only VmRSS is usable.
try:
    anon_index = status_names.index('RssAnon')
    file_index = status_names.index('RssFile')
    shmem_index = status_names.index('RssShmem')
except ValueError:
    # It is not Linux 4.5+
    detailed_rss = False
else:
    detailed_rss = True

log('Config: ' + config)
##########################################################################
# Parsing the config: plain "key = value" options go to config_dict,
# lines starting with a known @DIRECTIVE go to the lists below.

# dictionary with config options
config_dict = dict()

# (badness_adj, regexp) pairs, one list per @BADNESS_ADJ_RE_* directive
badness_adj_re_name_list = []
badness_adj_re_cmdline_list = []
badness_adj_re_environ_list = []
badness_adj_re_uid_list = []
badness_adj_re_cgroup_v1_list = []
badness_adj_re_cgroup_v2_list = []
badness_adj_re_realpath_list = []

# (unit, regexp, command) triples from @SOFT_ACTION_RE_* directives
soft_actions_list = []

# separator for optional parameters (that starts with @)
opt_separator = '///'

# directive prefix -> unit tag stored in soft_actions_list
_soft_action_directives = (
    ('@SOFT_ACTION_RE_NAME', 'name'),
    ('@SOFT_ACTION_RE_CGROUP_V1', 'cgroup_v1'),
)

# directive prefix -> list that collects its (badness_adj, regexp) pairs
_badness_adj_directives = (
    ('@BADNESS_ADJ_RE_NAME', badness_adj_re_name_list),
    ('@BADNESS_ADJ_RE_CMDLINE', badness_adj_re_cmdline_list),
    ('@BADNESS_ADJ_RE_UID', badness_adj_re_uid_list),
    ('@BADNESS_ADJ_RE_CGROUP_V1', badness_adj_re_cgroup_v1_list),
    ('@BADNESS_ADJ_RE_CGROUP_V2', badness_adj_re_cgroup_v2_list),
    ('@BADNESS_ADJ_RE_REALPATH', badness_adj_re_realpath_list),
    ('@BADNESS_ADJ_RE_ENVIRON', badness_adj_re_environ_list),
)

_directive_prefixes = tuple(
    [d[0] for d in _soft_action_directives + _badness_adj_directives])

try:
    with open(config) as f:
        for line in f:

            if line.startswith(_directive_prefixes):
                # Directive lines are never "key = value" options. Treating
                # them as options filled config_dict with junk keys and made
                # two directives sharing the text before '=' (e.g. regexps
                # with '=' inside) fail as a key duplication.
                for prefix, unit in _soft_action_directives:
                    if line.startswith(prefix):
                        parts = line.partition(prefix)[2].partition(
                            opt_separator)
                        regexp = parts[0].strip()
                        valid_re(regexp)
                        soft_actions_list.append(
                            (unit, regexp, parts[2].strip()))

                for prefix, target_list in _badness_adj_directives:
                    if line.startswith(prefix):
                        parts = line.partition(prefix)[2].strip(
                            ' \n').partition(opt_separator)
                        badness_adj = parts[0].strip(' ')
                        reg_exp = parts[2].strip(' ')
                        valid_re(reg_exp)
                        target_list.append((badness_adj, reg_exp))
                continue

            # comments, empty lines and lines starting with whitespace
            if line.startswith(('#', '\n', '\t', ' ')):
                continue

            option = line.partition('=')
            key = option[0].strip()
            value = option[2].strip()
            if key in config_dict:
                log('ERROR: config key duplication: {}'.format(key))
                exit(1)
            config_dict[key] = value

except (PermissionError, UnicodeDecodeError, IsADirectoryError,
        IndexError, FileNotFoundError) as e:
    # report the exception class name together with the generic message
    errprint(type(e).__name__, conf_err_mess)
    exit(1)
# A kind of matching is enabled only when the config supplied at least one
# directive of that kind.
regex_matching = bool(badness_adj_re_name_list)
re_match_cmdline = bool(badness_adj_re_cmdline_list)
re_match_uid = bool(badness_adj_re_uid_list)
re_match_environ = bool(badness_adj_re_environ_list)
re_match_realpath = bool(badness_adj_re_realpath_list)
re_match_cgroup_v1 = bool(badness_adj_re_cgroup_v1_list)
re_match_cgroup_v2 = bool(badness_adj_re_cgroup_v2_list)

# custom soft corrective actions are used only if some were configured
soft_actions = bool(soft_actions_list)
##########################################################################
# Extracting parameters from the dictionary: every necessary parameter is
# checked for presence and validated.

# boolean options
psi_debug = conf_parse_bool('psi_debug')
print_total_stat = conf_parse_bool('print_total_stat')
print_proc_table = conf_parse_bool('print_proc_table')
forbid_negative_badness = conf_parse_bool('forbid_negative_badness')
print_victim_info = conf_parse_bool('print_victim_info')
print_victim_cmdline = conf_parse_bool('print_victim_cmdline')
print_config = conf_parse_bool('print_config')
print_mem_check_results = conf_parse_bool('print_mem_check_results')
print_sleep_periods = conf_parse_bool('print_sleep_periods')
gui_low_memory_warnings = conf_parse_bool('gui_low_memory_warnings')
gui_notifications = conf_parse_bool('gui_notifications')
decrease_oom_score_adj = conf_parse_bool('decrease_oom_score_adj')
ignore_psi = conf_parse_bool('ignore_psi')
ignore_zram = conf_parse_bool('ignore_zram')

# memory and zram thresholds, each as (KiB, MiB, percent)
mem_min_sigterm_kb, mem_min_sigterm_mb, mem_min_sigterm_percent = \
    calculate_percent('mem_min_sigterm')
mem_min_sigkill_kb, mem_min_sigkill_mb, mem_min_sigkill_percent = \
    calculate_percent('mem_min_sigkill')
zram_max_sigterm_kb, zram_max_sigterm_mb, zram_max_sigterm_percent = \
    calculate_percent('zram_max_sigterm')
zram_max_sigkill_kb, zram_max_sigkill_mb, zram_max_sigkill_percent = \
    calculate_percent('zram_max_sigkill')
mem_min_warnings_kb, mem_min_warnings_mb, mem_min_warnings_percent = \
    calculate_percent('mem_min_warnings')
zram_max_warnings_kb, zram_max_warnings_mb, zram_max_warnings_percent = \
    calculate_percent('zram_max_warnings')
# rate_mem: required, float, must be > 0
if 'rate_mem' not in config_dict:
    errprint('rate_mem not in config\nExit')
    exit(1)
rate_mem = string_to_float_convert_test(config_dict['rate_mem'])
if rate_mem is None:
    errprint('Invalid rate_mem value, not float\nExit')
    exit(1)
elif rate_mem <= 0:
    errprint('rate_mem MUST be > 0\nExit')
    exit(1)

# rate_swap: required, float, must be > 0
if 'rate_swap' not in config_dict:
    errprint('rate_swap not in config\nExit')
    exit(1)
rate_swap = string_to_float_convert_test(config_dict['rate_swap'])
if rate_swap is None:
    errprint('Invalid rate_swap value, not float\nExit')
    exit(1)
elif rate_swap <= 0:
    errprint('rate_swap MUST be > 0\nExit')
    exit(1)

# rate_zram: required, float, must be > 0
if 'rate_zram' not in config_dict:
    errprint('rate_zram not in config\nExit')
    exit(1)
rate_zram = string_to_float_convert_test(config_dict['rate_zram'])
if rate_zram is None:
    errprint('Invalid rate_zram value, not float\nExit')
    exit(1)
elif rate_zram <= 0:
    errprint('rate_zram MUST be > 0\nExit')
    exit(1)

# swap_min_sigterm: required, kept as the raw config string
if 'swap_min_sigterm' not in config_dict:
    errprint('swap_min_sigterm not in config\nExit')
    exit(1)
swap_min_sigterm = config_dict['swap_min_sigterm']

# swap_min_sigkill: required, kept as the raw config string
if 'swap_min_sigkill' not in config_dict:
    errprint('swap_min_sigkill not in config\nExit')
    exit(1)
swap_min_sigkill = config_dict['swap_min_sigkill']
# min_delay_after_sigterm: required, float, negative values are rejected
if 'min_delay_after_sigterm' not in config_dict:
    errprint('min_delay_after_sigterm not in config\nExit')
    exit(1)
min_delay_after_sigterm = string_to_float_convert_test(
    config_dict['min_delay_after_sigterm'])
if min_delay_after_sigterm is None:
    errprint('Invalid min_delay_after_sigterm value, not float\nExit')
    exit(1)
elif min_delay_after_sigterm < 0:
    errprint('min_delay_after_sigterm must be positiv\nExit')
    exit(1)

# psi_post_action_delay: required, float, negative values are rejected
if 'psi_post_action_delay' not in config_dict:
    errprint('psi_post_action_delay not in config\nExit')
    exit(1)
psi_post_action_delay = string_to_float_convert_test(
    config_dict['psi_post_action_delay'])
if psi_post_action_delay is None:
    errprint('Invalid psi_post_action_delay value, not float\nExit')
    exit(1)
elif psi_post_action_delay < 0:
    errprint('psi_post_action_delay must be positive\nExit')
    exit(1)

# sigkill_psi_threshold: required, float in [0; 100]
if 'sigkill_psi_threshold' not in config_dict:
    errprint('sigkill_psi_threshold not in config\nExit')
    exit(1)
sigkill_psi_threshold = string_to_float_convert_test(
    config_dict['sigkill_psi_threshold'])
if sigkill_psi_threshold is None:
    errprint('Invalid sigkill_psi_threshold value, not float\nExit')
    exit(1)
elif sigkill_psi_threshold < 0 or sigkill_psi_threshold > 100:
    errprint('sigkill_psi_threshold must be in the range [0; 100]\nExit')
    exit(1)

# sigterm_psi_threshold: required, float in [0; 100]
if 'sigterm_psi_threshold' not in config_dict:
    errprint('sigterm_psi_threshold not in config\nExit')
    exit(1)
sigterm_psi_threshold = string_to_float_convert_test(
    config_dict['sigterm_psi_threshold'])
if sigterm_psi_threshold is None:
    errprint('Invalid sigterm_psi_threshold value, not float\nExit')
    exit(1)
elif sigterm_psi_threshold < 0 or sigterm_psi_threshold > 100:
    errprint('sigterm_psi_threshold must be in the range [0; 100]\nExit')
    exit(1)

# psi_avg_warnings: required, float in [0; 100]
if 'psi_avg_warnings' not in config_dict:
    errprint('psi_avg_warnings not in config\nExit')
    exit(1)
psi_avg_warnings = string_to_float_convert_test(
    config_dict['psi_avg_warnings'])
if psi_avg_warnings is None:
    errprint('Invalid psi_avg_warnings value, not float\nExit')
    exit(1)
elif psi_avg_warnings < 0 or psi_avg_warnings > 100:
    errprint('psi_avg_warnings must be in the range [0; 100]\nExit')
    exit(1)
# min_badness: required, integer in [0; 1000]
if 'min_badness' not in config_dict:
    errprint('min_badness not in config\nExit')
    exit(1)
min_badness = string_to_int_convert_test(
    config_dict['min_badness'])
if min_badness is None:
    errprint('Invalid min_badness value, not integer\nExit')
    exit(1)
elif min_badness < 0 or min_badness > 1000:
    errprint('Invalud min_badness value\nExit')
    exit(1)

# oom_score_adj_max: required, integer in [0; 1000]
if 'oom_score_adj_max' not in config_dict:
    errprint('oom_score_adj_max not in config\nExit')
    exit(1)
oom_score_adj_max = string_to_int_convert_test(
    config_dict['oom_score_adj_max'])
if oom_score_adj_max is None:
    errprint('Invalid oom_score_adj_max value, not integer\nExit')
    exit(1)
elif oom_score_adj_max < 0 or oom_score_adj_max > 1000:
    errprint('Invalid oom_score_adj_max value\nExit')
    exit(1)

# min_time_between_warnings: required, float in [1; 300]
if 'min_time_between_warnings' not in config_dict:
    errprint('min_time_between_warnings not in config\nExit')
    exit(1)
min_time_between_warnings = string_to_float_convert_test(
    config_dict['min_time_between_warnings'])
if min_time_between_warnings is None:
    errprint('Invalid min_time_between_warnings value, not float\nExit')
    exit(1)
elif min_time_between_warnings < 1 or min_time_between_warnings > 300:
    errprint('min_time_between_warnings value out of range [1; 300]\nExit')
    exit(1)

# swap_min_warnings: required, kept as the raw config string
if 'swap_min_warnings' not in config_dict:
    errprint('swap_min_warnings not in config\nExit')
    exit(1)
swap_min_warnings = config_dict['swap_min_warnings']
# max_ancestry_depth: required, integer, must be >= 1
if 'max_ancestry_depth' in config_dict:
    max_ancestry_depth = string_to_int_convert_test(
        config_dict['max_ancestry_depth'])
    # Check the value that was just parsed. Testing min_badness here let a
    # non-integer value through to the comparison below, where None < 1
    # raised TypeError instead of printing the error message.
    if max_ancestry_depth is None:
        errprint('Invalid max_ancestry_depth value, not integer\nExit')
        exit(1)
    if max_ancestry_depth < 1:
        errprint('Invalud max_ancestry_depth value\nExit')
        exit(1)
else:
    errprint('max_ancestry_depth is not in config\nExit')
    exit(1)
# max_post_sigterm_victim_lifetime: required, non-negative float
if 'max_post_sigterm_victim_lifetime' not in config_dict:
    errprint('max_post_sigterm_victim_lifetime is not in config\nExit')
    exit(1)
max_post_sigterm_victim_lifetime = string_to_float_convert_test(
    config_dict['max_post_sigterm_victim_lifetime'])
if max_post_sigterm_victim_lifetime is None:
    errprint('Invalid max_post_sigterm_victim_lifetime val'
             'ue, not float\nExit')
    exit(1)
elif max_post_sigterm_victim_lifetime < 0:
    errprint('max_post_sigterm_victim_lifetime must be non-n'
             'egative number\nExit')
    exit(1)

# post_kill_exe: required, raw string (may be empty)
if 'post_kill_exe' not in config_dict:
    errprint('post_kill_exe is not in config\nExit')
    exit(1)
post_kill_exe = config_dict['post_kill_exe']

# psi_path: required, raw string
if 'psi_path' not in config_dict:
    errprint('psi_path is not in config\nExit')
    exit(1)
psi_path = config_dict['psi_path']

# psi_metrics: required, raw string
if 'psi_metrics' not in config_dict:
    errprint('psi_metrics is not in config\nExit')
    exit(1)
psi_metrics = config_dict['psi_metrics']

# warning_exe: required; an empty value switches check_warning_exe off
if 'warning_exe' not in config_dict:
    errprint('warning_exe is not in config\nExit')
    exit(1)
warning_exe = config_dict['warning_exe']
check_warning_exe = warning_exe != ''

# extra_table_info: required, one of a fixed set of values
if 'extra_table_info' not in config_dict:
    errprint('Invalid config: extra_table_info is not in config\nExit')
    exit(1)
extra_table_info = config_dict['extra_table_info']
if extra_table_info not in ('None', 'cgroup_v1', 'cgroup_v2', 'cmdline',
                            'environ', 'realpath', 'All'):
    errprint('Invalid config: invalid extra_table_info value\nExit')
    exit(1)
separate_log = conf_parse_bool('separate_log')

if separate_log:
    # logging is imported only when a separate log file was requested
    import logging
    from logging import basicConfig, info

    log_dir = '/var/log/nohang'
    logfile = log_dir + '/nohang.log'

    # create the log directory; an already existing one is fine
    try:
        os.mkdir(log_dir)
    except FileExistsError:
        pass
    except PermissionError:
        print('ERROR: can not create log dir')

    # check that the log file can be opened for appending
    try:
        with open(logfile, 'a') as f:
            pass
    except PermissionError:
        print('ERROR: log PermissionError')
    except FileNotFoundError:
        print('ERROR: log FileNotFoundError')

    try:
        basicConfig(
            filename=logfile,
            level=logging.INFO,
            format="%(asctime)s: %(message)s")
    except FileNotFoundError:
        errprint('ERROR: FileNotFoundError: {}'.format(logfile))
    except PermissionError:
        errprint('ERROR: Permission denied: {}'.format(logfile))
# Required option: min_mem_report_interval (non-negative float).
if 'min_mem_report_interval' not in config_dict:
    errprint('min_mem_report_interval is not in config\nExit')
    exit(1)

min_mem_report_interval = string_to_float_convert_test(
    config_dict['min_mem_report_interval'])

if min_mem_report_interval is None:
    errprint('Invalid min_mem_report_interval value, not float\nExit')
    exit(1)

if min_mem_report_interval < 0:
    errprint('min_mem_report_interval must be non-negative number\nExit')
    exit(1)


# Required option: psi_excess_duration (non-negative float).
if 'psi_excess_duration' not in config_dict:
    errprint('psi_excess_duration is not in config\nExit')
    exit(1)

psi_excess_duration = string_to_float_convert_test(
    config_dict['psi_excess_duration'])

if psi_excess_duration is None:
    errprint('Invalid psi_excess_duration value, not float\nExit')
    exit(1)

if psi_excess_duration < 0:
    errprint('psi_excess_duration must be non-negative number\nExit')
    exit(1)
# Required options max_sleep, min_sleep and over_sleep. Each must be
# present, convertible to float and strictly positive; otherwise the
# problem is reported and the program exits with code 1.

if 'max_sleep' not in config_dict:
    errprint('max_sleep is not in config\nExit')
    exit(1)

max_sleep = string_to_float_convert_test(config_dict['max_sleep'])

if max_sleep is None:
    errprint('Invalid max_sleep value, not float\nExit')
    exit(1)

if max_sleep <= 0:
    errprint('max_sleep must be positive number\nExit')
    exit(1)


if 'min_sleep' not in config_dict:
    errprint('min_sleep is not in config\nExit')
    exit(1)

min_sleep = string_to_float_convert_test(config_dict['min_sleep'])

if min_sleep is None:
    errprint('Invalid min_sleep value, not float\nExit')
    exit(1)

if min_sleep <= 0:
    errprint('min_sleep must be positive number\nExit')
    exit(1)


if 'over_sleep' not in config_dict:
    errprint('over_sleep is not in config\nExit')
    exit(1)

over_sleep = string_to_float_convert_test(config_dict['over_sleep'])

if over_sleep is None:
    errprint('Invalid over_sleep value, not float\nExit')
    exit(1)

if over_sleep <= 0:
    errprint('over_sleep must be positive number\nExit')
    exit(1)
# Cross-check the three sleep periods. The configuration is accepted only
# when over_sleep <= min_sleep <= max_sleep. The error messages name the
# option that is too large (the previous wording stated the opposite of
# the condition being tested).
if max_sleep < min_sleep:
    errprint('min_sleep value must not exceed max_sleep value.\nExit')
    exit(1)

if min_sleep < over_sleep:
    errprint('over_sleep value must not exceed min_sleep value.\nExit')
    exit(1)

# True when the minimum and maximum sleep periods coincide
stable_sleep = max_sleep == min_sleep
# Process-table mode: warn first when not running with euid 0, then hand
# over to func_print_proc_table().
if print_proc_table_flag:

    if not root:
        log('WARNING: effective UID != 0; euid={}; processes with '
            'other euids will be invisible for nohang'.format(self_uid))

    func_print_proc_table()
##########################################################################
# PSI is considered supported when the configured PSI file exists
psi_support = os.path.exists(psi_path)

##########################################################################

# Resolve the swap thresholds. get_swap_threshold_tuple() returns
# (value, is_percent): a percentage with True, or an absolute KiB value
# with False. Absolute values are additionally collected in swap_kb_dict;
# percentage values are stored under the *_percent names.

swap_min_sigterm_tuple = get_swap_threshold_tuple(swap_min_sigterm)
swap_min_sigkill_tuple = get_swap_threshold_tuple(swap_min_sigkill)
swap_min_warnings_tuple = get_swap_threshold_tuple(swap_min_warnings)

swap_kb_dict = {}

swap_term_is_percent = swap_min_sigterm_tuple[1]
if not swap_term_is_percent:
    swap_min_sigterm_kb = swap_min_sigterm_tuple[0]
    swap_kb_dict['swap_min_sigterm_kb'] = swap_min_sigterm_kb
else:
    swap_min_sigterm_percent = swap_min_sigterm_tuple[0]

swap_kill_is_percent = swap_min_sigkill_tuple[1]
if not swap_kill_is_percent:
    swap_min_sigkill_kb = swap_min_sigkill_tuple[0]
    swap_kb_dict['swap_min_sigkill_kb'] = swap_min_sigkill_kb
else:
    swap_min_sigkill_percent = swap_min_sigkill_tuple[0]

swap_warn_is_percent = swap_min_warnings_tuple[1]
if not swap_warn_is_percent:
    swap_min_warnings_kb = swap_min_warnings_tuple[0]
    swap_kb_dict['swap_min_warnings_kb'] = swap_min_warnings_kb
else:
    swap_min_warnings_percent = swap_min_warnings_tuple[0]
##########################################################################
# Dump the effective configuration to the log when print_config is set.
# Sections 5.1-5.7 are table-driven so that every section tests and prints
# its own pattern list. Previously section 5.3 printed the cgroup_v1 list
# and sections 5.4-5.7 were gated on the cgroup_v2 list being non-empty.
if print_config:

    log('#' * 79)

    log('0. Common zram settings')
    log(' ignore_zram: {}'.format(ignore_zram))

    log('1. Thresholds below which a signal should be sent to the victim')
    log(' mem_min_sigterm: {} MiB, {} %'.format(
        round(mem_min_sigterm_mb), round(mem_min_sigterm_percent, 1)))
    log(' mem_min_sigkill: {} MiB, {} %'.format(
        round(mem_min_sigkill_mb), round(mem_min_sigkill_percent, 1)))
    log(' swap_min_sigterm: {}'.format(swap_min_sigterm))
    log(' swap_min_sigkill: {}'.format(swap_min_sigkill))
    log(' zram_max_sigterm: {} MiB, {} %'.format(
        round(zram_max_sigterm_mb), round(zram_max_sigterm_percent, 1)))
    log(' zram_max_sigkill: {} MiB, {} %'.format(
        round(zram_max_sigkill_mb), round(zram_max_sigkill_percent, 1)))

    log('2. Response on PSI memory metrics')
    log(' ignore_psi: {}'.format(ignore_psi))
    log(' psi_path: {}'.format(psi_path))
    log(' psi_metrics: {}'.format(psi_metrics))
    log(' sigterm_psi_threshold: {}'.format(sigterm_psi_threshold))
    log(' sigkill_psi_threshold: {}'.format(sigkill_psi_threshold))
    log(' psi_excess_duration: {} sec'.format(psi_excess_duration))
    log(' psi_post_action_delay: {} sec'.format(psi_post_action_delay))

    log('3. The frequency of checking the amount of available memory')
    log(' rate_mem: {}'.format(rate_mem))
    log(' rate_swap: {}'.format(rate_swap))
    log(' rate_zram: {}'.format(rate_zram))
    log(' max_sleep: {} sec'.format(max_sleep))
    log(' min_sleep: {} sec'.format(min_sleep))
    log(' over_sleep: {} sec'.format(over_sleep))

    log('4. The prevention of killing innocent victims')
    log(' min_badness: {}'.format(min_badness))
    log(' min_delay_after_sigterm: {} sec'.format(min_delay_after_sigterm))
    log(' decrease_oom_score_adj: {}'.format(decrease_oom_score_adj))
    log(' oom_score_adj_max: {}'.format(oom_score_adj_max))

    log('5. Impact on the badness of processes')

    # (section title, pattern list) pairs. Each list entry is printed as
    # entry[1] followed by entry[0] under the 'regexp: badness_adj:'
    # header, i.e. entries are presumably (badness_adj, regexp) pairs.
    badness_adj_sections = (
        ('5.1. Matching process names with RE patterns',
         badness_adj_re_name_list),
        ('5.2. Matching CGroup_v1-line with RE patterns',
         badness_adj_re_cgroup_v1_list),
        ('5.3. Matching CGroup_v2-line with RE patterns',
         badness_adj_re_cgroup_v2_list),
        ('5.4. Matching eUIDs with RE patterns',
         badness_adj_re_uid_list),
        ('5.5. Matching realpath with RE patterns',
         badness_adj_re_realpath_list),
        ('5.6. Matching cmdlines with RE patterns',
         badness_adj_re_cmdline_list),
        ('5.7. Matching environ with RE patterns',
         badness_adj_re_environ_list),
    )

    for section_title, pattern_list in badness_adj_sections:
        log(section_title)
        if len(pattern_list) > 0:
            log(' regexp: badness_adj:')
            for i in pattern_list:
                log(' {} {}'.format(i[1], i[0]))
        else:
            log(' (not set)')

    log('6. Customize corrective actions.')
    if len(soft_actions_list) > 0:
        log(' Match by: regexp: command: ')
        for i in soft_actions_list:
            log(' {} {} {}'.format(i[0], i[1], i[2]))
    else:
        log(' (not set)')

    log('7. GUI notifications')
    log(' gui_notifications: {}'.format(gui_notifications))
    log(' gui_low_memory_warnings: {}'.format(gui_low_memory_warnings))
    log(' warning_exe: {}'.format(warning_exe))
    log(' mem_min_warnings: {} MiB, {} %'.format(
        round(mem_min_warnings_mb), round(mem_min_warnings_percent, 1)))
    log(' swap_min_warnings: {}'.format(swap_min_warnings))
    log(' zram_max_warnings: {} MiB, {} %'.format(
        round(zram_max_warnings_mb), round(zram_max_warnings_percent, 1)))
    log(' psi_avg_warnings: {}'.format(psi_avg_warnings))
    log(' min_time_between_warnings: {}'.format(min_time_between_warnings))

    log('8. Verbosity')
    log(' print_config: {}'.format(print_config))
    log(' print_mem_check_results: {}'.format(print_mem_check_results))
    log(' min_mem_report_interval: {}'.format(min_mem_report_interval))
    log(' print_sleep_periods: {}'.format(print_sleep_periods))
    log(' print_total_stat: {}'.format(print_total_stat))
    log(' print_proc_table: {}'.format(print_proc_table))
    log(' extra_table_info: {}'.format(extra_table_info))
    log(' print_victim_info: {}'.format(print_victim_info))
    log(' print_victim_cmdline: {}'.format(print_victim_cmdline))
    log(' max_ancestry_depth: {}'.format(max_ancestry_depth))
    log(' separate_log: {}'.format(separate_log))
    log(' psi_debug: {}'.format(psi_debug))

    log('9. Misc')
    log(' max_post_sigterm_victim_lifetime: {} sec'.format(
        max_post_sigterm_victim_lifetime))
    log(' post_kill_exe: {}'.format(post_kill_exe))
    log(' forbid_negative_badness: {}'.format(
        forbid_negative_badness))

    log('#' * 79)
##########################################################################
# Column width used when printing mem and zram sizes: the number of
# digits in mem_total divided by 1024 and rounded.
mem_len = len(str(round(mem_total / 1024.0)))

# Signal -> verb mapping, defined only when GUI notifications are enabled
if gui_notifications:
    notify_sig_dict = {
        SIGKILL: 'Killing',
        SIGTERM: 'Terminating',
    }

# convert rates from MiB/s to KiB/s
rate_mem *= 1024
rate_swap *= 1024
rate_zram *= 1024

# Initial state of the low-memory warning timer
warn_time_now = warn_timer = 0
warn_time_delta = 1000
##########################################################################
# Final preparations before the monitoring loop starts.
if not root:
    log('WARNING: effective UID != 0; euid={}; processes with '
        'other euids will be invisible for nohang'.format(self_uid))

# Try to lock all memory
mlockall()

##########################################################################

# print_self_rss()

psi_avg_string = ''  # will be overwritten if PSI monitoring enabled
mem_used_zram = 0

if print_mem_check_results:
    # wt2 and new_mem are used to find delta mem,
    # report0 initialises the mem report interval
    wt2 = new_mem = report0 = 0

# handle signals
for i in sig_list:
    signal(i, signal_handler)

x0 = time()
delta0 = 0
threshold = mem_info = None

# PSI is checked only when the PSI file exists and PSI is not ignored
CHECK_PSI = bool(psi_support and not ignore_psi)

psi_kill_exceeded_timer = psi_term_exceeded_timer = 0
psi_t0 = time()

# Defaults that stay in effect when the zram/PSI checks are disabled
psi_threshold = zram_threshold = zram_info = psi_info = None

CHECK_ZRAM = not ignore_zram

log('Monitoring has started!')
stdout.flush()
##########################################################################
# Main monitoring loop. Each iteration:
#   1. runs the mem/swap check and, when enabled, the zram and PSI checks;
#   2. optionally logs a memory report (at most once per
#      min_mem_report_interval);
#   3. calls implement_corrective_action() when any check returned SIGKILL
#      (checked first) or SIGTERM, then restarts the loop without sleeping
#      here;
#   4. otherwise may send a low-memory warning, and finally sleeps via
#      sleep_after_check_mem().
while True:

    (masf_threshold, masf_info, mem_available, swap_min_sigkill_kb,
     swap_min_sigterm_kb, swap_free, swap_total) = check_mem_swap_ex()

    if CHECK_ZRAM:
        zram_threshold, zram_info, mem_used_zram = check_zram_ex()

    if CHECK_PSI:
        (psi_threshold, psi_info, psi_t0, psi_kill_exceeded_timer,
         psi_term_exceeded_timer, x0) = check_psi_ex(
            psi_t0, psi_kill_exceeded_timer, psi_term_exceeded_timer, x0)

    if print_mem_check_results:

        if CHECK_PSI:
            psi_avg_value = find_psi_metrics_value(psi_path, psi_metrics)
            psi_post_action_delay_exceeded = (
                time() - psi_t0 >= psi_post_action_delay)
            psi_avg_string = 'PSI avg: {} | '.format(
                str(psi_avg_value).rjust(6))

        wt1 = time()
        delta = (mem_available + swap_free) - new_mem
        t_cycle = wt1 - wt2
        report_delta = wt1 - report0

        # A report is due once enough time has passed since the last one
        mem_report = report_delta >= min_mem_report_interval
        if mem_report:
            new_mem = mem_available + swap_free
            report0 = wt1

        wt2 = time()

        if mem_report:

            speed = delta / 1024.0 / report_delta
            speed_info = ' | dMem: {} M/s'.format(str(round(speed)).rjust(5))

            # Calculate 'swap-column' width
            swap_len = len(str(round(swap_total / 1024.0)))

            avail_text = human(mem_available, mem_len)
            avail_share = just_percent_mem(mem_available / mem_total)
            zram_idle = mem_used_zram == 0

            # Output available mem sizes; swap and zram columns are added
            # only when they are relevant
            if zram_idle and swap_total == 0:
                report_line = '{}MemAvail: {} M, {} %{}'.format(
                    psi_avg_string,
                    avail_text,
                    avail_share,
                    speed_info)
            elif zram_idle and swap_total > 0:
                report_line = (
                    '{}MemAvail: {} M, {} % | SwapFree: {} M, {} %{}'.format(
                        psi_avg_string,
                        avail_text,
                        avail_share,
                        human(swap_free, swap_len),
                        just_percent_swap(swap_free / (swap_total + 0.1)),
                        speed_info))
            else:
                report_line = (
                    '{}MemAvail: {} M, {} % | SwapFree: {} M, {} % | Mem'
                    'UsedZram: {} M, {} %{}'.format(
                        psi_avg_string,
                        avail_text,
                        avail_share,
                        human(swap_free, swap_len),
                        just_percent_swap(swap_free / (swap_total + 0.1)),
                        human(mem_used_zram, mem_len),
                        just_percent_mem(mem_used_zram / mem_total),
                        speed_info))

            log(report_line)

    # Pick the strongest requested action: SIGKILL wins over SIGTERM
    check_results = (masf_threshold, zram_threshold, psi_threshold)
    if any(level is SIGKILL for level in check_results):
        requested_signal = SIGKILL
    elif any(level is SIGTERM for level in check_results):
        requested_signal = SIGTERM
    else:
        requested_signal = None

    if requested_signal is not None:
        threshold = requested_signal
        # Pass along only the info strings that the checks produced
        mem_info_list = [
            info for info in (masf_info, zram_info, psi_info)
            if info is not None
        ]
        psi_t0 = implement_corrective_action(
            threshold,
            mem_info_list,
            psi_t0,
            psi_kill_exceeded_timer,
            psi_term_exceeded_timer,
            x0, psi_threshold, zram_threshold, zram_info, psi_info)
        continue

    if gui_low_memory_warnings and any(
            level == 'WARN' for level in check_results):
        # Accumulate elapsed time and notify once it exceeds the
        # configured minimum interval between warnings
        warn_time_delta = time() - warn_time_now
        warn_time_now = time()
        warn_timer += warn_time_delta
        if warn_timer > min_time_between_warnings:
            send_notify_warn()
            warn_timer = 0

    sleep_after_check_mem()