3370 lines
96 KiB
Python
Executable File
3370 lines
96 KiB
Python
Executable File
#!/usr/bin/env python3
|
||
"""A daemon that prevents OOM in Linux systems."""
|
||
|
||
import os
|
||
from ctypes import CDLL
|
||
from time import sleep, time
|
||
from operator import itemgetter
|
||
from sys import stdout, stderr, argv, exit
|
||
from re import search
|
||
from sre_constants import error as invalid_re
|
||
from signal import signal, SIGKILL, SIGTERM, SIGINT, SIGQUIT, SIGHUP
|
||
|
||
|
||
##########################################################################
|
||
|
||
# define functions
|
||
|
||
|
||
|
||
|
||
def _log_badness_adj_list(re_list):
    """Log every (badness_adj, regexp) pair of re_list, or '(not set)'."""
    if re_list:
        log(' regexp: badness_adj:')
        for i in re_list:
            log(' {} {}'.format(i[1], i[0]))
    else:
        log(' (not set)')


def check_config():
    """Log the effective configuration, section by section.

    Every value is read from module-level configuration variables.
    If check_config_flag is set, 'config is OK' is logged afterwards and
    the program exits.
    """
    log('#' * 79)

    log('0. Common zram settings')

    log(' zram_checking_enabled: {}'.format(zram_checking_enabled))

    log('1. Thresholds below which a signal should be sent to the victim')

    log(' soft_threshold_min_mem: {} MiB, {} %'.format(
        round(soft_threshold_min_mem_mb),
        round(soft_threshold_min_mem_percent, 1)))
    log(' hard_threshold_min_mem: {} MiB, {} %'.format(
        round(hard_threshold_min_mem_mb),
        round(hard_threshold_min_mem_percent, 1)))
    log(' soft_threshold_min_swap: {}'.format(soft_threshold_min_swap))
    log(' hard_threshold_min_swap: {}'.format(hard_threshold_min_swap))
    log(' soft_threshold_max_zram: {} MiB, {} %'.format(
        round(soft_threshold_max_zram_mb),
        round(soft_threshold_max_zram_percent, 1)))
    log(' hard_threshold_max_zram: {} MiB, {} %'.format(
        round(hard_threshold_max_zram_mb),
        round(hard_threshold_max_zram_percent, 1)))

    log('2. Response on PSI memory metrics')

    log(' psi_checking_enabled: {}'.format(psi_checking_enabled))
    log(' psi_path: {}'.format(psi_path))
    log(' psi_metrics: {}'.format(psi_metrics))
    log(' soft_threshold_max_psi: {}'.format(soft_threshold_max_psi))
    log(' hard_threshold_max_psi: {}'.format(hard_threshold_max_psi))
    log(' psi_excess_duration: {} sec'.format(psi_excess_duration))
    log(' psi_post_action_delay: {} sec'.format(psi_post_action_delay))

    log('3. The frequency of checking the amount of available memory')

    log(' fill_rate_mem: {}'.format(fill_rate_mem))
    log(' fill_rate_swap: {}'.format(fill_rate_swap))
    log(' fill_rate_zram: {}'.format(fill_rate_zram))
    log(' max_sleep: {} sec'.format(max_sleep))
    log(' min_sleep: {} sec'.format(min_sleep))
    log(' over_sleep: {} sec'.format(over_sleep))

    log('4. The prevention of killing innocent victims')

    log(' min_badness: {}'.format(min_badness))
    log(' post_soft_action_delay: {} sec'.format(post_soft_action_delay))
    log(' post_zombie_delay: {} sec'.format(post_zombie_delay))
    log(' victim_cache_time: {} sec'.format(victim_cache_time))
    log(' ignore_positive_oom_score_adj: {}'.format(
        ignore_positive_oom_score_adj))

    log('5. Impact on the badness of processes')

    # Each subsection checks and prints its own list.
    log('5.1. Matching process names with RE patterns')
    _log_badness_adj_list(badness_adj_re_name_list)

    log('5.2. Matching CGroup_v1-line with RE patterns')
    _log_badness_adj_list(badness_adj_re_cgroup_v1_list)

    log('5.3. Matching CGroup_v2-line with RE patterns')
    _log_badness_adj_list(badness_adj_re_cgroup_v2_list)

    log('5.4. Matching eUIDs with RE patterns')
    _log_badness_adj_list(badness_adj_re_uid_list)

    log('5.5. Matching realpath with RE patterns')
    _log_badness_adj_list(badness_adj_re_realpath_list)

    log('5.6. Matching cmdlines with RE patterns')
    _log_badness_adj_list(badness_adj_re_cmdline_list)

    log('5.7. Matching environ with RE patterns')
    _log_badness_adj_list(badness_adj_re_environ_list)

    log('6. Customize corrective actions')

    if soft_actions_list:
        log(' Match by: regexp: command: ')
        for i in soft_actions_list:
            log(' {} {} {}'.format(i[0], i[1], i[2]))
    else:
        log(' (not set)')

    log('7. GUI notifications')

    log(' post_action_gui_notifications: {}'.format(
        post_action_gui_notifications))
    log(' low_memory_warnings_enabled: {}'.format(
        low_memory_warnings_enabled))
    log(' warning_exe: {}'.format(warning_exe))
    log(' warning_threshold_min_mem: {} MiB, {} %'.format(
        round(warning_threshold_min_mem_mb),
        round(warning_threshold_min_mem_percent, 1)))
    log(' warning_threshold_min_swap: {}'.format(warning_threshold_min_swap))
    log(' warning_threshold_max_zram: {} MiB, {} %'.format(
        round(warning_threshold_max_zram_mb),
        round(warning_threshold_max_zram_percent, 1)))
    log(' warning_threshold_max_psi: {}'.format(warning_threshold_max_psi))
    log(' min_post_warning_delay: {} sec'.format(min_post_warning_delay))

    log('8. Verbosity')

    log(' print_config_at_startup: {}'.format(print_config_at_startup))
    log(' print_mem_check_results: {}'.format(print_mem_check_results))
    log(' min_mem_report_interval: {} sec'.format(min_mem_report_interval))
    log(' debug_sleep: {}'.format(debug_sleep))
    log(' print_statistics: {}'.format(print_statistics))
    log(' print_proc_table: {}'.format(print_proc_table))
    log(' extra_table_info: {}'.format(extra_table_info))
    log(' print_victim_status: {}'.format(print_victim_status))
    log(' print_victim_cmdline: {}'.format(print_victim_cmdline))
    log(' max_victim_ancestry_depth: {}'.format(max_victim_ancestry_depth))
    log(' debug_gui_notifications: {}'.format(debug_gui_notifications))
    log(' separate_log: {}'.format(separate_log))
    log(' debug_psi: {}'.format(debug_psi))

    log('9. Misc')

    log(' max_soft_exit_time: {} sec'.format(max_soft_exit_time))
    log(' post_kill_exe: {}'.format(post_kill_exe))
    log(' forbid_negative_badness: {}'.format(forbid_negative_badness))

    log('#' * 79)

    if check_config_flag:
        log('config is OK')
        exit()
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
def encoder(string):
    """Encode a string as the code points of its characters joined by ':'.

    string: str to encode
    returns str, e.g. 'ab' -> '97:98'; an empty string yields ''
    """
    return ':'.join(str(ord(char)) for char in string)
|
||
|
||
|
||
def get_swap_threshold_tuple(string):
    """Parse a swap threshold given in percent or in mebibytes.

    string: config value ending with '%' (percent) or 'M' (MiB)
    returns (percent, True) for '%' values or (KiB, False) for 'M' values

    Prints an error to stderr and exits with status 1 if the unit is
    unknown, the number cannot be parsed, or the value is out of range
    (NaN and infinite values are rejected too).
    """
    if string.endswith('%'):
        try:
            value = float(string[:-1].strip())
        except ValueError:
            errprint('somewhere swap unit is not float_%')
            exit(1)

        # The chained comparison is False for NaN, so NaN is rejected too.
        if not 0 <= value <= 100:
            errprint('invalid value, must be from the range[0; 100] %')
            exit(1)

        return value, True

    elif string.endswith('M'):
        try:
            value = float(string[:-1].strip()) * 1024
        except ValueError:
            errprint('somewhere swap unit is not float_M')
            exit(1)

        # Rejects negative values as well as NaN and infinity.
        if not 0 <= value < float('inf'):
            errprint('invalid unit in config (negative value)')
            exit(1)

        return value, False

    else:
        errprint(
            'Invalid config file. There are invalid units somewhere\nExit')
        exit(1)
|
||
|
||
|
||
def find_cgroup_indexes():
    """Locate the cgroup v1 and v2 lines in /proc/self/cgroup.

    returns (cgroup_v1_index, cgroup_v2_index): zero-based numbers of the
    last line containing ':name=' and of the last line starting with
    '0::'; either is None when no such line exists
    """
    v1_pos = None
    v2_pos = None

    with open('/proc/self/cgroup') as cgroup_file:
        lines = cgroup_file.readlines()

    for pos, text in enumerate(lines):
        if ':name=' in text:
            v1_pos = pos
        if text.startswith('0::'):
            v2_pos = pos

    return v1_pos, v2_pos
|
||
|
||
|
||
def pid_to_rss(pid):
    """Return the resident set size of a process.

    pid: pid of the process (inserted into the path with str.format)
    returns the second field of /proc/[pid]/statm multiplied by
    SC_PAGESIZE, or None if the process is gone or the field is missing
    """
    statm_path = '/proc/{}/statm'.format(pid)
    try:
        resident_pages = rline1(statm_path).split(' ')[1]
        return int(resident_pages) * SC_PAGESIZE
    except (IndexError, FileNotFoundError, ProcessLookupError):
        return None
|
||
|
||
|
||
def pid_to_vm_size(pid):
    """Return the virtual memory size of a process.

    pid: pid of the process (inserted into the path with str.format)
    returns the first field of /proc/[pid]/statm multiplied by
    SC_PAGESIZE, or None if the process is gone
    """
    statm_path = '/proc/{}/statm'.format(pid)
    try:
        total_pages = rline1(statm_path).partition(' ')[0]
        return int(total_pages) * SC_PAGESIZE
    except (IndexError, FileNotFoundError, ProcessLookupError):
        return None
|
||
|
||
|
||
def signal_handler(signum, frame):
    """Handle a termination signal: log it, print statistics and exit.

    The handlers of all signals in sig_list are first replaced with
    signal_handler_inner, so signals arriving during shutdown are only
    logged.

    signum: number of the received signal (a key of sig_dict)
    frame: current stack frame (unused)
    """
    for sig in sig_list:
        signal(sig, signal_handler_inner)

    signal_name = sig_dict[signum]
    log('Signal handler called with the {} signal '.format(signal_name))
    update_stat_dict_and_print(None)
    log('Exit')
    exit()
|
||
|
||
|
||
def signal_handler_inner(signum, frame):
    """Log a received signal without acting on it.

    signum: number of the received signal (a key of sig_dict)
    frame: current stack frame (unused)
    """
    signal_name = sig_dict[signum]
    log('Signal handler called with the {} signal (ignored) '.format(
        signal_name))
|
||
|
||
|
||
def exe(cmd):
    """Run a shell command with os.system and log its status and duration.

    The daemon's own oom_score_adj is set to self_oom_score_adj_max
    before the call and to self_oom_score_adj_min after it (via
    write_self_oom_score_adj).

    cmd: str command line passed to os.system
    returns the value returned by os.system
    """
    log('Execute the command: {}'.format(cmd))
    started = time()
    write_self_oom_score_adj(self_oom_score_adj_max)
    status = os.system(cmd)
    write_self_oom_score_adj(self_oom_score_adj_min)
    elapsed = time() - started
    log('Exit status: {}; exe duration: {} sec'.format(
        status, round(elapsed, 3)))
    return status
|
||
|
||
|
||
def write(path, string):
    """Overwrite the file at path with string.

    path: path of the file, opened in text mode for writing
    string: str to write
    """
    with open(path, 'w') as target:
        target.write(string)
|
||
|
||
|
||
def write_self_oom_score_adj(new_value):
    """Write new_value to /proc/self/oom_score_adj when running as root.

    new_value: str to write; nothing is done if the module-level flag
    `root` is false
    """
    if not root:
        return
    write('/proc/self/oom_score_adj', new_value)
|
||
|
||
|
||
def valid_re(reg_exp):
    """Check that reg_exp is a valid regular expression.

    reg_exp: str pattern taken from the config
    Logs a message and exits with status 1 if the pattern is rejected by
    re.search; returns None otherwise.
    """
    try:
        search(reg_exp, '')
    except invalid_re:
        message = 'Invalid config: invalid regexp: {}'.format(reg_exp)
        log(message)
        exit(1)
|
||
|
||
|
||
def func_print_proc_table():
    """Print the process table by calling find_victim(True), then exit."""
    show_table = True
    find_victim(show_table)
    exit()
|
||
|
||
|
||
def log(*msg):
    """Print msg to stdout and, if separate_log is set, pass it to info().

    An OSError raised by either output is swallowed after a 10 ms pause.
    """
    try:
        print(*msg)
    except OSError:
        sleep(0.01)

    if not separate_log:
        return

    try:
        info(*msg)
    except OSError:
        sleep(0.01)
|
||
|
||
|
||
def print_version():
    """Print the version read from /etc/nohang/version and exit.

    'Nohang unknown version' is printed when the file is missing or
    rline1 returns None for it.
    """
    try:
        version = rline1('/etc/nohang/version')
    except FileNotFoundError:
        version = None

    if version is not None:
        print('Nohang ' + version)
    else:
        print('Nohang unknown version')

    exit()
|
||
|
||
|
||
def pid_to_cgroup_v1(pid):
    """Return the cgroup v1 path of a process.

    pid: str pid of the process
    returns str: the part of line number cgroup_v1_index of
    /proc/[pid]/cgroup that follows the first '/', prefixed with '/';
    '' if the process is gone or the file has no such line
    """
    try:
        with open('/proc/' + pid + '/cgroup') as f:
            for index, line in enumerate(f):
                if index == cgroup_v1_index:
                    return '/' + line.partition('/')[2][:-1]
    except (FileNotFoundError, ProcessLookupError):
        pass
    # Always return a str: callers pass the result straight to re.search.
    return ''
|
||
|
||
|
||
def pid_to_cgroup_v2(pid):
    """Return the cgroup v2 path of a process.

    pid: str pid of the process
    returns str: line number cgroup_v2_index of /proc/[pid]/cgroup
    without its first three characters (the '0::' prefix, see
    find_cgroup_indexes) and without the trailing newline; '' if the
    process is gone or the file has no such line
    """
    try:
        with open('/proc/' + pid + '/cgroup') as f:
            for index, line in enumerate(f):
                if index == cgroup_v2_index:
                    return line[3:-1]
    except (FileNotFoundError, ProcessLookupError):
        pass
    # Always return a str: callers pass the result straight to re.search.
    return ''
|
||
|
||
|
||
def pid_to_starttime(pid):
    """Return the start time of a process divided by SC_CLK_TCK.

    The value is the 21st space-separated item after the last ')' in
    /proc/[pid]/stat (the starttime field, per proc(5)).

    pid: str pid of the process
    returns float

    NOTE: FileNotFoundError is not handled here and propagates to the
    caller when the process no longer exists.
    """
    stat_path = '/proc/' + pid + '/stat'
    try:
        stat_text = rline1(stat_path)
    except UnicodeDecodeError:
        with open(stat_path, 'rb') as f:
            stat_text = f.read().decode('utf-8', 'ignore')

    starttime = stat_text.rpartition(')')[2].split(' ')[20]
    return float(starttime) / SC_CLK_TCK
|
||
|
||
|
||
def get_victim_id(pid):
    """Build an identifier from the process start time and its pid.

    pid: str pid of the process
    returns '<starttime>_pid<pid>', where starttime is read from
    /proc/[pid]/stat, or '' if the process no longer exists
    """
    try:
        stat_text = rline1('/proc/' + pid + '/stat')
    except (FileNotFoundError, ProcessLookupError):
        return ''

    starttime = stat_text.rpartition(')')[2].split(' ')[20]
    return starttime + '_pid' + pid
|
||
|
||
|
||
def pid_to_state(pid):
    """Return the one-letter state of a process.

    pid: str pid of the process
    returns the character that follows '<pid> (<comm>) ' in
    /proc/[pid]/stat, or '' if the process is gone or the file is
    malformed
    """
    try:
        with open('/proc/' + pid + '/stat', 'rb') as f:
            # Read the whole file: a fixed 20-byte prefix may end before
            # the closing ')' of the comm field when pid and name are
            # long, which would make the lookup below return a wrong
            # character.
            stat_text = f.read().decode('utf-8', 'ignore')
    except (FileNotFoundError, ProcessLookupError):
        return ''

    # The text after the last ')' looks like ' S 1 ...'; a slice is used
    # so that a short tail gives '' instead of raising IndexError.
    return stat_text.rpartition(')')[2][1:2]
|
||
|
||
|
||
def pid_to_name(pid):
    """Return the name of a process.

    pid: str pid of the process
    returns the content of /proc/[pid]/comm decoded as UTF-8 (invalid
    bytes ignored) without its last character, or '' if the process is
    gone
    """
    comm_path = '/proc/' + pid + '/comm'
    try:
        with open(comm_path, 'rb') as comm_file:
            raw = comm_file.read()
    except (FileNotFoundError, ProcessLookupError):
        return ''
    return raw.decode('utf-8', 'ignore')[:-1]
|
||
|
||
|
||
def pid_to_ppid(pid):
    """Return the parent pid of a process.

    pid: str pid of the process
    returns str: second tab-separated field of line number ppid_index of
    /proc/[pid]/status, or '' if the process is gone or the line is
    missing
    """
    status_path = '/proc/' + pid + '/status'
    try:
        try:
            with open(status_path) as f:
                for n, line in enumerate(f):
                    # Line numbers are ints: compare by value, not identity.
                    if n == ppid_index:
                        return line.split('\t')[1].strip()
        except UnicodeDecodeError:
            # Retry in binary mode, dropping undecodable bytes.
            with open(status_path, 'rb') as f:
                f_list = f.read().decode('utf-8', 'ignore').split('\n')
            for n, line in enumerate(f_list):
                if n == ppid_index:
                    return line.split('\t')[1]
    except (FileNotFoundError, ProcessLookupError):
        # The process may also vanish between the two read attempts.
        return ''
    return ''
|
||
|
||
|
||
def pid_to_ancestry(pid, max_victim_ancestry_depth=1):
    """Describe the ancestors of a process for the log.

    pid: str pid of the process
    max_victim_ancestry_depth: int, how many ancestors to look up
    returns '' for depth 0, a '\\n PPID: ...' line for depth 1, otherwise
    a '\\n Ancestry: ...' line with up to max_victim_ancestry_depth
    ancestors joined by ' <= ' (the walk stops at pid '1')
    """
    if max_victim_ancestry_depth == 0:
        return ''

    if max_victim_ancestry_depth == 1:
        parent = pid_to_ppid(pid)
        parent_name = pid_to_name(parent)
        return '\n PPID: {} ({})'.format(parent, parent_name)

    chain = []
    current = pid
    for _ in range(max_victim_ancestry_depth):
        parent = pid_to_ppid(current)
        parent_name = pid_to_name(parent)
        chain.append('PID {} ({})'.format(parent, parent_name))
        if parent == '1':
            break
        current = parent

    return '\n Ancestry: ' + ' <= '.join(chain)
|
||
|
||
|
||
def pid_to_cmdline(pid):
    """
    Get process cmdline by pid.

    pid: str pid of required process
    returns string cmdline with NUL separators replaced by spaces and
    trailing whitespace removed; '' if the process is gone
    """
    try:
        # Binary mode + lenient decoding: a command line may contain bytes
        # that are not valid in the locale encoding, which would raise
        # UnicodeDecodeError in text mode.
        with open('/proc/' + pid + '/cmdline', 'rb') as f:
            raw = f.read()
    except (FileNotFoundError, ProcessLookupError):
        return ''
    return raw.decode('utf-8', 'ignore').replace('\x00', ' ').rstrip()
|
||
|
||
|
||
def pid_to_environ(pid):
    """
    Get process environ by pid.

    pid: str pid of required process
    returns string environ with NUL separators replaced by spaces and
    trailing whitespace removed; '' if the process is gone
    """
    try:
        # Binary mode + lenient decoding: environment values may contain
        # bytes that are not valid in the locale encoding, which would
        # raise UnicodeDecodeError in text mode.
        with open('/proc/' + pid + '/environ', 'rb') as f:
            raw = f.read()
    except (FileNotFoundError, ProcessLookupError):
        return ''
    return raw.decode('utf-8', 'ignore').replace('\x00', ' ').rstrip()
|
||
|
||
|
||
def pid_to_realpath(pid):
    """Return os.path.realpath of /proc/[pid]/exe.

    pid: str pid of the process
    returns str path, or '' if os.path.realpath raises FileNotFoundError
    """
    exe_link = ''.join(('/proc/', pid, '/exe'))
    try:
        resolved = os.path.realpath(exe_link)
    except FileNotFoundError:
        resolved = ''
    return resolved
|
||
|
||
|
||
def pid_to_uid(pid):
    """Return the effective UID of a process.

    pid: str pid of the process
    returns str: third tab-separated field of line number uid_index of
    /proc/[pid]/status, or '' if the process is gone or the line is
    missing
    """
    status_path = '/proc/' + pid + '/status'
    try:
        try:
            with open(status_path) as f:
                for n, line in enumerate(f):
                    # Line numbers are ints: compare by value, not identity.
                    if n == uid_index:
                        return line.split('\t')[2]
        except UnicodeDecodeError:
            # Retry in binary mode, dropping undecodable bytes.
            with open(status_path, 'rb') as f:
                f_list = f.read().decode('utf-8', 'ignore').split('\n')
            return f_list[uid_index].split('\t')[2]
    except (FileNotFoundError, ProcessLookupError):
        # The process may also vanish between the two read attempts.
        return ''
    return ''
|
||
|
||
|
||
def pid_to_badness(pid):
    """Compute the badness of a process from its oom_score and the config.

    The starting value is /proc/[pid]/oom_score. A positive oom_score_adj
    is subtracted if ignore_positive_oom_score_adj is set; then the
    adjustments of all matching regexps from the badness_adj_re_*_list
    lists are added; finally a negative result is raised to 0 if
    forbid_negative_badness is set.

    pid: str pid of the process
    returns (badness, oom_score), or (None, None) if the process is gone
    """

    def matched_adjustment(re_list, text):
        # Sum of re_tup[0] over the tuples whose pattern re_tup[1] matches.
        return sum(
            int(re_tup[0]) for re_tup in re_list
            if search(re_tup[1], text) is not None)

    try:
        oom_score = int(rline1('/proc/' + pid + '/oom_score'))
        badness = oom_score

        if ignore_positive_oom_score_adj:
            oom_score_adj = int(rline1('/proc/' + pid + '/oom_score_adj'))
            badness -= max(oom_score_adj, 0)

        if regex_matching:
            name = pid_to_name(pid)
            badness += matched_adjustment(badness_adj_re_name_list, name)

        if re_match_cgroup_v1:
            cgroup_v1 = pid_to_cgroup_v1(pid)
            badness += matched_adjustment(
                badness_adj_re_cgroup_v1_list, cgroup_v1)

        if re_match_cgroup_v2:
            cgroup_v2 = pid_to_cgroup_v2(pid)
            badness += matched_adjustment(
                badness_adj_re_cgroup_v2_list, cgroup_v2)

        if re_match_realpath:
            realpath = pid_to_realpath(pid)
            badness += matched_adjustment(
                badness_adj_re_realpath_list, realpath)

        if re_match_cmdline:
            cmdline = pid_to_cmdline(pid)
            badness += matched_adjustment(
                badness_adj_re_cmdline_list, cmdline)

        if re_match_environ:
            environ = pid_to_environ(pid)
            badness += matched_adjustment(
                badness_adj_re_environ_list, environ)

        if re_match_uid:
            uid = pid_to_uid(pid)
            badness += matched_adjustment(badness_adj_re_uid_list, uid)

        if forbid_negative_badness:
            badness = max(badness, 0)

        return badness, oom_score

    except (FileNotFoundError, ProcessLookupError):
        return None, None
|
||
|
||
|
||
def pid_to_status(pid):
    """Read selected fields from /proc/[pid]/status.

    Lines are picked by their number (0 for the name, then the
    module-level *_index values); the Vm* values are converted with
    kib_to_mib.

    pid: str pid of the process
    returns (name, state, ppid, uid, vm_size, vm_rss, vm_swap), the result
    of pid_to_status_unicode(pid) if the file cannot be decoded, or None
    if the process is gone or a value cannot be parsed
    """
    try:

        with open('/proc/' + pid + '/status') as f:

            for n, line in enumerate(f):

                # Line numbers are ints: compare by value ('=='), not by
                # identity ('is').
                if n == 0:
                    name = line.split('\t')[1][:-1]

                if n == state_index:
                    state = line.split('\t')[1][0]
                    continue

                if n == ppid_index:
                    ppid = line.split('\t')[1][:-1]
                    continue

                if n == uid_index:
                    uid = line.split('\t')[2]
                    continue

                # [:-4] drops the last four characters of the value
                # (' kB' and the newline in a usual status line).
                if n == vm_size_index:
                    vm_size = kib_to_mib(int(line.split('\t')[1][:-4]))
                    continue

                if n == vm_rss_index:
                    vm_rss = kib_to_mib(int(line.split('\t')[1][:-4]))
                    continue

                if n == vm_swap_index:
                    vm_swap = kib_to_mib(int(line.split('\t')[1][:-4]))
                    break

        return name, state, ppid, uid, vm_size, vm_rss, vm_swap

    except UnicodeDecodeError:
        return pid_to_status_unicode(pid)

    except (FileNotFoundError, ProcessLookupError, ValueError):
        return None
|
||
|
||
|
||
def pid_to_status_unicode(pid):
    """Read selected fields from /proc/[pid]/status in binary mode.

    Same result as pid_to_status, but the file is decoded as UTF-8 with
    undecodable bytes ignored.

    pid: str pid of the process
    returns (name, state, ppid, uid, vm_size, vm_rss, vm_swap), or None if
    the process is gone or a value cannot be parsed
    """
    try:

        with open('/proc/' + pid + '/status', 'rb') as f:
            f_list = f.read().decode('utf-8', 'ignore').split('\n')

        for i, line in enumerate(f_list):

            # Line numbers are ints: compare by value ('=='), not by
            # identity ('is').
            if i == 0:
                name = line.split('\t')[1]

            if i == state_index:
                state = line.split('\t')[1][0]

            if i == ppid_index:
                ppid = line.split('\t')[1]

            if i == uid_index:
                uid = line.split('\t')[2]

            # [:-3] drops the last three characters of the value (' kB';
            # the newline is already removed by the split above).
            if i == vm_size_index:
                vm_size = kib_to_mib(int(line.split('\t')[1][:-3]))

            if i == vm_rss_index:
                vm_rss = kib_to_mib(int(line.split('\t')[1][:-3]))

            if i == vm_swap_index:
                vm_swap = kib_to_mib(int(line.split('\t')[1][:-3]))

        return name, state, ppid, uid, vm_size, vm_rss, vm_swap

    except (FileNotFoundError, ProcessLookupError, ValueError):
        return None
|
||
|
||
|
||
def uptime():
    """Return the first space-separated field of /proc/uptime as a float."""
    first_field = rline1('/proc/uptime').split(' ')[0]
    return float(first_field)
|
||
|
||
|
||
def errprint(*text):
    """Print the arguments to stderr and flush the stream immediately."""
    print(*text, flush=True, file=stderr)
|
||
|
||
|
||
def mlockall():
    """Lock all memory to prevent swapping nohang process.

    libc's mlockall is first called with
    MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT; if that call returns non-zero
    it is retried with MCL_CURRENT | MCL_FUTURE. A warning is logged when
    both calls fail.
    """
    MCL_CURRENT = 1
    MCL_FUTURE = 2
    MCL_ONFAULT = 4

    libc = CDLL('libc.so.6', use_errno=True)

    flag_sets = (
        MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT,
        MCL_CURRENT | MCL_FUTURE,
    )

    for flags in flag_sets:
        if libc.mlockall(flags) == 0:
            # Memory is locked with this set of flags.
            return

    log('WARNING: cannot lock all memory')
|
||
|
||
|
||
|
||
def update_stat_dict_and_print(key):
    """Count an event in stat_dict and log the accumulated statistics.

    key: name of the event to count; None means nothing is counted
    The totals are logged only if print_statistics is set.
    """
    if key is not None:
        stat_dict[key] = stat_dict.get(key, 0) + 1

    if not print_statistics:
        return

    header = 'Total stat (what happened in the last {}):'.format(
        format_time(time() - start_time))
    rows = ['\n {}: {}'.format(event, stat_dict[event])
            for event in stat_dict]

    log(header + ''.join(rows))
|
||
|
||
|
||
def find_psi_metrics_value(psi_path, psi_metrics):
    """Read one averaged value from a PSI file.

    psi_path: path of the PSI file
    psi_metrics: one of 'some_avg10', 'some_avg60', 'some_avg300',
                 'full_avg10', 'full_avg60', 'full_avg300'
    returns float taken from the first line ('some_*') or the second line
    ('full_*') of the file; None if psi_support is false or psi_metrics is
    not one of the names above
    """
    if not psi_support:
        return None

    # metric name -> (line number, index of the 'avgN=value' item)
    layout = {
        'some_avg10': (0, 1),
        'some_avg60': (0, 2),
        'some_avg300': (0, 3),
        'full_avg10': (1, 1),
        'full_avg60': (1, 2),
        'full_avg300': (1, 3),
    }

    if psi_metrics not in layout:
        return None

    line_no, column = layout[psi_metrics]

    if line_no == 0:
        psi_line = rline1(psi_path)
    else:
        with open(psi_path) as f:
            psi_line = f.readlines()[1]

    return float(psi_line.split(' ')[column].split('=')[1])
|
||
|
||
|
||
def check_mem_and_swap():
    """Find mem_available, swap_total, swap_free.

    The values are read from /proc/meminfo: line number 2 for
    mem_available, lines number swap_total_index and swap_free_index for
    the swap values.

    returns (mem_available, swap_total, swap_free) as ints, exactly as
    written in the file
    """
    with open('/proc/meminfo') as f:
        for n, line in enumerate(f):
            # Line numbers are ints: compare by value ('=='), not by
            # identity ('is').
            # [:-4] drops the last four characters of the value (' kB' and
            # the newline in a usual meminfo line).
            if n == 2:
                mem_available = int(line.split(':')[1][:-4])
                continue
            if n == swap_total_index:
                swap_total = int(line.split(':')[1][:-4])
                continue
            if n == swap_free_index:
                swap_free = int(line.split(':')[1][:-4])
                break
    return mem_available, swap_total, swap_free
|
||
|
||
|
||
def check_zram():
    """Find MemUsedZram.

    Sums disksize and mem_used_total (as reported by zram_stat) over all
    /sys/block entries whose name starts with 'zram'.

    returns (mem_used_total_sum + disksize_sum * ZRAM_DISKSIZE_FACTOR)
    divided by 1024
    """
    stats = [
        zram_stat(dev)
        for dev in os.listdir('/sys/block')
        if dev.startswith('zram')
    ]

    disksize_sum = sum(int(stat[0]) for stat in stats)
    mem_used_total_sum = sum(int(stat[1]) for stat in stats)

    # ZRAM_DISKSIZE_FACTOR = deltaMemAvailable / disksize
    # It means that setting zram disksize = 1 GiB decreases available
    # memory by 0.0042 GiB.
    # Found experimentally, requires clarification with different kernels
    # and architectures.
    # On small disk drives (up to a gigabyte) it can be more, up to 0.0045.
    # The creator of the zram module claims that the factor should be
    # 0.001 ("zram uses about 0.1% of the size of the disk"
    # - https://www.kernel.org/doc/Documentation/blockdev/zram.txt),
    # but this statement contradicts the experimental data.
    ZRAM_DISKSIZE_FACTOR = 0.0042

    return (mem_used_total_sum + disksize_sum * ZRAM_DISKSIZE_FACTOR) / 1024.0
|
||
|
||
|
||
def format_time(t):
    """Format a number of seconds as a human-readable duration.

    t: number of seconds (truncated to int)
    returns 'S sec', 'M min S sec' or 'H h M min S sec'
    """
    total = int(t)

    if total < 60:
        return '{} sec'.format(total)

    if total < 3600:
        minutes, seconds = divmod(total, 60)
        return '{} min {} sec'.format(minutes, seconds)

    hours, rest = divmod(total, 3600)
    minutes, seconds = divmod(rest, 60)
    return '{} h {} min {} sec'.format(hours, minutes, seconds)
|
||
|
||
|
||
def string_to_float_convert_test(string):
    """Try to interpret a string value as a float.

    returns float, or None if float() raises ValueError
    """
    try:
        result = float(string)
    except ValueError:
        result = None
    return result
|
||
|
||
|
||
def string_to_int_convert_test(string):
    """Try to interpret a string value as an integer.

    returns int, or None if int() raises ValueError
    """
    try:
        result = int(string)
    except ValueError:
        result = None
    return result
|
||
|
||
|
||
def conf_parse_string(param):
    """
    Get string parameters from the config dict.

    param: config_dict key
    returns config_dict[param].strip()

    Prints an error and exits with status 1 if param is not in config_dict.
    """
    if param not in config_dict:
        errprint('All the necessary parameters must be in the config')
        errprint('There is no "{}" parameter in the config'.format(param))
        exit(1)

    raw_value = config_dict[param]
    return raw_value.strip()
|
||
|
||
|
||
def conf_parse_bool(param):
    """
    Get bool parameters from the config_dict.

    param: config_dict key
    returns bool

    Prints an error and exits with status 1 if param is not in config_dict
    or its value is neither 'True' nor 'False'.
    """
    if param not in config_dict:
        errprint('All the necessary parameters must be in the config')
        errprint('There is no "{}" parameter in the config'.format(param))
        exit(1)

    literal = config_dict[param]

    if literal == 'True':
        return True

    if literal == 'False':
        return False

    errprint('Invalid value of the "{}" parameter.'.format(param))
    errprint('Valid values are True and False.')
    errprint('Exit')
    exit(1)
|
||
|
||
|
||
def rline1(path):
    """Read the first line from path, without its trailing newline.

    If the file cannot be decoded in text mode, its first 999 bytes are
    decoded as UTF-8 with invalid bytes ignored instead.

    path: path of the file
    returns str, or None if the file is empty in text mode
    """
    try:
        with open(path) as f:
            for line in f:
                # Strip only an actual newline: the last character of a
                # file without a trailing newline belongs to the value.
                return line[:-1] if line.endswith('\n') else line
    except UnicodeDecodeError:
        with open(path, 'rb') as f:
            return f.read(999).decode(
                'utf-8', 'ignore').partition('\n')[0]
|
||
|
||
|
||
def kib_to_mib(num):
    """Convert a KiB value to MiB, rounded to the nearest integer."""
    mib = num / 1024.0
    return round(mib)
|
||
|
||
|
||
def percent(num):
    """Interpret num as a fraction and return it in percent.

    returns num * 100 rounded to one decimal place
    """
    scaled = num * 100
    return round(scaled, 1)
|
||
|
||
|
||
def just_percent_mem(num):
    """Convert num to percent and right-justify it to 4 characters.

    returns str: num * 100 rounded to one decimal place, padded on the
    left with spaces
    """
    text = str(round(num * 100, 1))
    return text.rjust(4, ' ')
|
||
|
||
|
||
def just_percent_swap(num):
    """Convert num to percent and right-justify it to 5 characters.

    returns str: num * 100 rounded to one decimal place, padded on the
    left with spaces
    """
    text = str(round(num * 100, 1))
    return text.rjust(5, ' ')
|
||
|
||
|
||
def human(num, lenth):
    """Convert a KiB value to MiB with right alignment.

    num: value in KiB
    lenth: int width of the resulting string
    returns str: num / 1024 rounded to an integer, padded on the left with
    spaces
    """
    mib_text = str(round(num / 1024))
    return mib_text.rjust(lenth, ' ')
|
||
|
||
|
||
def zram_stat(zram_id):
    """
    Get zram state.

    zram_id: str zram block-device id
    returns (disksize, mem_used_total), both as str read from
    /sys/block/[zram_id]; ('0', '0') if the disksize file is missing or
    contains '0'
    """
    sys_dir = '/sys/block/' + zram_id

    try:
        disksize = rline1(sys_dir + '/disksize')
    except FileNotFoundError:
        return '0', '0'

    # rline1 returns the line as a str without the newline, so compare
    # with '0' (a comparison with the list ['0\n'] can never be true).
    if disksize == '0':
        return '0', '0'

    try:
        # mem_used_total is the third whitespace-separated item of mm_stat.
        mem_used_total = rline1(sys_dir + '/mm_stat').split()[2]
    except FileNotFoundError:
        # No mm_stat file: read the separate mem_used_total file instead.
        mem_used_total = rline1(sys_dir + '/mem_used_total')

    return disksize, mem_used_total  # BYTES, str
|
||
|
||
|
||
def send_notify_warn():
    """
    Warn the user about low memory (implements low memory warnings).

    If check_warning_exe is set, warning_exe is executed; otherwise a
    'Low memory' notification with the percentages of available memory
    and free swap is sent.
    """
    log('Warning threshold exceeded')

    if check_warning_exe:
        exe(warning_exe)
        return

    mem_percent = round(mem_available / mem_total * 100)
    # 0.1 is added to the divisor so that swap_total == 0 does not raise
    # ZeroDivisionError.
    swap_percent = round(swap_free / (swap_total + 0.1) * 100)

    title = 'Low memory'
    body = 'MemAvail: {}%\nSwapFree: {}%'.format(mem_percent, swap_percent)

    send_notification(title, body)
|
||
|
||
|
||
def send_notify(threshold, name, pid):
    """
    Notify about OOM prevention.

    threshold: key for notify_sig_dict
    name: str process name
    pid: str process pid
    """
    # Wait for memory release after the corrective action: may be useful
    # if free memory was about 0 immediately after it.
    sleep(0.05)

    action_text = notify_sig_dict[threshold]

    # The symbol '&' can break notifications in some themes, therefore it
    # is replaced by '*'.
    safe_name = name.replace('&', '*')

    body = '<b>{}</b> [{}] <b>{}</b>'.format(action_text, pid, safe_name)

    send_notification('Freeze prevention', body)
|
||
|
||
|
||
def send_notify_etc(pid, name, command):
    """
    Notify about OOM prevention by a custom command.

    command: str command that will be executed
    name: str process name
    pid: str process pid
    """
    # '&' is replaced by '*' in both the name and the command.
    safe_name = name.replace('&', '*')
    safe_command = command.replace('&', '*')

    template = ('<b>Victim is</b> [{}] <b>{}</b>\n'
                'Execute the command:\n<b>{}</b>')
    body = template.format(pid, safe_name, safe_command)

    send_notification('Freeze prevention', body)
|
||
|
||
|
||
def send_notification(title, body):
    """Run the notify helper in the background with the given title/body.

    title: str, inserted into the command line as is
    body: str, passed through encoder() before being inserted
    """
    encoded_body = encoder(body)

    cmd = '{} "--euid={}" "--debug={}" "--title={}" "--body={}" &'.format(
        notify_helper_path,
        self_uid,
        debug_gui_notifications,
        title,
        encoded_body)

    exe(cmd)
|
||
|
||
|
||
def get_pid_list():
    """
    Find the /proc entries for which /proc/[entry]/exe exists.

    (According to the original note this leaves out kthreads and
    zombies.)

    returns list of str
    """
    return [
        entry for entry in os.listdir('/proc')
        if os.path.exists('/proc/' + entry + '/exe')
    ]
|
||
|
||
|
||
def get_non_decimal_pids(pids=None):
    """Return the entries whose first character is not a decimal digit.

    pids: iterable of str to filter; if omitted, the module-level pid_list
          is used (the behaviour of the original parameterless function)
    returns list of str
    """
    if pids is None:
        pids = pid_list
    return [pid for pid in pids if not pid[0].isdecimal()]
|
||
|
||
|
||
def find_victim(_print_proc_table):
    """
    Find the process with the highest badness.

    The daemon itself (self_pid), pid '1' and non-numeric /proc entries
    are never considered.

    _print_proc_table: bool, also log a table with one row per process
    returns (pid, victim_badness, victim_name, victim_id)
    raises IndexError if no process with a readable badness was found
    """
    ft1 = time()

    # Filter the fresh listing directly instead of relying on a
    # module-level pid_list that may be stale.
    excluded = {self_pid, '1'}
    pid_list = [
        pid for pid in get_pid_list()
        if pid not in excluded and pid[0].isdecimal()
    ]

    pid_badness_list = []

    if _print_proc_table:

        extra_titles = {
            'cgroup_v1': 'CGroup_v1',
            'cgroup_v2': 'CGroup_v2',
            'cmdline': 'cmdline',
            'environ': 'environ',
            'realpath': 'realpath',
        }
        extra_getters = {
            'cgroup_v1': pid_to_cgroup_v1,
            'cgroup_v2': pid_to_cgroup_v2,
            'cmdline': pid_to_cmdline,
            'environ': pid_to_environ,
            'realpath': pid_to_realpath,
        }

        # 'None' and unknown values give an empty extra column.
        extra_table_title = extra_titles.get(extra_table_info, '')
        extra_getter = extra_getters.get(extra_table_info)

        hr = '#' * 107

        log(hr)
        log('# PID PPID badness oom_score oom_score_adj e'
            'UID S VmSize VmRSS VmSwap Name {}'.format(
                extra_table_title))
        log('#------- ------- ------- --------- ------------- -------'
            '--- - ------ ----- ------ ---------------')

    for pid in pid_list:

        badness, oom_score = pid_to_badness(pid)

        if badness is None:
            continue

        if _print_proc_table:

            try:
                oom_score_adj = rline1('/proc/' + pid + '/oom_score_adj')
            except (FileNotFoundError, ProcessLookupError):
                continue

            # Read the status once: the process may die between two reads,
            # which would make unpacking a second result fail.
            status = pid_to_status(pid)
            if status is None:
                continue

            name, state, ppid, uid, vm_size, vm_rss, vm_swap = status

            if extra_getter is None:
                extra_table_line = ''
            else:
                extra_table_line = extra_getter(pid)

            log('#{} {} {} {} {} {} {} {} {} {} {} {}'.format(
                pid.rjust(7),
                ppid.rjust(7),
                str(badness).rjust(7),
                str(oom_score).rjust(9),
                str(oom_score_adj).rjust(13),
                uid.rjust(10),
                state,
                str(vm_size).rjust(6),
                str(vm_rss).rjust(5),
                str(vm_swap).rjust(6),
                name.ljust(15),
                extra_table_line
            ))

        pid_badness_list.append((pid, badness))

    real_proc_num = len(pid_badness_list)

    # (pid, badness) tuple with the highest badness; among equal values the
    # first one found wins because the sort is stable.
    pid_tuple_list = sorted(
        pid_badness_list,
        key=itemgetter(1),
        reverse=True
    )[0]

    pid = pid_tuple_list[0]
    victim_id = get_victim_id(pid)

    # Get maximum 'badness' value
    victim_badness = pid_tuple_list[1]
    victim_name = pid_to_name(pid)

    if _print_proc_table:
        log(hr)

    log('Found {} processes with existing /proc/[pid]/exe realpath'.format(
        real_proc_num))

    log(
        'Process with highest badness (found in {} ms):\n PID: {}, Na'
        'me: {}, badness: {}'.format(
            round((time() - ft1) * 1000),
            pid,
            victim_name,
            victim_badness
        )
    )

    return pid, victim_badness, victim_name, victim_id
|
||
|
||
|
||
def _report_victim_died(reason):
    """Log that the victim vanished during inspection and count the event.

    reason: str, name of the exception that revealed the problem.
    """
    message = 'The victim died in the search process: ' + reason
    log(message)
    update_stat_dict_and_print(message)


def _status_kib_to_mib(status_line):
    """Convert the value of a 'Vm*:<TAB>... kB' status line to MiB.

    status_line must not contain the trailing newline: the last three
    characters (' kB') are cut off before the integer conversion.
    """
    return kib_to_mib(int(status_line.split('\t')[1][:-3]))


def find_victim_info(pid, victim_badness, name):
    """
    Build a multi-line, human-readable status report about the victim.

    pid: str, PID of the victim
    victim_badness: badness value that was found for the victim
    name: str, name of the victim

    Returns the report as str. Returns None if the victim disappeared, or
    its /proc files could not be parsed, while the data was being
    collected; in that case the event is logged and added to the stats.
    """
    status0 = time()

    try:

        # Read raw bytes and drop undecodable ones: the process name may
        # contain arbitrary bytes, while all the fields parsed below are
        # plain ASCII. This gives a single parsing path instead of a text
        # mode attempt plus a fallback inside an exception handler.
        with open('/proc/' + pid + '/status', 'rb') as f:
            status_lines = f.read().decode('utf-8', 'ignore').split('\n')

        # The *_index globals are line numbers in /proc/[pid]/status.
        # Direct indexing raises IndexError on a truncated file instead of
        # silently leaving the variables unassigned.
        state = status_lines[state_index].split('\t')[1].rstrip()

        # Field [2] of the Uid line is printed as EUID in the report
        uid = status_lines[uid_index].split('\t')[2]

        vm_size = _status_kib_to_mib(status_lines[vm_size_index])
        vm_rss = _status_kib_to_mib(status_lines[vm_rss_index])

        if detailed_rss:
            anon_rss = _status_kib_to_mib(status_lines[anon_index])
            file_rss = _status_kib_to_mib(status_lines[file_index])
            shmem_rss = _status_kib_to_mib(status_lines[shmem_index])

        vm_swap = _status_kib_to_mib(status_lines[vm_swap_index])

        if print_victim_cmdline:
            cmdline = pid_to_cmdline(pid)

        oom_score = rline1('/proc/' + pid + '/oom_score')
        oom_score_adj = rline1('/proc/' + pid + '/oom_score_adj')

    except FileNotFoundError:
        _report_victim_died('FileNotFoundError')
        return None

    except ProcessLookupError:
        _report_victim_died('ProcessLookupError')
        return None

    except IndexError:
        _report_victim_died('IndexError')
        return None

    except ValueError:
        _report_victim_died('ValueError')
        return None

    len_vm = len(str(vm_size))

    try:
        realpath = os.path.realpath('/proc/' + pid + '/exe')
        victim_lifetime = format_time(uptime() - pid_to_starttime(pid))
        victim_cgroup_v1 = pid_to_cgroup_v1(pid)
        victim_cgroup_v2 = pid_to_cgroup_v2(pid)

    except FileNotFoundError:
        _report_victim_died('FileNotFoundError')
        return None

    except ProcessLookupError:
        _report_victim_died('ProcessLookupError')
        return None

    ancestry = pid_to_ancestry(pid, max_victim_ancestry_depth)

    if print_victim_cmdline is False:
        cmdline = ''
        c1 = ''
    else:
        c1 = '\n Cmdline: '

    if detailed_rss:
        detailed_rss_info = (
            ' ('
            'Anon: {} MiB, '
            'File: {} MiB, '
            'Shmem: {} MiB)'.format(anon_rss, file_rss, shmem_rss))
    else:
        detailed_rss_info = ''

    victim_info = (
        'Victim status (found in {} ms):'
        '\n Name: {}'
        '\n State: {}'
        '\n PID: {}'
        '{}'
        '\n EUID: {}'
        '\n badness: {}, '
        'oom_score: {}, '
        'oom_score_adj: {}'
        '\n VmSize: {} MiB'
        '\n VmRSS: {} MiB {}'
        '\n VmSwap: {} MiB'
        '\n CGroup_v1: {}'
        '\n CGroup_v2: {}'
        '\n Realpath: {}'
        '{}{}'
        '\n Lifetime: {}'.format(
            round((time() - status0) * 1000),
            name,
            state,
            pid,
            ancestry,
            uid,
            victim_badness,
            oom_score,
            oom_score_adj,
            vm_size,
            # VmRSS and VmSwap are right-aligned to the width of VmSize
            str(vm_rss).rjust(len_vm),
            detailed_rss_info,
            str(vm_swap).rjust(len_vm),
            victim_cgroup_v1,
            victim_cgroup_v2,
            realpath,
            c1, cmdline,
            victim_lifetime))

    return victim_info
|
||
|
||
|
||
def check_mem_swap_ex():
    """
    Check whether the MemAvailable and SwapFree thresholds are exceeded.

    Returns a 7-tuple:
        (threshold, mem_info, mem_available, hard_threshold_min_swap_kb,
         soft_threshold_min_swap_kb, swap_free, swap_total)
    threshold is SIGKILL, SIGTERM, 'WARN' or None; mem_info is a report
    string for SIGKILL and SIGTERM and None otherwise.
    """
    mem_available, swap_total, swap_free = check_mem_and_swap()

    # Swap thresholds set in percent are resolved against current SwapTotal
    hard_swap_kb = (
        swap_total * hard_threshold_min_swap_percent / 100.0
        if swap_kill_is_percent
        else swap_kb_dict['hard_threshold_min_swap_kb'])

    soft_swap_kb = (
        swap_total * soft_threshold_min_swap_percent / 100.0
        if swap_term_is_percent
        else swap_kb_dict['soft_threshold_min_swap_kb'])

    warn_swap_kb = (
        swap_total * warning_threshold_min_swap_percent / 100.0
        if swap_warn_is_percent
        else swap_kb_dict['warning_threshold_min_swap_kb'])

    # Thresholds as a share of SwapTotal; '-' if the threshold is not
    # below SwapTotal. 0.1 is added to avoid division by zero.
    swap_sigkill_pc = (
        percent(hard_swap_kb / (swap_total + 0.1))
        if swap_total > hard_swap_kb else '-')

    swap_sigterm_pc = (
        percent(soft_swap_kb / (swap_total + 0.1))
        if swap_total > soft_swap_kb else '-')

    # Common tail of every returned tuple
    levels = (mem_available, hard_swap_kb, soft_swap_kb, swap_free,
              swap_total)

    hard_exceeded = (mem_available <= hard_threshold_min_mem_kb and
                     swap_free <= hard_swap_kb)

    if hard_exceeded:

        report = (
            'Memory status that requires corrective actions '
            '(hard threshold exceeded):'
            '\n MemAvailable [{} MiB, {} %] <= '
            'mem_min_sigkill [{} MiB, {} %]'
            '\n SwapFree [{} MiB, {} %] <= '
            'swap_min_sigkill [{} MiB, {} %]'.format(
                kib_to_mib(mem_available),
                percent(mem_available / mem_total),
                kib_to_mib(hard_threshold_min_mem_kb),
                percent(hard_threshold_min_mem_kb / mem_total),
                kib_to_mib(swap_free),
                percent(swap_free / (swap_total + 0.1)),
                kib_to_mib(hard_swap_kb),
                swap_sigkill_pc))

        return (SIGKILL, report) + levels

    soft_exceeded = (mem_available <= soft_threshold_min_mem_kb and
                     swap_free <= soft_swap_kb)

    if soft_exceeded:

        report = (
            'Memory status that requires corrective actions '
            '(soft threshold exceeded):'
            '\n MemAvailable [{} MiB, {} %] <= '
            'mem_min_sigterm [{} MiB, {} %]'
            '\n SwapFree [{} MiB, {} %] <= '
            'swap_min_sigterm [{} MiB, {} %]'.format(
                kib_to_mib(mem_available),
                percent(mem_available / mem_total),
                kib_to_mib(soft_threshold_min_mem_kb),
                round(soft_threshold_min_mem_percent, 1),
                kib_to_mib(swap_free),
                percent(swap_free / (swap_total + 0.1)),
                kib_to_mib(soft_swap_kb),
                swap_sigterm_pc))

        return (SIGTERM, report) + levels

    if (low_memory_warnings_enabled and
            mem_available <= warning_threshold_min_mem_kb and
            swap_free <= warn_swap_kb + 0.1):
        return ('WARN', None) + levels

    return (None, None) + levels
|
||
|
||
|
||
def check_zram_ex():
    """
    Check whether the thresholds for memory used by zram are exceeded.

    Returns a 3-tuple (threshold, mem_info, mem_used_zram).
    threshold is SIGKILL, SIGTERM, 'WARN' or None; mem_info is a report
    string for SIGKILL and SIGTERM and None otherwise.
    """
    mem_used_zram = check_zram()

    if mem_used_zram >= hard_threshold_max_zram_kb:

        mem_info = (
            'Memory status that requires corrective actions '
            '(hard threshold exceeded):'
            '\n MemUsedZram [{} MiB, {} %] >= '
            'zram_max_sigkill [{} MiB, {} %]'.format(
                kib_to_mib(mem_used_zram),
                percent(mem_used_zram / mem_total),
                kib_to_mib(hard_threshold_max_zram_kb),
                percent(hard_threshold_max_zram_kb / mem_total)))

        return SIGKILL, mem_info, mem_used_zram

    if mem_used_zram >= soft_threshold_max_zram_kb:

        # The unit is MiB (the value comes from kib_to_mib), the same as
        # in the hard threshold message above.
        mem_info = (
            'Memory status that requires corrective actions '
            '(soft threshold exceeded):'
            '\n MemUsedZram [{} MiB, {} %] >= '
            'zram_max_sigterm [{} MiB, {} %]'.format(
                kib_to_mib(mem_used_zram),
                percent(mem_used_zram / mem_total),
                kib_to_mib(soft_threshold_max_zram_kb),
                percent(soft_threshold_max_zram_kb / mem_total)))

        return SIGTERM, mem_info, mem_used_zram

    if low_memory_warnings_enabled:
        if mem_used_zram >= warning_threshold_max_zram_kb:
            return 'WARN', None, mem_used_zram

    return None, None, mem_used_zram
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
def check_psi_ex(psi_t0, psi_kill_exceeded_timer, psi_term_exceeded_timer, x0):
    """
    Check whether the PSI thresholds are exceeded for long enough.

    psi_t0: passed through to the result unchanged
    psi_kill_exceeded_timer: seconds the hard PSI threshold has been
        continuously exceeded so far
    psi_term_exceeded_timer: the same for the soft PSI threshold
    x0: timestamp of the previous call

    Returns a 6-tuple (threshold, mem_info, psi_t0,
    psi_kill_exceeded_timer, psi_term_exceeded_timer, x0) with updated
    timers and timestamp. threshold is SIGKILL, SIGTERM, 'WARN' or None;
    mem_info is a report string for SIGKILL and SIGTERM, None otherwise.
    """
    # Time elapsed since the previous call
    delta0 = time() - x0
    x0 = time()

    psi_avg_value = find_psi_metrics_value(psi_path, psi_metrics)

    # Time elapsed since the last corrective action
    psi_post_action_delay_timer = time() - last_action_dict['t']

    psi_post_action_delay_exceeded = (
        psi_post_action_delay_timer >= psi_post_action_delay)

    # The timer accumulates while the threshold stays exceeded and is
    # reset as soon as the value drops below the threshold.
    sigkill_psi_exceeded = psi_avg_value >= hard_threshold_max_psi

    if sigkill_psi_exceeded:
        psi_kill_exceeded_timer += delta0
    else:
        psi_kill_exceeded_timer = 0

    if debug_psi:

        log('psi_post_action_delay_timer: {}'.format(
            round(psi_post_action_delay_timer, 3)))

        log(
            'psi_post_action_delay_exceeded: {}'
            '\nsigkill_psi_exceeded: {}'
            '\npsi_kill_exceeded_timer: {}'.format(
                psi_post_action_delay_exceeded,
                sigkill_psi_exceeded,
                round(psi_kill_exceeded_timer, 1)))

    if (psi_post_action_delay_exceeded and
            psi_kill_exceeded_timer >= psi_excess_duration):

        report = (
            'PSI avg ({}) > hard_threshold_max_psi ({})\n'
            'PSI avg exceeded psi_excess_duration '
            '(value = {} sec) for {} seconds'.format(
                psi_avg_value,
                hard_threshold_max_psi,
                psi_excess_duration,
                round(psi_kill_exceeded_timer, 1)))

        return (SIGKILL, report, psi_t0, psi_kill_exceeded_timer,
                psi_term_exceeded_timer, x0)

    sigterm_psi_exceeded = psi_avg_value >= soft_threshold_max_psi

    if sigterm_psi_exceeded:
        psi_term_exceeded_timer += delta0
    else:
        psi_term_exceeded_timer = 0

    if debug_psi:

        log(
            'sigterm_psi_exceeded: {}\n'
            'psi_term_exceeded_timer: {}\n'.format(
                sigterm_psi_exceeded,
                round(psi_term_exceeded_timer, 1)))

    if (psi_post_action_delay_exceeded and
            psi_term_exceeded_timer >= psi_excess_duration):

        report = (
            'PSI avg ({}) > soft_threshold_max_psi ({})\n'
            'PSI avg exceeded psi_excess_duration '
            '(value = {} sec) for {} seconds'.format(
                psi_avg_value,
                soft_threshold_max_psi,
                psi_excess_duration,
                round(psi_term_exceeded_timer, 1)))

        return (SIGTERM, report, psi_t0, psi_kill_exceeded_timer,
                psi_term_exceeded_timer, x0)

    if low_memory_warnings_enabled:
        if psi_avg_value >= warning_threshold_max_psi:
            return ('WARN', None, psi_t0, psi_kill_exceeded_timer,
                    psi_term_exceeded_timer, x0)

    return (None, None, psi_t0, psi_kill_exceeded_timer,
            psi_term_exceeded_timer, x0)
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
def is_victim_alive(victim_id):
    """
    Report the current condition of a victim.

    We do not have a reliable sign of the end of the release of memory:
    https://github.com/rfjakob/earlyoom/issues/128#issuecomment-507023717

    victim_id: str, '<starttime>_pid<pid>'

    Returns:
        0 - the victim is gone: the ID no longer matches (the process does
            not exist or the PID belongs to another process), or the state
            is 'X', empty or anything else not listed below
        1 - /proc/[pid]/exe still exists
        2 - /proc/[pid]/exe is gone and the state is 'R': the victim is
            releasing memory, wait for its death
        3 - /proc/[pid]/exe is gone and the state is 'Z': the memory is
            probably released already, tracking can be stopped
    """
    # Integrity check: the ID rebuilt from the PID must be the same
    _, pid = victim_id.split('_pid')

    if get_victim_id(pid) != victim_id:
        return 0

    # Is the victim alive?
    if os.path.exists('/proc/{}/exe'.format(pid)):
        return 1

    # From here on the victim is mortally wounded; tell the cases apart
    # by its state.
    state = pid_to_state(pid)

    if state == 'R':
        return 2

    return 3 if state == 'Z' else 0
|
||
|
||
|
||
def implement_corrective_action(
        threshold,
        mem_info_list,
        psi_t0,
        psi_kill_exceeded_timer,
        psi_term_exceeded_timer,
        x0,
        psi_threshold,
        zram_threshold,
        zram_info,
        psi_info):
    """
    Choose a victim, re-check the memory state and, if a threshold is
    still exceeded, apply a corrective action to the victim.

    The action is either a command from soft_actions_list (soft threshold
    only, if a regexp matches the victim) or the signal that corresponds
    to the exceeded threshold. After the action the victim is tracked
    until it dies, becomes a zombie or stops responding.

    threshold: initially exceeded threshold; it is replaced by the result
        of the re-check
    mem_info_list: list of report strings to log before the re-check
    psi_t0, psi_kill_exceeded_timer, psi_term_exceeded_timer, x0: state
        passed to check_psi_ex() if PSI checking is enabled
    psi_threshold, psi_info: PSI check result, used as is when PSI
        checking is disabled
    zram_threshold, zram_info: zram check result, used as is when zram
        checking is disabled

    Returns psi_t0.
    """
    log('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>')

    end_marker = '<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<'

    debug_corrective_action = True

    time0 = time()

    # 1. Clean v_dict: forget the victims that are gone (0) or became
    #    zombies (3).
    # 2. Look through the remaining victims: if victim_cache_time has not
    #    expired for one of them, that victim is reused.
    finished = [vid for vid in v_dict if is_victim_alive(vid) in (0, 3)]

    for vid in finished:
        if debug_corrective_action:
            log('Remove {} from v_dict'.format(vid))
        v_dict.pop(vid)

    cached_victim_id = None

    for vid in v_dict:
        age = time() - v_dict[vid]['time']
        if age < victim_cache_time:

            if debug_corrective_action:
                log(
                    'victim_cache_time is not exceeded for {} ({} < {})'.format(
                        vid, round(age, 3), victim_cache_time))

            # The first suitable victim is taken
            cached_victim_id = vid
            break

    for info in mem_info_list:
        log(info)

    if cached_victim_id is not None:
        victim_id = cached_victim_id
        pid = victim_id.partition('_pid')[2]
        victim_badness = pid_to_badness(pid)[0]
        name = v_dict[victim_id]['name']
        log('New victim is cached victim {} ({})'.format(pid, name))
    else:
        pid, victim_badness, name, victim_id = find_victim(print_proc_table)

    log('Recheck memory levels...')

    # Only the threshold and the report are needed from the re-checks
    masf_threshold, masf_info, *_ = check_mem_swap_ex()

    if CHECK_ZRAM:
        zram_threshold, zram_info, _ = check_zram_ex()

    if CHECK_PSI:
        (psi_threshold, psi_info, psi_t0, psi_kill_exceeded_timer,
         psi_term_exceeded_timer, x0) = check_psi_ex(
            psi_t0, psi_kill_exceeded_timer, psi_term_exceeded_timer, x0)

    checks = (
        (masf_threshold, masf_info),
        (zram_threshold, zram_info),
        (psi_threshold, psi_info),
    )

    # The hardest of the exceeded thresholds wins
    if any(level is SIGKILL for level, _ in checks):
        threshold = SIGKILL
    elif any(level is SIGTERM for level, _ in checks):
        threshold = SIGTERM
    else:
        log('Thresholds is not exceeded now')
        return psi_t0

    mem_info_list = [
        info for level, info in checks
        if level is SIGKILL or level is SIGTERM]

    for info in mem_info_list:
        log(info)

    vwd = None  # Victim Will Die

    if victim_badness >= min_badness:

        if threshold is SIGTERM:
            if victim_id in v_dict:
                dt = time() - v_dict[victim_id]['time']
                if dt > max_soft_exit_time:
                    log('max_soft_exit_time is exceeded: the '
                        'victim will get SIGKILL')
                    threshold = SIGKILL
                else:
                    # The victim has already been asked to exit: give it
                    # more time and do nothing now.
                    log('max_soft_exit_time is not exceeded ('
                        '{} < {}) for the victim'.format(
                            round(dt, 1), max_soft_exit_time))

                    if debug_sleep:
                        log('Sleep {} sec (over_sleep)'.format(over_sleep))
                    sleep(over_sleep)

                    log(end_marker)

                    return psi_t0

        if print_victim_status:
            # TODO: look up the victim badness again, do not rely on the
            # old value
            victim_info = find_victim_info(pid, victim_badness, name)
            log(victim_info)

        soft_match = False

        if soft_actions and threshold is SIGTERM:
            name = pid_to_name(pid)
            cgroup_v1 = pid_to_cgroup_v1(pid)
            cgroup_v1_tail = cgroup_v1.rpartition('/')[2]
            service = (
                cgroup_v1_tail if cgroup_v1_tail.endswith('.service')
                else '')

            for action in soft_actions_list:
                unit, regexp, command = action[0], action[1], action[2]
                u = name if unit == 'name' else cgroup_v1

                if search(regexp, u) is not None:
                    log("Regexp '{}' matches with {} '{}'".format(
                        regexp, unit, u))
                    soft_match = True
                    break

        # Stays None if the signal could not be delivered
        preventing_oom_message = None

        if soft_match:

            cmd = command.replace(
                '$PID', pid).replace(
                '$NAME', pid_to_name(pid)).replace(
                '$SERVICE', service)

            exit_status = exe(cmd)

            response_time = time() - time0

            preventing_oom_message = (
                'Implement a corrective action:'
                '\n Run the command: {}'
                '\n Exit status: {}; total response time: {} ms'.format(
                    cmd,
                    exit_status,
                    round(response_time * 1000)))

        else:

            try:
                os.kill(int(pid), threshold)

            except (FileNotFoundError, ProcessLookupError):
                # No such process: the victim died in the search process
                vwd = True

            else:
                response_time = time() - time0

                preventing_oom_message = (
                    'Implement a corrective action:'
                    '\n Send {} to the victim; '
                    'total response time: {} ms'.format(
                        sig_dict[threshold],
                        round(response_time * 1000)))

                if threshold is SIGKILL:
                    vwd = True

        if preventing_oom_message is not None:
            log(preventing_oom_message)

        if not vwd:
            # Remember when the victim was asked to exit for the first time
            if victim_id not in v_dict:
                v_dict[victim_id] = dict()
                v_dict[victim_id]['time'] = time()
                v_dict[victim_id]['name'] = name

        last_action_dict['t'] = kill_timestamp = time()

        # Start tracking the victim's state. This could be moved to a
        # separate function that takes the ID, logs and returns something.
        #
        # Next, work with the dicts. The victim died here - reset the
        # timer. All old victims died within 3 seconds in the following
        # cycles - reset the timer. After that everything should work well.
        while True:
            sleep(0.005)
            d = time() - kill_timestamp
            iva = is_victim_alive(victim_id)

            if iva == 0:
                log('The victim died in {} sec'.format(round(d, 3)))
                v_dict.pop(victim_id, None)
                break

            if iva == 1:
                # Alive and still occupies memory
                if not vwd and d > sensitivity_test_time:
                    log("The victim doesn't respond on corrective action in {} sec".format(
                        round(d, 3)))
                    break
                continue

            if iva == 2:
                # Mortally wounded and releasing memory: wait until the
                # memory is released.
                continue

            # 3: a zombie that releases memory fast, if it has not
            # released it yet. Sleep a little and leave the loop.
            log('The victim became a zombie in {} sec'.format(round(d, 3)))
            v_dict.pop(victim_id, None)
            sleep(post_zombie_delay)
            break

        mem_available, swap_total, swap_free = check_mem_and_swap()
        ma_mib = int(mem_available) / 1024.0
        sf_mib = int(swap_free) / 1024.0
        log(
            'Memory status after implementing a corrective action:'
            '\n MemAvailable: {} MiB, SwapFree: {} MiB'.format(
                round(ma_mib, 1), round(sf_mib, 1)))

        if soft_match is False:
            key = 'Send {} to {}'.format(sig_dict[threshold], name)
        else:
            key = "Run the command '{}'".format(command)

        update_stat_dict_and_print(key)

        if threshold is SIGKILL and post_kill_exe != '':

            cmd = post_kill_exe.replace('$PID', pid).replace(
                '$NAME', pid_to_name(pid))

            log('Execute post_kill_exe')

            exe(cmd)

        if post_action_gui_notifications:
            if soft_match:
                send_notify_etc(pid, name, cmd)
            else:
                send_notify(threshold, name, pid)

    else:

        response_time = time() - time0

        log(
            'victim badness ({}) < min_badness ({}); nothing to do; '
            'response time: {} ms'.format(
                victim_badness,
                min_badness,
                round(response_time * 1000)))

        # update stat_dict
        update_stat_dict_and_print('victim badness < min_badness')

    if vwd is None:

        if debug_sleep:
            log('Sleep {} sec (over_sleep)'.format(over_sleep))
        sleep(over_sleep)

    log(end_marker)

    return psi_t0
|
||
|
||
|
||
def sleep_after_check_mem():
    """
    Sleep until the next memory check.

    With stable_sleep the pause is always min_sleep. Otherwise the pause
    is the estimated time to reach the thresholds at the configured fill
    rates, limited by min_sleep and max_sleep.
    """
    if stable_sleep:

        if debug_sleep:
            log('Sleep {} sec'.format(min_sleep))

        stdout.flush()
        sleep(min_sleep)
        return None

    # The distance is measured to the larger of the soft and hard
    # thresholds; a negative distance is treated as zero.
    if hard_threshold_min_mem_kb < soft_threshold_min_mem_kb:
        mem_threshold_kb = soft_threshold_min_mem_kb
    else:
        mem_threshold_kb = hard_threshold_min_mem_kb

    if hard_threshold_min_swap_kb < soft_threshold_min_swap_kb:
        swap_threshold_kb = soft_threshold_min_swap_kb
    else:
        swap_threshold_kb = hard_threshold_min_swap_kb

    mem_point = max(mem_available - mem_threshold_kb, 0)
    swap_point = max(swap_free - swap_threshold_kb, 0)

    t_mem = mem_point / fill_rate_mem
    t_swap = swap_point / fill_rate_swap
    t_mem_swap = t_mem + t_swap

    if CHECK_ZRAM:
        t_zram = max(
            (mem_total * 0.8 - mem_used_zram) / fill_rate_zram, 0)
        t_mem_zram = t_mem + t_zram
        z = ', t_zram={}'.format(round(t_zram, 2))

        # The shorter of the two estimates is used
        t = t_mem_swap if t_mem_swap <= t_mem_zram else t_mem_zram
    else:
        z = ''
        t = t_mem_swap

    if t > max_sleep:
        t = max_sleep
    elif t < min_sleep:
        t = min_sleep

    if debug_sleep:
        log('Sleep {} sec (t_mem={}, t_swap={}{})'.format(
            round(t, 2), round(t_mem, 2), round(t_swap, 2), z))

    try:
        stdout.flush()
    except OSError:
        pass

    sleep(t)
|
||
|
||
|
||
def calculate_percent(arg_key):
    """Parse a memory threshold from the config into KiB, MiB and percent.

    The value of config_dict[arg_key] must end with '%' (a share of
    mem_total, within [0; 100]) or with 'M' (a size in MiB, within
    [0; MemTotal]).  A missing key, unknown units or an out-of-range value
    is reported and the program exits with status 1.

    arg_key: str key for config_dict

    returns the tuple (mem_min_kb, mem_min_mb, mem_min_percent)
    """
    if arg_key not in config_dict:
        log('{} not in config\nExit'.format(arg_key))
        exit(1)

    mem_min = config_dict[arg_key]

    if mem_min.endswith('%'):
        # drop the percent sign, the rest must be a float
        mem_min_percent = string_to_float_convert_test(mem_min[:-1].strip())
        if mem_min_percent is None:
            errprint('Invalid {} value, not float\nExit'.format(arg_key))
            exit(1)
        if mem_min_percent < 0 or mem_min_percent > 100:
            errprint(
                '{}, as percents value, out of range [0; 100]\nExit'.format(
                    arg_key))
            exit(1)
        mem_min_kb = mem_min_percent / 100 * mem_total
        mem_min_mb = round(mem_min_kb / 1024)
        return mem_min_kb, mem_min_mb, mem_min_percent

    if mem_min.endswith('M'):
        mem_min_mb = string_to_float_convert_test(mem_min[:-1].strip())
        if mem_min_mb is None:
            errprint('Invalid {} value, not float\nExit'.format(arg_key))
            exit(1)
        # Mirror the lower bound that is enforced for percent values.
        if mem_min_mb < 0:
            errprint('{} value can not be negative\nExit'.format(arg_key))
            exit(1)
        mem_min_kb = mem_min_mb * 1024
        if mem_min_kb > mem_total:
            errprint(
                '{} value can not be greater then MemTotal ({} MiB)\n'
                'Exit'.format(arg_key, round(mem_total / 1024)))
            exit(1)
        mem_min_percent = mem_min_kb / mem_total * 100
        return mem_min_kb, mem_min_mb, mem_min_percent

    log('Invalid {} units in config.\n Exit'.format(arg_key))
    exit(1)
|
||
|
||
|
||
##########################################################################
|
||
|
||
|
||
# {victim_id : {'time': timestamp, 'name': name}}
v_dict = dict()

start_time = time()

# Text printed for -h / --help.
# NOTE(review): column alignment follows the usual argparse layout; the
# exact spacing of the original text should be confirmed.
help_mess = """usage: nohang [-h] [-v] [-p] [-c CONFIG] [-cc CONFIG]

optional arguments:
  -h, --help            show this help message and exit
  -v, --version         print version
  -p, --print-proc-table
                        print table of processes with their badness values
  -c CONFIG, --config CONFIG
                        path to the config file, default values:
                        ./nohang.conf, /etc/nohang/nohang.conf
  -cc CONFIG, --check-config CONFIG
                        check and print config"""

SC_CLK_TCK = os.sysconf(os.sysconf_names['SC_CLK_TCK'])
SC_PAGESIZE = os.sysconf(os.sysconf_names['SC_PAGESIZE'])

conf_err_mess = 'Invalid config. Exit.'

# signals for which a handler is installed before monitoring starts
sig_list = [SIGTERM, SIGINT, SIGQUIT, SIGHUP]

# signal number -> printable name
sig_dict = {
    SIGKILL: 'SIGKILL',
    SIGINT: 'SIGINT',
    SIGQUIT: 'SIGQUIT',
    SIGHUP: 'SIGHUP',
    SIGTERM: 'SIGTERM'
}

self_pid = str(os.getpid())

self_uid = os.geteuid()

root = self_uid == 0

# prefer a helper located in the current directory
if os.path.exists('./nohang_notify_helper'):
    notify_helper_path = './nohang_notify_helper'
else:
    notify_helper_path = '/usr/sbin/nohang_notify_helper'

last_action_dict = dict()
last_action_dict['t'] = time()
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
# statistics of the corrective actions taken so far
stat_dict = {}

# placeholder, the real value is read from the config later
separate_log = False

cgroup_v1_index, cgroup_v2_index = find_cgroup_indexes()

self_oom_score_adj_min = '-600'
self_oom_score_adj_max = '-6'

write_self_oom_score_adj(self_oom_score_adj_min)

pid_list = get_pid_list()

# command line switches, possibly set while parsing argv
print_proc_table_flag = False
check_config_flag = False

# default config: a file in the working directory wins over the system one
config = (os.getcwd() + '/nohang.conf'
          if os.path.exists('./nohang.conf')
          else '/etc/nohang/nohang.conf')
|
||
|
||
|
||
|
||
|
||
# Command line handling.  `config` already holds the default path here, so
# no branch needs to resolve it again.
if len(argv) == 1:
    pass
elif len(argv) == 2:
    option = argv[1]
    if option in ('--help', '-h'):
        print(help_mess)
        exit()
    elif option in ('--check-config', '-cc'):
        check_config_flag = True
    elif option in ('--version', '-v'):
        print_version()
    elif option in ('--print-proc-table', '-p'):
        print_proc_table_flag = True
    else:
        errprint('Unknown option: {}'.format(option))
        exit(1)
elif len(argv) == 3:
    option = argv[1]
    if option in ('--config', '-c'):
        config = argv[2]
    elif option in ('--check-config', '-cc'):
        config = argv[2]
        check_config_flag = True
    else:
        errprint('Unknown option: {}'.format(option))
        exit(1)
else:
    errprint('Invalid CLI input: too many options')
    exit(1)
|
||
|
||
|
||
# Read /proc/meminfo once to get MemTotal and to remember on which lines
# SwapTotal and SwapFree are located.
with open('/proc/meminfo') as f:
    mem_list = f.readlines()

mem_list_names = [s.split(':')[0] for s in mem_list]

# Only a warning: startup continues even without MemAvailable on line 3.
if mem_list_names[2] != 'MemAvailable':
    errprint('WARNING: Your Linux kernel is too old, Linux 3.14+ required')

swap_total_index = mem_list_names.index('SwapTotal')
# the SwapFree line is taken to be the one right after SwapTotal
swap_free_index = swap_total_index + 1

# first line is "MemTotal: <number> kB"; cut the trailing " kB\n"
mem_total = int(mem_list[0].split(':')[1][:-4])
|
||
|
||
# Remember on which lines of /proc/*/status the fields of interest are
# located, using our own status file as the template.
with open('/proc/self/status') as file:
    status_list = file.readlines()

status_names = [entry.split(':')[0] for entry in status_list]

ppid_index = status_names.index('PPid')
vm_size_index = status_names.index('VmSize')
vm_rss_index = status_names.index('VmRSS')
vm_swap_index = status_names.index('VmSwap')
uid_index = status_names.index('Uid')
state_index = status_names.index('State')

# The RssAnon/RssFile/RssShmem fields may be absent from the status file.
try:
    anon_index = status_names.index('RssAnon')
    file_index = status_names.index('RssFile')
    shmem_index = status_names.index('RssShmem')
except ValueError:
    detailed_rss = False
else:
    detailed_rss = True

log('config: ' + config)
|
||
|
||
|
||
##########################################################################
|
||
|
||
# parsing the config with obtaining the parameters dictionary
|
||
|
||
# conf_parameters_dict
|
||
# conf_restart_dict
|
||
|
||
# plain "key = value" options read from the config
config_dict = dict()

# (badness_adj, regexp) pairs collected from @BADNESS_ADJ_RE_* lines
badness_adj_re_name_list = []
badness_adj_re_cmdline_list = []
badness_adj_re_environ_list = []
badness_adj_re_uid_list = []
badness_adj_re_cgroup_v1_list = []
badness_adj_re_cgroup_v2_list = []
badness_adj_re_realpath_list = []

# (kind, regexp, command) triples collected from @SOFT_ACTION_RE_* lines
soft_actions_list = []

# separator for optional parameters (that starts with @)
opt_separator = '///'

# directive prefix -> kind stored in soft_actions_list
_soft_action_directives = (
    ('@SOFT_ACTION_RE_NAME', 'name'),
    ('@SOFT_ACTION_RE_CGROUP_V1', 'cgroup_v1'),
)

# directive prefix -> list that receives the parsed rule
_badness_directives = (
    ('@BADNESS_ADJ_RE_NAME', badness_adj_re_name_list),
    ('@BADNESS_ADJ_RE_CMDLINE', badness_adj_re_cmdline_list),
    ('@BADNESS_ADJ_RE_UID', badness_adj_re_uid_list),
    ('@BADNESS_ADJ_RE_CGROUP_V1', badness_adj_re_cgroup_v1_list),
    ('@BADNESS_ADJ_RE_CGROUP_V2', badness_adj_re_cgroup_v2_list),
    ('@BADNESS_ADJ_RE_REALPATH', badness_adj_re_realpath_list),
    ('@BADNESS_ADJ_RE_ENVIRON', badness_adj_re_environ_list),
)

# lines starting with one of these are never treated as "key = value"
_non_option_prefixes = (
    '#', '\n', '\t', ' ',
    '@SOFT_ACTION_RE_NAME', '@SOFT_ACTION_RE_CGROUP_V1',
)

try:
    with open(config) as f:
        for line in f:

            # NOTE(review): @BADNESS_ADJ_RE_* lines are not excluded here,
            # so they are also stored in config_dict and two identical
            # rule lines are reported as a key duplication.
            if not line.startswith(_non_option_prefixes):
                key, _sep, value = line.partition('=')
                key = key.strip()
                if key in config_dict:
                    log('ERROR: config key duplication: {}'.format(key))
                    exit(1)
                config_dict[key] = value.strip()

            for prefix, kind in _soft_action_directives:
                if line.startswith(prefix):
                    parts = line.partition(prefix)[2].partition(
                        opt_separator)
                    pattern = parts[0].strip()
                    valid_re(pattern)
                    soft_actions_list.append(
                        (kind, pattern, parts[2].strip()))

            for prefix, rules in _badness_directives:
                if line.startswith(prefix):
                    parts = line.partition(prefix)[2].strip(
                        ' \n').partition(opt_separator)
                    badness_adj = parts[0].strip(' ')
                    reg_exp = parts[2].strip(' ')
                    valid_re(reg_exp)
                    rules.append((badness_adj, reg_exp))

except PermissionError:
    errprint('PermissionError', conf_err_mess)
    exit(1)
except UnicodeDecodeError:
    errprint('UnicodeDecodeError', conf_err_mess)
    exit(1)
except IsADirectoryError:
    errprint('IsADirectoryError', conf_err_mess)
    exit(1)
except IndexError:
    errprint('IndexError', conf_err_mess)
    exit(1)
except FileNotFoundError:
    errprint('FileNotFoundError', conf_err_mess)
    exit(1)
|
||
|
||
|
||
# Each flag is True only when at least one rule of that kind was found in
# the config.
regex_matching = bool(badness_adj_re_name_list)
re_match_cmdline = bool(badness_adj_re_cmdline_list)
re_match_uid = bool(badness_adj_re_uid_list)
re_match_environ = bool(badness_adj_re_environ_list)
re_match_realpath = bool(badness_adj_re_realpath_list)
re_match_cgroup_v1 = bool(badness_adj_re_cgroup_v1_list)
re_match_cgroup_v2 = bool(badness_adj_re_cgroup_v2_list)
soft_actions = bool(soft_actions_list)
|
||
|
||
|
||
##########################################################################
|
||
|
||
|
||
# post_zombie_delay = 0.1
|
||
|
||
# victim_cache_time = 50
|
||
|
||
|
||
# Pull the boolean options out of config_dict; conf_parse_bool() is called
# once per option, in this order.
debug_psi = conf_parse_bool('debug_psi')
print_statistics = conf_parse_bool('print_statistics')
print_proc_table = conf_parse_bool('print_proc_table')
forbid_negative_badness = conf_parse_bool('forbid_negative_badness')
print_victim_status = conf_parse_bool('print_victim_status')
print_victim_cmdline = conf_parse_bool('print_victim_cmdline')
print_config_at_startup = conf_parse_bool('print_config_at_startup')
print_mem_check_results = conf_parse_bool('print_mem_check_results')
debug_sleep = conf_parse_bool('debug_sleep')
low_memory_warnings_enabled = conf_parse_bool(
    'low_memory_warnings_enabled')
post_action_gui_notifications = conf_parse_bool(
    'post_action_gui_notifications')

psi_checking_enabled = conf_parse_bool('psi_checking_enabled')
ignore_psi = not psi_checking_enabled

zram_checking_enabled = conf_parse_bool('zram_checking_enabled')
ignore_zram = not zram_checking_enabled

debug_gui_notifications = conf_parse_bool('debug_gui_notifications')
ignore_positive_oom_score_adj = conf_parse_bool(
    'ignore_positive_oom_score_adj')

# Every memory / zram threshold is kept in KiB, MiB and percent.
_levels = calculate_percent('soft_threshold_min_mem')
soft_threshold_min_mem_kb = _levels[0]
soft_threshold_min_mem_mb = _levels[1]
soft_threshold_min_mem_percent = _levels[2]

_levels = calculate_percent('hard_threshold_min_mem')
hard_threshold_min_mem_kb = _levels[0]
hard_threshold_min_mem_mb = _levels[1]
hard_threshold_min_mem_percent = _levels[2]

_levels = calculate_percent('soft_threshold_max_zram')
soft_threshold_max_zram_kb = _levels[0]
soft_threshold_max_zram_mb = _levels[1]
soft_threshold_max_zram_percent = _levels[2]

_levels = calculate_percent('hard_threshold_max_zram')
hard_threshold_max_zram_kb = _levels[0]
hard_threshold_max_zram_mb = _levels[1]
hard_threshold_max_zram_percent = _levels[2]

_levels = calculate_percent('warning_threshold_min_mem')
warning_threshold_min_mem_kb = _levels[0]
warning_threshold_min_mem_mb = _levels[1]
warning_threshold_min_mem_percent = _levels[2]

_levels = calculate_percent('warning_threshold_max_zram')
warning_threshold_max_zram_kb = _levels[0]
warning_threshold_max_zram_mb = _levels[1]
warning_threshold_max_zram_percent = _levels[2]
|
||
|
||
|
||
# Each option below is mandatory and must parse as a float; any violation
# is reported and terminates the program.

# post_zombie_delay: non-negative
if 'post_zombie_delay' not in config_dict:
    errprint('post_zombie_delay not in config\nExit')
    exit(1)
post_zombie_delay = string_to_float_convert_test(
    config_dict['post_zombie_delay'])
if post_zombie_delay is None:
    errprint('Invalid post_zombie_delay, not float\nExit')
    exit(1)
if post_zombie_delay < 0:
    errprint('post_zombie_delay MUST be >= 0\nExit')
    exit(1)

# victim_cache_time: non-negative
if 'victim_cache_time' not in config_dict:
    errprint('victim_cache_time not in config\nExit')
    exit(1)
victim_cache_time = string_to_float_convert_test(
    config_dict['victim_cache_time'])
if victim_cache_time is None:
    errprint('Invalid victim_cache_time, not float\nExit')
    exit(1)
if victim_cache_time < 0:
    errprint('victim_cache_time MUST be >= 0\nExit')
    exit(1)

# fill_rate_mem: strictly positive
if 'fill_rate_mem' not in config_dict:
    errprint('fill_rate_mem not in config\nExit')
    exit(1)
fill_rate_mem = string_to_float_convert_test(config_dict['fill_rate_mem'])
if fill_rate_mem is None:
    errprint('Invalid fill_rate_mem value, not float\nExit')
    exit(1)
if fill_rate_mem <= 0:
    errprint('fill_rate_mem MUST be > 0\nExit')
    exit(1)

# fill_rate_swap: strictly positive
if 'fill_rate_swap' not in config_dict:
    errprint('fill_rate_swap not in config\nExit')
    exit(1)
fill_rate_swap = string_to_float_convert_test(config_dict['fill_rate_swap'])
if fill_rate_swap is None:
    errprint('Invalid fill_rate_swap value, not float\nExit')
    exit(1)
if fill_rate_swap <= 0:
    errprint('fill_rate_swap MUST be > 0\nExit')
    exit(1)

# fill_rate_zram: strictly positive
if 'fill_rate_zram' not in config_dict:
    errprint('fill_rate_zram not in config\nExit')
    exit(1)
fill_rate_zram = string_to_float_convert_test(config_dict['fill_rate_zram'])
if fill_rate_zram is None:
    errprint('Invalid fill_rate_zram value, not float\nExit')
    exit(1)
if fill_rate_zram <= 0:
    errprint('fill_rate_zram MUST be > 0\nExit')
    exit(1)
|
||
|
||
|
||
# The swap thresholds are kept as raw strings here; they are converted
# later by get_swap_threshold_tuple().
if 'soft_threshold_min_swap' not in config_dict:
    errprint('soft_threshold_min_swap not in config\nExit')
    exit(1)
soft_threshold_min_swap = config_dict['soft_threshold_min_swap']

if 'hard_threshold_min_swap' not in config_dict:
    errprint('hard_threshold_min_swap not in config\nExit')
    exit(1)
hard_threshold_min_swap = config_dict['hard_threshold_min_swap']
|
||
|
||
|
||
# Both delays are mandatory floats.  Zero is accepted, so the range error
# says "non-negative" rather than "positive".

if 'post_soft_action_delay' not in config_dict:
    errprint('post_soft_action_delay not in config\nExit')
    exit(1)
post_soft_action_delay = string_to_float_convert_test(
    config_dict['post_soft_action_delay'])
if post_soft_action_delay is None:
    errprint('Invalid post_soft_action_delay value, not float\nExit')
    exit(1)
if post_soft_action_delay < 0:
    errprint('post_soft_action_delay must be non-negative number\nExit')
    exit(1)

if 'psi_post_action_delay' not in config_dict:
    errprint('psi_post_action_delay not in config\nExit')
    exit(1)
psi_post_action_delay = string_to_float_convert_test(
    config_dict['psi_post_action_delay'])
if psi_post_action_delay is None:
    errprint('Invalid psi_post_action_delay value, not float\nExit')
    exit(1)
if psi_post_action_delay < 0:
    errprint('psi_post_action_delay must be non-negative number\nExit')
    exit(1)
|
||
|
||
|
||
# PSI thresholds: mandatory floats within [0; 100].

if 'hard_threshold_max_psi' not in config_dict:
    errprint('hard_threshold_max_psi not in config\nExit')
    exit(1)
hard_threshold_max_psi = string_to_float_convert_test(
    config_dict['hard_threshold_max_psi'])
if hard_threshold_max_psi is None:
    errprint('Invalid hard_threshold_max_psi value, not float\nExit')
    exit(1)
if hard_threshold_max_psi < 0 or hard_threshold_max_psi > 100:
    errprint('hard_threshold_max_psi must be in the range [0; 100]\nExit')
    exit(1)

if 'soft_threshold_max_psi' not in config_dict:
    errprint('soft_threshold_max_psi not in config\nExit')
    exit(1)
soft_threshold_max_psi = string_to_float_convert_test(
    config_dict['soft_threshold_max_psi'])
if soft_threshold_max_psi is None:
    errprint('Invalid soft_threshold_max_psi value, not float\nExit')
    exit(1)
if soft_threshold_max_psi < 0 or soft_threshold_max_psi > 100:
    errprint('soft_threshold_max_psi must be in the range [0; 100]\nExit')
    exit(1)

if 'warning_threshold_max_psi' not in config_dict:
    errprint('warning_threshold_max_psi not in config\nExit')
    exit(1)
warning_threshold_max_psi = string_to_float_convert_test(
    config_dict['warning_threshold_max_psi'])
if warning_threshold_max_psi is None:
    errprint('Invalid warning_threshold_max_psi value, not float\nExit')
    exit(1)
if warning_threshold_max_psi < 0 or warning_threshold_max_psi > 100:
    errprint('warning_threshold_max_psi must be in the range [0; 100]\nExit')
    exit(1)
|
||
|
||
|
||
# min_badness: mandatory integer within [0; 1000]
if 'min_badness' not in config_dict:
    errprint('min_badness not in config\nExit')
    exit(1)
min_badness = string_to_int_convert_test(config_dict['min_badness'])
if min_badness is None:
    errprint('Invalid min_badness value, not integer\nExit')
    exit(1)
if min_badness < 0 or min_badness > 1000:
    errprint('Invalid min_badness value\nExit')
    exit(1)

# min_post_warning_delay: mandatory float within [1; 300]
if 'min_post_warning_delay' not in config_dict:
    errprint('min_post_warning_delay not in config\nExit')
    exit(1)
min_post_warning_delay = string_to_float_convert_test(
    config_dict['min_post_warning_delay'])
if min_post_warning_delay is None:
    errprint('Invalid min_post_warning_delay value, not float\nExit')
    exit(1)
if min_post_warning_delay < 1 or min_post_warning_delay > 300:
    errprint('min_post_warning_delay value out of range [1; 300]\nExit')
    exit(1)

# warning_threshold_min_swap: kept as a raw string, converted later
if 'warning_threshold_min_swap' not in config_dict:
    errprint('warning_threshold_min_swap not in config\nExit')
    exit(1)
warning_threshold_min_swap = config_dict['warning_threshold_min_swap']

# max_victim_ancestry_depth: mandatory integer, at least 1
if 'max_victim_ancestry_depth' not in config_dict:
    errprint('max_victim_ancestry_depth is not in config\nExit')
    exit(1)
max_victim_ancestry_depth = string_to_int_convert_test(
    config_dict['max_victim_ancestry_depth'])
# The parsed value itself must be tested here; testing min_badness would
# let a non-integer value through to the comparison below.
if max_victim_ancestry_depth is None:
    errprint('Invalid max_victim_ancestry_depth value, not integer\nExit')
    exit(1)
if max_victim_ancestry_depth < 1:
    errprint('Invalid max_victim_ancestry_depth value\nExit')
    exit(1)
|
||
|
||
|
||
# max_soft_exit_time: mandatory non-negative float
if 'max_soft_exit_time' not in config_dict:
    errprint('max_soft_exit_time is not in config\nExit')
    exit(1)
max_soft_exit_time = string_to_float_convert_test(
    config_dict['max_soft_exit_time'])
if max_soft_exit_time is None:
    errprint('Invalid max_soft_exit_time value, not float\nExit')
    exit(1)
if max_soft_exit_time < 0:
    errprint('max_soft_exit_time must be non-negative number\nExit')
    exit(1)

# mandatory string options, taken as they are
if 'post_kill_exe' not in config_dict:
    errprint('post_kill_exe is not in config\nExit')
    exit(1)
post_kill_exe = config_dict['post_kill_exe']

if 'psi_path' not in config_dict:
    errprint('psi_path is not in config\nExit')
    exit(1)
psi_path = config_dict['psi_path']

if 'psi_metrics' not in config_dict:
    errprint('psi_metrics is not in config\nExit')
    exit(1)
psi_metrics = config_dict['psi_metrics']

# warning_exe: an empty value switches the check off
if 'warning_exe' not in config_dict:
    errprint('warning_exe is not in config\nExit')
    exit(1)
warning_exe = config_dict['warning_exe']
check_warning_exe = warning_exe != ''

# extra_table_info: must be one of the known column names
if 'extra_table_info' not in config_dict:
    errprint('Invalid config: extra_table_info is not in config\nExit')
    exit(1)
extra_table_info = config_dict['extra_table_info']
if extra_table_info not in (
        'None', 'cgroup_v1', 'cgroup_v2', 'cmdline', 'environ', 'realpath'):
    errprint('Invalid config: invalid extra_table_info value\nExit')
    exit(1)
|
||
|
||
|
||
separate_log = conf_parse_bool('separate_log')

if separate_log:

    # logging is imported only when a separate log file is requested
    import logging
    from logging import basicConfig, info

    log_dir = '/var/log/nohang'

    # create the directory; an already existing one is fine
    try:
        os.mkdir(log_dir)
    except FileExistsError:
        pass
    except PermissionError:
        print('ERROR: can not create log dir')

    logfile = log_dir + '/nohang.log'

    # probe that the log file can be opened for appending
    try:
        with open(logfile, 'a') as f:
            pass
    except PermissionError:
        print('ERROR: log PermissionError')
    except FileNotFoundError:
        print('ERROR: log FileNotFoundError')

    try:
        basicConfig(
            filename=logfile,
            level=logging.INFO,
            format="%(asctime)s: %(message)s")
    except FileNotFoundError:
        errprint('ERROR: FileNotFoundError: {}'.format(logfile))
    except PermissionError:
        errprint('ERROR: Permission denied: {}'.format(logfile))
|
||
|
||
|
||
# Both options are mandatory non-negative floats.

if 'min_mem_report_interval' not in config_dict:
    errprint('min_mem_report_interval is not in config\nExit')
    exit(1)
min_mem_report_interval = string_to_float_convert_test(
    config_dict['min_mem_report_interval'])
if min_mem_report_interval is None:
    errprint('Invalid min_mem_report_interval value, not float\nExit')
    exit(1)
if min_mem_report_interval < 0:
    errprint('min_mem_report_interval must be non-negative number\nExit')
    exit(1)

if 'psi_excess_duration' not in config_dict:
    errprint('psi_excess_duration is not in config\nExit')
    exit(1)
psi_excess_duration = string_to_float_convert_test(
    config_dict['psi_excess_duration'])
if psi_excess_duration is None:
    errprint('Invalid psi_excess_duration value, not float\nExit')
    exit(1)
if psi_excess_duration < 0:
    errprint('psi_excess_duration must be non-negative number\nExit')
    exit(1)
|
||
|
||
|
||
# Sleep settings: three mandatory, strictly positive floats that must
# satisfy over_sleep <= min_sleep <= max_sleep.

if 'max_sleep' not in config_dict:
    errprint('max_sleep is not in config\nExit')
    exit(1)
max_sleep = string_to_float_convert_test(config_dict['max_sleep'])
if max_sleep is None:
    errprint('Invalid max_sleep value, not float\nExit')
    exit(1)
if max_sleep <= 0:
    errprint('max_sleep must be positive number\nExit')
    exit(1)

if 'min_sleep' not in config_dict:
    errprint('min_sleep is not in config\nExit')
    exit(1)
min_sleep = string_to_float_convert_test(config_dict['min_sleep'])
if min_sleep is None:
    errprint('Invalid min_sleep value, not float\nExit')
    exit(1)
if min_sleep <= 0:
    errprint('min_sleep must be positive number\nExit')
    exit(1)

if 'over_sleep' not in config_dict:
    errprint('over_sleep is not in config\nExit')
    exit(1)
over_sleep = string_to_float_convert_test(config_dict['over_sleep'])
if over_sleep is None:
    errprint('Invalid over_sleep value, not float\nExit')
    exit(1)
if over_sleep <= 0:
    errprint('over_sleep must be positive number\nExit')
    exit(1)

sensitivity_test_time = over_sleep / 2

if max_sleep < min_sleep:
    errprint('min_sleep value must not exceed max_sleep value.\nExit')
    exit(1)

if min_sleep < over_sleep:
    errprint('over_sleep value must not exceed min_sleep value.\nExit')
    exit(1)

# equal bounds mean a constant pause between checks
stable_sleep = max_sleep == min_sleep
|
||
|
||
|
||
# -p / --print-proc-table: hand over to the table printer
if print_proc_table_flag:
    if root is False:
        log('WARNING: effective UID != 0; euid={}; processes with other '
            'euids will be invisible for nohang'.format(self_uid))
    func_print_proc_table()

# PSI is considered supported when the configured file exists
psi_support = os.path.exists(psi_path)
|
||
|
||
|
||
##########################################################################
|
||
|
||
# Convert the swap thresholds.  Element [1] of each tuple tells whether
# the value in element [0] is a percentage; otherwise it is stored as a
# KiB level and also recorded in swap_kb_dict.

soft_threshold_min_swap_tuple = get_swap_threshold_tuple(
    soft_threshold_min_swap)
hard_threshold_min_swap_tuple = get_swap_threshold_tuple(
    hard_threshold_min_swap)
warning_threshold_min_swap_tuple = get_swap_threshold_tuple(
    warning_threshold_min_swap)

swap_kb_dict = dict()

swap_term_is_percent = soft_threshold_min_swap_tuple[1]
if not swap_term_is_percent:
    soft_threshold_min_swap_kb = soft_threshold_min_swap_tuple[0]
    swap_kb_dict['soft_threshold_min_swap_kb'] = soft_threshold_min_swap_kb
else:
    soft_threshold_min_swap_percent = soft_threshold_min_swap_tuple[0]

swap_kill_is_percent = hard_threshold_min_swap_tuple[1]
if not swap_kill_is_percent:
    hard_threshold_min_swap_kb = hard_threshold_min_swap_tuple[0]
    swap_kb_dict['hard_threshold_min_swap_kb'] = hard_threshold_min_swap_kb
else:
    hard_threshold_min_swap_percent = hard_threshold_min_swap_tuple[0]

swap_warn_is_percent = warning_threshold_min_swap_tuple[1]
if not swap_warn_is_percent:
    warning_threshold_min_swap_kb = warning_threshold_min_swap_tuple[0]
    swap_kb_dict[
        'warning_threshold_min_swap_kb'] = warning_threshold_min_swap_kb
else:
    warning_threshold_min_swap_percent = warning_threshold_min_swap_tuple[0]
|
||
|
||
|
||
##########################################################################
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
|
||
# print the effective configuration when asked for
if check_config_flag or print_config_at_startup:
    check_config()

# column width used when printing mem and zram sizes (in MiB)
mem_len = len(str(round(mem_total / 1024.0)))

if post_action_gui_notifications:
    notify_sig_dict = {
        SIGKILL: 'Killing',
        SIGTERM: 'Terminating',
    }

# the config gives rates in MiB/s, internally KiB/s are used
fill_rate_mem *= 1024
fill_rate_swap *= 1024
fill_rate_zram *= 1024

warn_time_now = 0
warn_time_delta = 1000
warn_timer = 0
|
||
|
||
|
||
##########################################################################
|
||
|
||
|
||
if root is False:
    log('WARNING: effective UID != 0; euid={}; processes with other '
        'euids will be invisible for nohang'.format(self_uid))

# Try to lock all memory
mlockall()

# stays empty unless PSI monitoring fills it in
psi_avg_string = ''

mem_used_zram = 0

if print_mem_check_results:
    # previous timestamp and memory level, used to find delta mem
    wt2 = 0
    new_mem = 0
    # time of the last memory report
    report0 = 0

# install the handler for every signal in sig_list
for signum in sig_list:
    signal(signum, signal_handler)

x0 = time()
delta0 = 0

threshold = None
mem_info = None

# PSI is checked only when it is both available and not disabled
CHECK_PSI = bool(psi_support and not ignore_psi)

psi_kill_exceeded_timer = 0
psi_term_exceeded_timer = 0
psi_t0 = time()

psi_threshold = None
zram_threshold = None
zram_info = None
psi_info = None

CHECK_ZRAM = not ignore_zram

log('Monitoring has started!')

stdout.flush()
|
||
|
||
|
||
##########################################################################
|
||
|
||
|
||
# Main monitoring loop: sample memory/swap (and optionally zram and PSI),
# optionally log a periodic memory report, take corrective action when a
# SIGKILL or SIGTERM threshold is reported, otherwise optionally send a
# low-memory warning and sleep until the next check.
while True:

    # Memory and swap are checked on every iteration.
    (masf_threshold, masf_info, mem_available, hard_threshold_min_swap_kb,
     soft_threshold_min_swap_kb, swap_free, swap_total) = check_mem_swap_ex()

    if CHECK_ZRAM:
        zram_threshold, zram_info, mem_used_zram = check_zram_ex()

    if CHECK_PSI:
        # The PSI timers and x0 are passed in and replaced by the values
        # returned from the check.
        (psi_threshold, psi_info, psi_t0, psi_kill_exceeded_timer,
         psi_term_exceeded_timer, x0) = check_psi_ex(
            psi_t0, psi_kill_exceeded_timer, psi_term_exceeded_timer, x0)

    if print_mem_check_results:

        if CHECK_PSI:
            psi_avg_value = find_psi_metrics_value(psi_path, psi_metrics)
            psi_post_action_delay_exceeded = (
                time() - psi_t0 >= psi_post_action_delay)
            # Prefix prepended to the memory report line.
            psi_avg_string = 'PSI avg: {} | '.format(
                str(psi_avg_value).rjust(6))

        wt1 = time()

        # Change of (available memory + free swap) since the last report.
        delta = (mem_available + swap_free) - new_mem

        t_cycle = wt1 - wt2

        report_delta = wt1 - report0

        # Report only once at least min_mem_report_interval has elapsed
        # since the previous report; a report resets the reference values.
        mem_report = report_delta >= min_mem_report_interval
        if mem_report:
            new_mem = mem_available + swap_free
            report0 = wt1

        wt2 = time()

        if mem_report:

            # Rate of change per second, scaled by 1/1024 (logged as M/s).
            speed = delta / 1024.0 / report_delta
            speed_info = ' | dMem: {} M/s'.format(str(round(speed)).rjust(5))

            # Calculate 'swap-column' width
            swap_len = len(str(round(swap_total / 1024.0)))

            # Output available mem sizes
            if swap_total == 0 and mem_used_zram == 0:
                # No swap and no zram usage: memory column only.
                log('{}MemAvail: {} M, {} %{}'.format(
                    psi_avg_string,
                    human(mem_available, mem_len),
                    just_percent_mem(mem_available / mem_total),
                    speed_info))

            elif swap_total > 0 and mem_used_zram == 0:
                # Swap present, no zram usage: memory and swap columns.
                # The + 0.1 keeps the divisor non-zero.
                log('{}MemAvail: {} M, {} % | SwapFree: {} M, {} %{}'.format(
                    psi_avg_string,
                    human(mem_available, mem_len),
                    just_percent_mem(mem_available / mem_total),
                    human(swap_free, swap_len),
                    just_percent_swap(swap_free / (swap_total + 0.1)),
                    speed_info))

            else:
                # Remaining cases: memory, swap and zram columns.
                log('{}MemAvail: {} M, {} % | SwapFree: {} M, {} % | Mem'
                    'UsedZram: {} M, {} %{}'.format(
                        psi_avg_string,
                        human(mem_available, mem_len),
                        just_percent_mem(mem_available / mem_total),
                        human(swap_free, swap_len),
                        just_percent_swap(swap_free / (swap_total + 0.1)),
                        human(mem_used_zram, mem_len),
                        just_percent_mem(mem_used_zram / mem_total),
                        speed_info))

    # Pick the signal to act on. SIGKILL is tested first, so it wins over
    # SIGTERM when different checks report different thresholds.
    corrective_signal = None
    for candidate_signal in (SIGKILL, SIGTERM):
        if (masf_threshold is candidate_signal or
                zram_threshold is candidate_signal or
                psi_threshold is candidate_signal):
            corrective_signal = candidate_signal
            break

    if corrective_signal is not None:

        threshold = corrective_signal

        # Collect the info produced by each check, skipping checks that
        # produced none (order: mem/swap, zram, PSI).
        mem_info_list = [
            info for info in (masf_info, zram_info, psi_info)
            if info is not None
        ]

        psi_t0 = implement_corrective_action(
            threshold,
            mem_info_list,
            psi_t0,
            psi_kill_exceeded_timer,
            psi_term_exceeded_timer,
            x0, psi_threshold, zram_threshold, zram_info, psi_info)

        # Re-check immediately: the warning and sleep steps are skipped.
        continue

    if low_memory_warnings_enabled and (
            masf_threshold == 'WARN' or zram_threshold == 'WARN' or
            psi_threshold == 'WARN'):

        # Accumulate the time elapsed since the previous WARN observation
        # and notify once it exceeds min_post_warning_delay.
        warn_time_delta = time() - warn_time_now
        warn_time_now = time()
        warn_timer += warn_time_delta

        if warn_timer > min_post_warning_delay:
            send_notify_warn()
            warn_timer = 0

    sleep_after_check_mem()
|