nohang/nohang/nohang
2020-05-03 00:36:33 +09:00

3973 lines
115 KiB
Python
Executable File

#!/usr/bin/env python3
"""A sophisticated low memory handler."""
import os
from ctypes import CDLL
from time import sleep, monotonic, process_time
from operator import itemgetter
from sys import stdout, stderr, argv, exit
from re import search
from sre_constants import error as invalid_re
from signal import signal, SIGKILL, SIGTERM, SIGINT, SIGQUIT, SIGHUP, SIGUSR1
def missing_config_key(key):
    """Report that a required config key is absent and terminate.

    key: name of the missing key (inserted into the error message).
    The message is emitted through errprint() and the process exits with
    status 1.
    """
    message = 'ERROR: invalid config: missing key "{}"'.format(key)
    errprint(message)
    exit(1)
def invalid_config_key_value(key):
    """Report that a config key holds an unacceptable value and terminate.

    key: name of the offending key (inserted into the error message).
    The message is emitted through errprint() and the process exits with
    status 1.
    """
    message = 'ERROR: invalid config: invalid "{}" value'.format(key)
    errprint(message)
    exit(1)
def check_permissions():
    """Probe the access rights the program needs at startup.

    Two probes only produce a warning when they raise: resolving
    /proc/1/exe and sending signal 0 to PID 1. The third probe, reading
    /proc/1/oom_score, is mandatory: on failure the error is printed and
    the process exits with status 1.
    """
    soft_probes = (
        ('WARNING: missing CAP_SYS_PTRACE: {}',
         lambda: os.path.realpath('/proc/1/exe')),
        ('WARNING: cannot send a signal: {}',
         lambda: os.kill(1, 0)),
    )
    for template, probe in soft_probes:
        try:
            probe()
        except Exception as err:
            print(template.format(err))

    try:
        rline1('/proc/1/oom_score')
    except Exception as err:
        print('ERROR: {}'.format(err))
        exit(1)
def memload():
"""Interactively consume memory until about 40 MiB remain, then stop.

Steps performed, in order:
1. Read /proc/meminfo once; exit with status 1 unless its third field
is MemAvailable. Locate the SwapTotal line; SwapFree is taken to be
the line right after it.
2. Compare /proc/1/loginuid with /proc/self/loginuid and exit with
status 1 when they are equal.
3. Ask for confirmation; anything but the exact answer 'Yes' exits.
4. Loop: while MemAvailable + SwapFree is above 40 MiB, allocate one
more 50 KiB bytearray; otherwise send SIGUSR1 to this process.
KeyboardInterrupt and MemoryError also end in a self-sent SIGUSR1.

NOTE(review): the indentation of this file was lost, so the nesting of
the statements below (e.g. whether the progress printing sits inside
the `else` branch of the loop) must be confirmed against upstream.
"""
with open('/proc/meminfo') as f:
mem_list = f.readlines()
# Keep only the field names (text before ':') of every meminfo line.
mem_list_names = []
for s in mem_list:
mem_list_names.append(s.split(':')[0])
# The code relies on MemAvailable being the third line of /proc/meminfo.
if mem_list_names[2] != 'MemAvailable':
errprint('Your Linux kernel is too old, Linux 3.14+ requied\nExit')
exit(1)
swap_total_index = mem_list_names.index('SwapTotal')
swap_free_index = swap_total_index + 1
# Local reader: shadows any module-level function of the same name while
# memload() runs.
def check_mem_and_swap():
"""find mem_available, swap_total, swap_free"""
with open('/proc/meminfo') as f:
for n, line in enumerate(f):
# [:-4] drops the trailing ' kB\n' before converting to int.
if n == 2:
mem_available = int(line.split(':')[1][:-4])
continue
if n == swap_total_index:
swap_total = int(line.split(':')[1][:-4])
continue
if n == swap_free_index:
swap_free = int(line.split(':')[1][:-4])
break
return mem_available, swap_total, swap_free
# Prints the two values converted from KiB to MiB; the string starts with
# the escape sequence ESC M.
def print_mem(mem_available, swap_free):
print('\033MMemAvailable: {} MiB, SwapFree: {} MiB '
' '.format(
round(mem_available / 1024),
round(swap_free / 1024)))
try:
luid_init = rline1('/proc/1/loginuid')
except Exception as e:
print(e)
exit(1)
luid_self = rline1('/proc/self/loginuid')
# Refuse to run when this process has the same login UID as init.
if luid_init == luid_self:
print('The option is available only for logged in users.')
print('Self loginuid: {}'.format(luid_self))
print('Init loginuid: {}'.format(luid_init))
print('Self login UID must not be equal to init login UID to continue.'
)
print('Exit')
exit(1)
try:
hi = 'Warning! The process will consume memory until 40 MiB of mem' \
'ory\n(MemAvailable + SwapFree) remain free, and it will be t' \
'erminated via SIGUSR1\nat the end. This may cause the system' \
' to freeze and processes to terminate.\nDo you want to conti' \
'nue? [No/Yes] '
inp = input(hi)
except KeyboardInterrupt:
print('KeyboardInterrupt\nExit')
exit(1)
# Only the exact answer 'Yes' starts the memory consumption.
if inp != 'Yes':
print('Exit')
exit()
else:
print('Memory consumption has started!\n')
# ex keeps references to every allocated chunk so they are never freed.
ex = []
# z holds the time of the last progress print.
z = monotonic()
self_pid = os.getpid()
while True:
try:
mem_available, swap_total, swap_free = check_mem_and_swap()
x = mem_available + swap_free
if x <= 1024 * 40: # 40 MiB
print_mem(mem_available, swap_free)
print('Self terminating by SIGUSR1')
os.kill(self_pid, SIGUSR1)
else:
ex.append(bytearray(1024 * 50)) # step size is 50 KiB
# Skip printing when the last print was 0.01 s ago or less.
u = monotonic() - z
if u <= 0.01:
continue
z = monotonic()
print_mem(mem_available, swap_free)
except KeyboardInterrupt:
print('KeyboardInterrupt')
print('Self terminating by the SIGUSR1 signal')
os.kill(self_pid, SIGUSR1)
except MemoryError:
print('MemoryError')
print('Self terminating by the SIGUSR1 signal')
os.kill(self_pid, SIGUSR1)
def arcstats():
    """Return the amount of ZFS ARC memory treated as reclaimable.

    Reads the file at arcstats_path and picks four rows by their
    precomputed line indexes (c_min, size, arc_meta_used, arc_meta_min).
    The last space-separated token of each row is converted to int and
    divided by 1024 (presumably bytes -> KiB; confirm against the
    arcstats format).

    Returns max(size - c_min, 0) + max(arc_meta_used - arc_meta_min, 0).
    """
    with open(arcstats_path, 'rb') as f:
        rows = f.read().decode().split('\n')

    for row_number, row in enumerate(rows):
        if row_number == c_min_index:
            c_min = int(row.rpartition(' ')[2]) / 1024
        elif row_number == size_index:
            size = int(row.rpartition(' ')[2]) / 1024
        elif row_number == arc_meta_used_index:
            arc_meta_used = int(row.rpartition(' ')[2]) / 1024
        elif row_number == arc_meta_min_index:
            arc_meta_min = int(row.rpartition(' ')[2]) / 1024

    # Negative differences are clamped to zero.
    data_reclaimable = max(size - c_min, 0)
    meta_reclaimable = max(arc_meta_used - arc_meta_min, 0)
    return data_reclaimable + meta_reclaimable
def exe(cmd):
    """Execute a command line in a subprocess with a timeout.

    cmd: command line string; it is split into an argument list with
        shlex.split() and run without a shell.

    The global command counter cmd_num_dict['cmd_num'] is incremented and
    used to label the log lines. The child is waited for at most
    exe_timeout seconds; on timeout it is killed. Any exception raised
    while spawning or waiting is logged, not propagated.
    """
    cmd_list = shlex.split(cmd)
    cmd_num_dict['cmd_num'] += 1
    cmd_num = cmd_num_dict['cmd_num']
    # Thread.getName() is deprecated since Python 3.10; .name is equivalent.
    th_name = threading.current_thread().name
    log('Executing Command-{} {} with timeout {}s in {}'.format(
        cmd_num,
        cmd_list,
        exe_timeout,
        th_name,
    ))
    t3 = monotonic()
    try:
        with Popen(cmd_list) as proc:
            try:
                proc.wait(timeout=exe_timeout)
                exit_status = proc.poll()
                t4 = monotonic()
                log('Command-{} execution completed in {} sec; exit status'
                    ': {}'.format(cmd_num, round(t4 - t3, 3), exit_status))
            except TimeoutExpired:
                proc.kill()
                log('Timeout expired for Command-{}'.format(cmd_num))
    except Exception as e:
        log('Exception in {}: {}'.format(th_name, e))
def start_thread(func, *a, **k):
    """Run func(*a, **k) in a new daemon thread.

    func: callable to run.
    a, k: positional and keyword arguments forwarded to func.

    Returns 1 when the thread could not be started (RuntimeError),
    otherwise None. Timing details are logged when debug_threading is set.
    """
    th = threading.Thread(target=func, args=a, kwargs=k, daemon=True)
    # Thread.getName() is deprecated since Python 3.10; .name is equivalent.
    th_name = th.name
    if debug_threading:
        log('Starting {} from {}'.format(
            th_name, threading.current_thread().name
        ))
    try:
        t1 = monotonic()
        th.start()
        t2 = monotonic()
        if debug_threading:
            log('{} has started in {} ms, {} threads are '
                'currently alive'.format(th_name, round((
                    t2 - t1) * 1000, 1), threading.active_count()))
    except RuntimeError:
        log('RuntimeError: cannot start {}'.format(th_name))
        return 1
def re_pid_environ(pid):
    """Extract the notification environment of one process.

    pid: str pid of the process whose /proc/[pid]/environ is read.

    Returns a tuple (user name, DISPLAY entry, DBUS entry), e.g.
        ('user', 'DISPLAY=:0',
         'DBUS_SESSION_BUS_ADDRESS=unix:path=/run/user/1000/bus')
    Returns None when the file cannot be read, when any of the three
    variables is missing, when HOME starts with /var, when the user is
    root, or when the DISPLAY entry is longer than 10 characters.
    """
    try:
        with open('/proc/' + pid + '/environ', 'rb') as f:
            env = f.read().decode('utf-8', 'ignore')
    except (FileNotFoundError, ProcessLookupError):
        return None

    # Cheap substring pre-check before splitting the whole environment.
    if not (display_env in env and dbus_env in env and user_env in env):
        return None

    user = display = dbus = None
    for entry in env.split('\x00'):
        # exclude Display Manager's user
        if entry.startswith('HOME=/var'):
            return None
        if entry.startswith(user_env):
            user = entry
            if user == 'USER=root':
                return None
        elif entry.startswith(display_env):
            # DISPLAY=:0.0 -> DISPLAY=:0
            display = entry[:-2] if entry[-2] == '.' else entry
            if len(display) > 10:
                # skip DISPLAY >= :10
                return None
        elif entry.startswith(dbus_env):
            dbus = entry

    # Any variable that never appeared as its own entry means "no result".
    if user is None or display is None or dbus is None:
        return None
    return user.partition('USER=')[2], display, dbus
def root_notify_env():
    """Collect notification environments of running processes.

    Every entry of /proc that is_alive() accepts is passed to
    re_pid_environ(). Returns a list of unique (user, display, dbus)
    tuples in which each user+display combination appears only once.
    """
    # iterates over processes, find processes with suitable env
    collected = [
        re_pid_environ(pid)
        for pid in os.listdir('/proc')
        if is_alive(pid)
    ]
    candidates = set(collected)
    candidates.discard(None)

    # deduplicate dbus: keep the first tuple seen per user+display
    unique_envs = []
    seen_keys = set()
    for candidate in candidates:
        key = candidate[0] + candidate[1]
        if key in seen_keys:
            continue
        seen_keys.add(key)
        unique_envs.append(candidate)
    return unique_envs
def pop(cmd):
    """Run an argument list in a subprocess with a timeout.

    cmd: command passed to Popen unchanged (callers in this file pass an
        argument list).

    The wait limit is 10 seconds when swap_total is 0 and 30 seconds
    otherwise. On timeout the child is killed. Completion and timeout
    messages are logged only when debug_gui_notifications is set; any
    exception raised while spawning or waiting is logged, not propagated.
    """
    cmd_num_dict['cmd_num'] += 1
    cmd_num = cmd_num_dict['cmd_num']
    if swap_total == 0:
        wait_time = 10
    else:
        wait_time = 30
    # Thread.getName() is deprecated since Python 3.10; .name is equivalent.
    th_name = threading.current_thread().name
    log('Executing Command-{} {} with timeout {}s in {}'.format(
        cmd_num,
        cmd,
        wait_time,
        th_name
    ))
    t3 = monotonic()
    try:
        with Popen(cmd) as proc:
            try:
                proc.wait(timeout=wait_time)
                err = proc.poll()
                t4 = monotonic()
                if debug_gui_notifications:
                    log('Command-{} execution completed in {} sec; exit status'
                        ': {}'.format(cmd_num, round(t4 - t3, 3), err))
            except TimeoutExpired:
                proc.kill()
                if debug_gui_notifications:
                    log('Timeout expired for Command-{}'.format(cmd_num))
    except Exception as e:
        log('Exception in {}: {}'.format(th_name, e))
def send_notification(title, body):
    """Show a desktop notification via notify-send.

    title, body: notification texts.

    When self_uid is not 0, notify-send is run directly and None is
    returned. Otherwise the list of GUI sessions comes from
    root_notify_env(), cached in envd for env_cache_time seconds, and one
    `sudo -u USER env ... notify-send` command per session is started in
    its own thread.
    """
    if self_uid != 0:
        pop(['notify-send', '--icon=dialog-warning', title, body])
        return None

    t1 = monotonic()
    # Refresh the cache on first use or once it is older than allowed.
    if envd['t'] is None or monotonic() - envd['t'] > env_cache_time:
        list_with_envs = root_notify_env()
        envd['list_with_envs'] = list_with_envs
        envd['t'] = monotonic()
        cached_env = ''
    else:
        list_with_envs = envd['list_with_envs']
        cached_env = ' (cached)'
    t2 = monotonic()

    if debug_gui_notifications:
        log('Found env in {} ms{}'.format(round((t2 - t1) * 1000), cached_env))
        log(' Title: {}'.format([title]))
        log(' Body: {}'.format([body]))
        log(' Env list: {}'.format(list_with_envs))

    # nobody logged in with GUI
    if len(list_with_envs) == 0:
        if debug_gui_notifications:
            log('Nobody logged-in with GUI. Nothing to do.')
        return

    # iterating over logged-in users
    for session in list_with_envs:
        username = session[0]
        display_value = session[1].partition('=')[2]
        dbus_value = session[2].partition('=')[2]
        start_thread(pop, [
            'sudo', '-u', username,
            'env',
            'DISPLAY=' + display_value,
            'DBUS_SESSION_BUS_ADDRESS=' + dbus_value,
            'notify-send',
            '--icon=dialog-warning',
            '--app-name=nohang',
            title,
            body
        ])
def send_notify_warn():
    """Implement low memory warnings.

    Logs that the warning threshold was exceeded, then either runs the
    configured warning_exe command (when check_warning_exe is set) or
    sends a fixed 'Low memory' GUI notification. Both run in a new thread.
    """
    log('Warning threshold exceeded')
    if check_warning_exe:
        start_thread(exe, warning_exe)
        return
    # The original carried a disabled alternative body reporting
    # MemAvail/SwapFree percentages; the fixed text below is what is sent.
    start_thread(
        send_notification,
        'Low memory',
        'Save your unsaved data!\nClose unused apps!')
def send_notify(threshold, name, pid):
    """Notify about a corrective action applied to a process.

    threshold: key for notify_sig_dict
    name: str process name
    pid: str process pid

    When hide_corrective_action_type is set a generic body is used;
    otherwise the body names the action, the pid and the process name.
    The notification is sent from a new thread.
    """
    if hide_corrective_action_type:
        body = 'Corrective action applied'
    else:
        action = notify_sig_dict[threshold]
        # symbol '&' can break notifications in some themes,
        # therefore it is replaced by '*'
        shown_name = name.replace('&', '*')
        body = '<b>{}</b> [{}] <b>{}</b>'.format(action, pid, shown_name)
    start_thread(send_notification, 'System hang prevention', body)
def send_notify_etc(pid, name, command):
    """Notify that a custom command is executed for a victim process.

    pid: str process pid
    name: str process name
    command: str command that will be executed

    When hide_corrective_action_type is set a generic body is used.
    '&' is replaced by '*' in both name and command before display.
    The notification is sent from a new thread.
    """
    if hide_corrective_action_type:
        body = 'Corrective action applied'
    else:
        shown_name = name.replace('&', '*')
        shown_command = command.replace('&', '*')
        body = (
            '<b>Victim is</b> [{}] <b>{}</b>\nExecute the command:\n<b>'
            '{}</b>'
        ).format(pid, shown_name, shown_command)
    start_thread(send_notification, 'System hang prevention', body)
def check_config():
    """Log every effective configuration value, grouped by section.

    All values are read from module-level settings. When
    check_config_flag is set, 'config is OK' is logged at the end and the
    process exits.
    """
    def show(template, *values):
        # Format one settings line and log it.
        log(template.format(*values))

    def show_badness_rules(rules):
        # Each rule is indexed as (badness_adj, regexp).
        if len(rules) > 0:
            log(' badness_adj: regexp:')
            for rule in rules:
                show(' {:>12} {}', rule[0], rule[1])
        else:
            log(' (not set)')

    log('\n1. Common zram settings')
    show(' zram_checking_enabled: {}', zram_checking_enabled)

    log('\n2. Common PSI settings')
    show(' psi_checking_enabled: {}', psi_checking_enabled)
    show(' psi_path: {}', psi_path)
    show(' psi_metrics: {}', psi_metrics)
    show(' psi_excess_duration: {} sec', psi_excess_duration)
    show(' psi_post_action_delay: {} sec', psi_post_action_delay)

    log('\n3. Poll rate')
    show(' fill_rate_mem: {}', fill_rate_mem)
    show(' fill_rate_swap: {}', fill_rate_swap)
    show(' fill_rate_zram: {}', fill_rate_zram)
    show(' max_sleep: {} sec', max_sleep)
    show(' min_sleep: {} sec', min_sleep)

    log('\n4. Warnings and notifications')
    show(' post_action_gui_notifications: {}',
         post_action_gui_notifications)
    show(' hide_corrective_action_type: {}', hide_corrective_action_type)
    show(' low_memory_warnings_enabled: {}', low_memory_warnings_enabled)
    show(' warning_exe: {}', warning_exe)
    show(' warning_threshold_min_mem: {} MiB, {} %',
         round(warning_threshold_min_mem_mb),
         round(warning_threshold_min_mem_percent, 1))
    show(' warning_threshold_min_swap: {}', warning_threshold_min_swap)
    show(' warning_threshold_max_zram: {} MiB, {} %',
         round(warning_threshold_max_zram_mb),
         round(warning_threshold_max_zram_percent, 1))
    show(' warning_threshold_max_psi: {}', warning_threshold_max_psi)
    show(' min_post_warning_delay: {} sec', min_post_warning_delay)
    show(' env_cache_time: {}', env_cache_time)

    log('\n5. Soft threshold')
    show(' soft_threshold_min_mem: {} MiB, {} %',
         round(soft_threshold_min_mem_mb),
         round(soft_threshold_min_mem_percent, 1))
    show(' soft_threshold_min_swap: {}', soft_threshold_min_swap)
    show(' soft_threshold_max_zram: {} MiB, {} %',
         round(soft_threshold_max_zram_mb),
         round(soft_threshold_max_zram_percent, 1))
    show(' soft_threshold_max_psi: {}', soft_threshold_max_psi)

    log('\n6. Hard threshold')
    show(' hard_threshold_min_mem: {} MiB, {} %',
         round(hard_threshold_min_mem_mb),
         round(hard_threshold_min_mem_percent, 1))
    show(' hard_threshold_min_swap: {}', hard_threshold_min_swap)
    show(' hard_threshold_max_zram: {} MiB, {} %',
         round(hard_threshold_max_zram_mb),
         round(hard_threshold_max_zram_percent, 1))
    show(' hard_threshold_max_psi: {}', hard_threshold_max_psi)

    log('\n7. Customize victim selection: adjusting badness of processes')
    log('\n7.1. Ignore positive oom_score_adj')
    show(' ignore_positive_oom_score_adj: {}', ignore_positive_oom_score_adj)
    log('\n7.2. Adjusting badness of processes by matching with '
        'regular expressions')
    log('7.2.1. Matching process names with RE patterns')
    show_badness_rules(badness_adj_re_name_list)
    log('7.2.2. Matching CGroup_v1-line with RE patterns')
    show_badness_rules(badness_adj_re_cgroup_v1_list)
    log('7.2.3. Matching CGroup_v2-line with RE patterns')
    show_badness_rules(badness_adj_re_cgroup_v2_list)
    log('7.2.4. Matching eUIDs with RE patterns')
    show_badness_rules(badness_adj_re_uid_list)
    log('7.2.5. Matching realpath with RE patterns')
    show_badness_rules(badness_adj_re_realpath_list)
    log('7.2.6. Matching cwd with RE patterns')
    show_badness_rules(badness_adj_re_cwd_list)
    log('7.2.7. Matching cmdlines with RE patterns')
    show_badness_rules(badness_adj_re_cmdline_list)
    log('7.2.8. Matching environ with RE patterns')
    show_badness_rules(badness_adj_re_environ_list)

    log('\n8. Customize soft corrective actions')
    if len(soft_actions_list) > 0:
        log(' Match by: regexp: command: ')
        for action in soft_actions_list:
            show(' {} {} {}',
                 action[0].ljust(10), action[1].ljust(12), action[2])
    else:
        log(' (not set)')

    log('\n9. Misc')
    show(' max_soft_exit_time: {} sec', max_soft_exit_time)
    show(' post_kill_exe: {}', post_kill_exe)
    show(' min_badness: {}', min_badness)
    show(' post_soft_action_delay: {} sec', post_soft_action_delay)
    show(' post_zombie_delay: {} sec', post_zombie_delay)
    show(' victim_cache_time: {} sec', victim_cache_time)
    show(' exe_timeout: {} sec', exe_timeout)

    log('\n10. Verbosity')
    show(' print_config_at_startup: {}', print_config_at_startup)
    show(' print_mem_check_results: {}', print_mem_check_results)
    show(' min_mem_report_interval: {} sec', min_mem_report_interval)
    show(' print_proc_table: {}', print_proc_table)
    show(' extra_table_info: {}', extra_table_info)
    show(' print_victim_status: {}', print_victim_status)
    show(' print_victim_cmdline: {}', print_victim_cmdline)
    show(' max_victim_ancestry_depth: {}', max_victim_ancestry_depth)
    show(' print_statistics: {}', print_statistics)
    show(' debug_gui_notifications: {}', debug_gui_notifications)
    show(' debug_psi: {}', debug_psi)
    show(' debug_sleep: {}', debug_sleep)
    show(' debug_threading: {}', debug_threading)
    show(' separate_log: {}', separate_log)

    if check_config_flag:
        log('\nconfig is OK')
        exit()
def get_swap_threshold_tuple(string, key):
    """Parse a swap threshold given either as 'N%' or as 'NM'.

    string: raw config value.
    key: config key name, used only for error reporting.

    Returns (value, True) for a percentage in the range 0..100 and
    (value, False) for a non-negative value with the 'M' suffix.
    Any other input is reported through invalid_config_key_value().
    """
    is_percent = string.endswith('%')
    if not is_percent and not string.endswith('M'):
        invalid_config_key_value(key)
        return None

    value = string_to_float_convert_test(string[:-1])
    out_of_range = value is None or value < 0
    if is_percent:
        # Percentages additionally have an upper bound.
        out_of_range = out_of_range or value > 100
    if out_of_range:
        invalid_config_key_value(key)
    return value, is_percent
def find_cgroup_indexes():
    """Find cgroup-line positions in the /proc/*/cgroup file.

    Scans /proc/self/cgroup and returns a tuple
    (cgroup_v1_index, cgroup_v2_index): the 0-based number of the last
    line containing ':name=' and of the last line starting with '0::'.
    An index is None when no such line exists.
    """
    v1_position = None
    v2_position = None
    with open('/proc/self/cgroup') as cgroup_file:
        for position, row in enumerate(cgroup_file):
            if ':name=' in row:
                v1_position = position
            if row.startswith('0::'):
                v2_position = position
    return v1_position, v2_position
def pid_to_rss(pid):
    """Return the second field of /proc/[pid]/statm times SC_PAGESIZE.

    pid: pid of the process.
    Returns None when the file is missing, the process is gone, or the
    line has no second field.
    """
    try:
        statm_fields = rline1('/proc/{}/statm'.format(pid)).split(' ')
        return int(statm_fields[1]) * SC_PAGESIZE
    except (IndexError, FileNotFoundError, ProcessLookupError):
        return None
def pid_to_vm_size(pid):
    """Return the first field of /proc/[pid]/statm times SC_PAGESIZE.

    pid: pid of the process.
    Returns None when the file is missing or the process is gone.
    """
    try:
        statm_line = rline1('/proc/{}/statm'.format(pid))
        first_field = statm_line.partition(' ')[0]
        return int(first_field) * SC_PAGESIZE
    except (IndexError, FileNotFoundError, ProcessLookupError):
        return None
def signal_handler(signum, frame):
    """Handle a termination signal: report statistics and exit.

    signum: number of the received signal (looked up in sig_dict).
    frame: current stack frame (unused).

    Every signal in sig_list is first re-bound to signal_handler_inner,
    so signals arriving during shutdown are only logged.
    """
    for sig in sig_list:
        signal(sig, signal_handler_inner)

    log('Got the {} signal '.format(sig_dict[signum]))
    fd['mi'].close()
    print_stat_dict()

    wall_now = monotonic()
    cpu_now = process_time()
    cpu_used = cpu_now - pt0
    # Share of elapsed monitoring time spent on the CPU, in percent.
    cpu_share = (cpu_now - pt0) / (wall_now - m0) * 100
    log('CPU time since monitoring has started: {} ({}%); exit.'.format(
        format_time(cpu_used), round(cpu_share, 3)))
    exit()
def signal_handler_inner(signum, frame):
    """Log a signal received while shutdown is already in progress.

    signum: number of the received signal (looked up in sig_dict).
    frame: current stack frame (unused).
    """
    signal_name = sig_dict[signum]
    log('Got the {} signal (ignored) '.format(signal_name))
def write(path, string):
    """Overwrite the file at path with string.

    path: file to open in text write mode (created or truncated).
    string: text to store.
    """
    with open(path, 'w') as target:
        target.write(string)
def valid_re(reg_exp):
    """Validate a regular expression taken from the config.

    reg_exp: pattern to check; it is compiled by searching an empty
        string.
    Returns None for a valid pattern. An invalid pattern is logged and
    the process exits with status 1.
    """
    try:
        search(reg_exp, '')
    except invalid_re:
        message = 'Invalid config: invalid regexp: {}'.format(reg_exp)
        log(message)
        exit(1)
    else:
        return None
def func_print_proc_table():
    """Call find_victim() with process-table printing enabled, then exit."""
    find_victim(True)
    exit()
def log(*msg):
    """Print the message and mirror it to the separate log if enabled.

    msg: values passed unchanged to print() and, when separate_log is
        set, to logging.info().
    """
    print(*msg)
    if not separate_log:
        return
    logging.info(*msg)
def print_version():
    """Print the installed nohang version and exit.

    The version is the first line of /usr/local/etc/nohang/version when
    that file exists, otherwise of /etc/nohang/version. When the second
    file is missing, 'nohang unknown version' is printed instead.
    """
    local_version_file = '/usr/local/etc/nohang/version'
    if os.path.exists(local_version_file):
        version = rline1(local_version_file)
    else:
        try:
            version = rline1('/etc/nohang/version')
        except FileNotFoundError:
            version = None

    print('nohang unknown version' if version is None
          else 'nohang ' + version)
    exit()
def psi_file_mem_to_metrics(psi_path):
    """Read the avg10/avg60/avg300 values from a PSI file.

    psi_path: path of a file whose first two lines are space-separated
        'name=value' fields (the first line is treated as 'some', the
        second as 'full').

    Returns a 6-tuple of strings:
    (some_avg10, some_avg60, some_avg300,
     full_avg10, full_avg60, full_avg300).
    """
    with open(psi_path) as psi_file:
        rows = psi_file.readlines()

    some_fields = rows[0].split(' ')
    full_fields = rows[1].split(' ')
    # Fields 1..3 of each row hold avg10, avg60 and avg300.
    values = [
        fields[position].split('=')[1]
        for fields in (some_fields, full_fields)
        for position in (1, 2, 3)
    ]
    return tuple(values)
def pid_to_cgroup_v1(pid):
    """Return the cgroup v1 path of a process.

    pid: str pid of the process.
    The line number cgroup_v1_index of /proc/[pid]/cgroup is used: the
    text after its first '/' (without the trailing newline), prefixed
    with '/'. Returns '' when there is no such line or no such file.
    """
    found = ''
    try:
        with open('/proc/' + pid + '/cgroup') as cgroup_file:
            for position, row in enumerate(cgroup_file):
                if position == cgroup_v1_index:
                    found = '/' + row.partition('/')[2][:-1]
    except FileNotFoundError:
        return ''
    return found
def pid_to_cgroup_v2(pid):
    """Return the cgroup v2 path of a process.

    pid: str pid of the process.
    The line number cgroup_v2_index of /proc/[pid]/cgroup is used,
    without its first three characters and the trailing newline.
    Returns '' when there is no such line or no such file.
    """
    found = ''
    try:
        with open('/proc/' + pid + '/cgroup') as cgroup_file:
            for position, row in enumerate(cgroup_file):
                if position == cgroup_v2_index:
                    found = row[3:-1]
    except FileNotFoundError:
        return ''
    return found
def pid_to_starttime(pid):
    """Return the start time of a process divided by SC_CLK_TCK.

    pid: str pid of the process.
    The value is the space-separated field at index 20 of the text after
    the last ')' in /proc/[pid]/stat.

    TODO: a missing file is not handled here (FileNotFoundError
    propagates to the caller).
    """
    stat_path = '/proc/' + pid + '/stat'
    try:
        stat_text = rline1(stat_path)
    except UnicodeDecodeError:
        # Re-read as bytes and drop undecodable characters.
        with open(stat_path, 'rb') as stat_file:
            stat_text = stat_file.read().decode('utf-8', 'ignore')
    ticks = stat_text.rpartition(')')[2].split(' ')[20]
    return float(ticks) / SC_CLK_TCK
def pid_to_nssid(pid):
    """Return the field at index 4 after the last ')' in /proc/[pid]/stat.

    pid: str pid of the process.
    The value is returned as a string (presumably the session ID;
    confirm against proc(5)).

    TODO: a missing file is not handled here (FileNotFoundError
    propagates to the caller).
    """
    stat_path = '/proc/' + pid + '/stat'
    try:
        stat_text = rline1(stat_path)
    except UnicodeDecodeError:
        # Re-read as bytes and drop undecodable characters.
        with open(stat_path, 'rb') as stat_file:
            stat_text = stat_file.read().decode('utf-8', 'ignore')
    return stat_text.rpartition(')')[2].split(' ')[4]
def get_victim_id(pid):
    """Build an identifier of the form '<starttime>_pid<pid>'.

    pid: str pid of the process.
    The start time is the field at index 20 after the last ')' in
    /proc/[pid]/stat. Returns '' when the process no longer exists.
    """
    try:
        stat_line = rline1('/proc/' + pid + '/stat')
        start_field = stat_line.rpartition(')')[2].split(' ')[20]
        return start_field + '_pid' + pid
    except (FileNotFoundError, ProcessLookupError):
        return ''
def pid_to_state(pid):
    """Return the one-letter state of a process from /proc/[pid]/stat.

    pid: str pid of the process.
    Only the first 40 bytes are read at first; when they do not reach
    the state letter the whole file is read instead.
    Returns '' when the process no longer exists.
    """
    stat_path = '/proc/' + pid + '/stat'

    def state_letter(raw):
        # The state is the character following ') ' after the name.
        return raw.decode('utf-8', 'ignore').rpartition(')')[2][1]

    try:
        with open(stat_path, 'rb') as stat_file:
            return state_letter(stat_file.read(40))
    except (FileNotFoundError, ProcessLookupError):
        return ''
    except IndexError:
        with open(stat_path, 'rb') as stat_file:
            return state_letter(stat_file.read())
def pid_to_name(pid):
    """Return the content of /proc/[pid]/comm without its last character.

    pid: pid of the process.
    Undecodable bytes are dropped. Returns '' when the process no longer
    exists.
    """
    comm_path = '/proc/{}/comm'.format(pid)
    try:
        with open(comm_path, 'rb', buffering=0) as comm_file:
            raw_name = comm_file.read()
    except (FileNotFoundError, ProcessLookupError):
        return ''
    # [:-1] strips the trailing newline.
    return raw_name.decode('utf-8', 'ignore')[:-1]
def pid_to_ppid(pid):
    """Return the parent pid of a process as a string.

    pid: str pid of the process.
    The value is the second tab-separated field of line number
    ppid_index in /proc/[pid]/status. Returns '' when the process no
    longer exists and None when the file has no such line.
    """
    status_path = '/proc/' + pid + '/status'
    try:
        with open(status_path) as f:
            for n, line in enumerate(f):
                # Compare by value: `is` on ints only works by accident of
                # CPython's small-integer cache.
                if n == ppid_index:
                    return line.split('\t')[1].strip()
    except (FileNotFoundError, ProcessLookupError):
        return ''
    except UnicodeDecodeError:
        # Re-read as bytes and drop undecodable characters. The process
        # may have exited in the meantime.
        try:
            with open(status_path, 'rb') as f:
                f_list = f.read().decode('utf-8', 'ignore').split('\n')
        except (FileNotFoundError, ProcessLookupError):
            return ''
        for n, line in enumerate(f_list):
            if n == ppid_index:
                return line.split('\t')[1].strip()
def pid_to_ancestry(pid, max_victim_ancestry_depth=1):
    """Describe the ancestors of a process for the victim report.

    pid: str pid of the process.
    max_victim_ancestry_depth: number of ancestors to walk up.
        0 returns ''; 1 returns a single '\\n PPID: ...' line; other
        values return a '\\n ancestry: ...' chain joined with ' <= ',
        which stops early once PID 1 is reached.
    """
    if max_victim_ancestry_depth == 1:
        parent = pid_to_ppid(pid)
        parent_name = pid_to_name(parent)
        return '\n PPID: {} ({})'.format(parent, parent_name)
    if max_victim_ancestry_depth == 0:
        return ''

    links = []
    current = pid
    for _ in range(max_victim_ancestry_depth):
        parent = pid_to_ppid(current)
        parent_name = pid_to_name(parent)
        links.append('PID {} ({})'.format(parent, parent_name))
        if parent == '1':
            break
        current = parent
    return '\n ancestry: ' + ' <= '.join(links)
def pid_to_cmdline(pid):
    """Get process cmdline by pid.

    pid: str pid of required process
    returns the content of /proc/[pid]/cmdline with NUL separators
    replaced by spaces and trailing whitespace removed, or '' when the
    process no longer exists
    """
    try:
        with open('/proc/' + pid + '/cmdline', 'rb') as cmdline_file:
            raw = cmdline_file.read()
    except (FileNotFoundError, ProcessLookupError):
        return ''
    text = raw.decode('utf-8', 'ignore')
    return text.replace('\x00', ' ').rstrip()
def pid_to_environ(pid):
    """Get process environ by pid.

    pid: str pid of required process
    returns the content of /proc/[pid]/environ with NUL separators
    replaced by spaces and trailing whitespace removed, or '' when the
    process no longer exists
    """
    try:
        with open('/proc/' + pid + '/environ', 'rb') as environ_file:
            raw = environ_file.read()
    except (FileNotFoundError, ProcessLookupError):
        return ''
    text = raw.decode('utf-8', 'ignore')
    return text.replace('\x00', ' ').rstrip()
def pid_to_realpath(pid):
    """Resolve /proc/[pid]/exe with os.path.realpath().

    pid: pid of the process.
    Returns '' when resolution fails because the process is gone or
    access is denied.
    """
    exe_link = '/proc/{}/exe'.format(pid)
    try:
        return os.path.realpath(exe_link)
    except (FileNotFoundError, ProcessLookupError, PermissionError):
        return ''
def pid_to_cwd(pid):
    """Resolve /proc/[pid]/cwd with os.path.realpath().

    pid: pid of the process.
    Returns '' when resolution fails because the process is gone or
    access is denied.
    """
    cwd_link = '/proc/{}/cwd'.format(pid)
    try:
        return os.path.realpath(cwd_link)
    except (FileNotFoundError, ProcessLookupError, PermissionError):
        return ''
def pid_to_uid(pid):
    """return euid

    pid: pid of the process.
    The value is the third tab-separated field of line number uid_index
    in /proc/[pid]/status, as a string. Returns '' when the process no
    longer exists.
    """
    status_path = '/proc/{}/status'.format(pid)
    try:
        with open(status_path, 'rb', buffering=0) as status_file:
            rows = status_file.read().decode('utf-8', 'ignore').split('\n')
            uid_row = rows[uid_index]
            return uid_row.split('\t')[2]
    except (FileNotFoundError, ProcessLookupError):
        return ''
def pid_to_badness(pid, oom_score):
"""Find and modify badness (if it needs).

pid: str pid of the process.
oom_score: already known oom_score, or None to read it with
pid_to_oom_score().

Returns (badness, oom_score). badness starts as oom_score and is then
adjusted by the configured rules; a negative result is clamped to 0.
Returns (None, None) when the process disappears while being examined.

NOTE(review): the indentation of this file was lost, so the nesting of
the early `oom_score == 0` return and of the `re_match_*` sections
relative to `regex_matching` must be confirmed against upstream.
"""
# oom_score_adj is read lazily, only when a rule needs it.
oom_score_adj = None
try:
if oom_score is None:
oom_score = pid_to_oom_score(pid)
if oom_score == 0:
return oom_score, oom_score
badness = oom_score
# Optionally subtract a positive oom_score_adj from the badness.
if ignore_positive_oom_score_adj:
oom_score_adj = pid_to_oom_score_adj(pid)
if oom_score_adj > 0:
badness = badness - oom_score_adj
# Each section below follows the same pattern: read one attribute of the
# process, and for every (badness_adj, regexp) rule whose regexp matches,
# add badness_adj. Zero and negative adjustments are always applied;
# positive ones only when the process's oom_score_adj is >= 0.
# Rules matched against the process name.
if regex_matching:
name = pid_to_name(pid)
for re_tup in badness_adj_re_name_list:
if search(re_tup[1], name) is not None:
badness_adj = int(re_tup[0])
if badness_adj <= 0:
badness += badness_adj
else:
if oom_score_adj is None:
oom_score_adj = pid_to_oom_score_adj(pid)
if oom_score_adj >= 0:
badness += badness_adj
# Rules matched against the cgroup v1 line.
if re_match_cgroup_v1:
cgroup_v1 = pid_to_cgroup_v1(pid)
for re_tup in badness_adj_re_cgroup_v1_list:
if search(re_tup[1], cgroup_v1) is not None:
badness_adj = int(re_tup[0])
if badness_adj <= 0:
badness += badness_adj
else:
if oom_score_adj is None:
oom_score_adj = pid_to_oom_score_adj(pid)
if oom_score_adj >= 0:
badness += badness_adj
# Rules matched against the cgroup v2 line.
if re_match_cgroup_v2:
cgroup_v2 = pid_to_cgroup_v2(pid)
for re_tup in badness_adj_re_cgroup_v2_list:
if search(re_tup[1], cgroup_v2) is not None:
badness_adj = int(re_tup[0])
if badness_adj <= 0:
badness += badness_adj
else:
if oom_score_adj is None:
oom_score_adj = pid_to_oom_score_adj(pid)
if oom_score_adj >= 0:
badness += badness_adj
# Rules matched against the resolved executable path.
if re_match_realpath:
realpath = pid_to_realpath(pid)
for re_tup in badness_adj_re_realpath_list:
if search(re_tup[1], realpath) is not None:
badness_adj = int(re_tup[0])
if badness_adj <= 0:
badness += badness_adj
else:
if oom_score_adj is None:
oom_score_adj = pid_to_oom_score_adj(pid)
if oom_score_adj >= 0:
badness += badness_adj
# Rules matched against the current working directory.
if re_match_cwd:
cwd = pid_to_cwd(pid)
for re_tup in badness_adj_re_cwd_list:
if search(re_tup[1], cwd) is not None:
badness_adj = int(re_tup[0])
if badness_adj <= 0:
badness += badness_adj
else:
if oom_score_adj is None:
oom_score_adj = pid_to_oom_score_adj(pid)
if oom_score_adj >= 0:
badness += badness_adj
# Rules matched against the command line.
if re_match_cmdline:
cmdline = pid_to_cmdline(pid)
for re_tup in badness_adj_re_cmdline_list:
if search(re_tup[1], cmdline) is not None:
badness_adj = int(re_tup[0])
if badness_adj <= 0:
badness += badness_adj
else:
if oom_score_adj is None:
oom_score_adj = pid_to_oom_score_adj(pid)
if oom_score_adj >= 0:
badness += badness_adj
# Rules matched against the environment.
if re_match_environ:
environ = pid_to_environ(pid)
for re_tup in badness_adj_re_environ_list:
if search(re_tup[1], environ) is not None:
badness_adj = int(re_tup[0])
if badness_adj <= 0:
badness += badness_adj
else:
if oom_score_adj is None:
oom_score_adj = pid_to_oom_score_adj(pid)
if oom_score_adj >= 0:
badness += badness_adj
# Rules matched against the effective UID.
if re_match_uid:
uid = pid_to_uid(pid)
for re_tup in badness_adj_re_uid_list:
if search(re_tup[1], uid) is not None:
badness_adj = int(re_tup[0])
if badness_adj <= 0:
badness += badness_adj
else:
if oom_score_adj is None:
oom_score_adj = pid_to_oom_score_adj(pid)
if oom_score_adj >= 0:
badness += badness_adj
# Badness never goes below zero.
if badness < 0:
badness = 0
return badness, oom_score
except FileNotFoundError:
return None, None
except ProcessLookupError:
return None, None
def pid_to_status(pid):
    """Read selected fields of /proc/[pid]/status.

    pid: pid of the process.

    Returns the tuple
    (name, state, ppid, uid, vm_size, vm_rss, vm_swap), where the three
    memory values are converted with kib_to_mib(). The rows are picked
    by the precomputed indexes state_index, ppid_index, uid_index,
    vm_size_index, vm_rss_index and vm_swap_index.
    Returns None when the process no longer exists or a memory row does
    not hold a number.
    """
    try:
        with open('/proc/{}/status'.format(pid), 'rb', buffering=0) as f:
            f_list = f.read().decode('utf-8', 'ignore').split('\n')
        for i, row in enumerate(f_list):
            # Indexes are compared by value: `is` on ints only works by
            # accident of CPython's small-integer cache.
            if i == 0:
                name = row.split('\t')[1]
            if i == state_index:
                state = row.split('\t')[1][0]
            if i == ppid_index:
                ppid = row.split('\t')[1]
            if i == uid_index:
                uid = row.split('\t')[2]
            # [:-3] drops the trailing ' kB' unit.
            if i == vm_size_index:
                vm_size = kib_to_mib(int(row.split('\t')[1][:-3]))
            if i == vm_rss_index:
                vm_rss = kib_to_mib(int(row.split('\t')[1][:-3]))
            if i == vm_swap_index:
                vm_swap = kib_to_mib(int(row.split('\t')[1][:-3]))
        return name, state, ppid, uid, vm_size, vm_rss, vm_swap
    except (FileNotFoundError, ProcessLookupError, ValueError):
        return None
def uptime():
    """Return the first number of /proc/uptime as a float."""
    first_field = rline1('/proc/uptime').split(' ')[0]
    return float(first_field)
def errprint(*text):
    """Print to stderr and mirror the message to the separate log.

    text: values to print; they are written to stderr immediately
        (flushed) and, when separate_log is set, also sent to the log
        as one space-joined line.

    The original body referenced an undefined name (`msg`) here, so the
    resulting NameError was swallowed and nothing was ever logged.
    """
    print(*text, file=stderr, flush=True)
    try:
        if separate_log:
            # Join like print() does, so several arguments are not
            # mistaken for %-format arguments by logging.
            logging.info(' '.join(str(part) for part in text))
    except NameError:
        # separate_log / logging may not be defined yet when this is
        # called early (e.g. while the config is still being parsed).
        pass
def mlockall():
    """Lock the process memory with libc's mlockall().

    The call is first tried with MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT
    and, when that fails, repeated without MCL_ONFAULT. When both
    attempts fail a warning with the errno of the last call is logged.
    """
    # Local import: only needed to report the failure reason.
    from ctypes import get_errno

    MCL_CURRENT = 1
    MCL_FUTURE = 2
    MCL_ONFAULT = 4
    libc = CDLL('libc.so.6', use_errno=True)

    if libc.mlockall(MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT) == 0:
        # All memory locked with MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT
        return
    if libc.mlockall(MCL_CURRENT | MCL_FUTURE) == 0:
        # All memory locked with MCL_CURRENT | MCL_FUTURE
        return
    # The return value is just -1; the reason is in errno, which ctypes
    # captures because the library was loaded with use_errno=True.
    log('WARNING: cannot lock all memory: [Errno {}]'.format(get_errno()))
def update_stat_dict(key):
    """Increment the counter stored under key in stat_dict.

    key: statistics key; None is ignored. A key seen for the first time
        starts at 1.
    """
    if key is None:
        return
    stat_dict[key] = stat_dict.get(key, 0) + 1
def print_stat_dict():
    """Log the collected statistics, if print_statistics is enabled.

    With an empty stat_dict a single 'No corrective actions' line is
    logged; otherwise one message listing every key with its counter.
    The reported period is the time elapsed since start_time.
    """
    if not print_statistics:
        return

    if len(stat_dict) == 0:
        log('No corrective actions applied in the last {}'.format(
            format_time(monotonic() - start_time)))
        return

    header = 'What happened in the last {}:'.format(
        format_time(monotonic() - start_time))
    rows = ''.join(
        '\n {}: {}'.format(key, stat_dict[key]) for key in stat_dict)
    log(header + rows)
def find_psi_metrics_value(psi_path, psi_metrics):
    """Read one PSI metric as a float.

    psi_path: path of the PSI file.
    psi_metrics: one of some_avg10, some_avg60, some_avg300, full_avg10,
        full_avg60, full_avg300.

    'some_*' values come from the first line of the file, 'full_*'
    values from the second. Returns None when psi_support is false or
    the metric name is not one of the six above.
    """
    if not psi_support:
        return None

    # metric name -> (row of the file, field within the row)
    layout = {
        'some_avg10': (0, 1),
        'some_avg60': (0, 2),
        'some_avg300': (0, 3),
        'full_avg10': (1, 1),
        'full_avg60': (1, 2),
        'full_avg300': (1, 3),
    }
    if psi_metrics not in layout:
        return None

    row_number, field_number = layout[psi_metrics]
    if row_number == 0:
        row = rline1(psi_path)
    else:
        with open(psi_path) as psi_file:
            row = psi_file.readlines()[1]
    return float(row.split(' ')[field_number].split('=')[1])
def check_mem_and_swap0():
    """Read MemAvailable, SwapTotal and SwapFree from the open meminfo fd.

    The file object fd['mi'] is rewound and re-read on every call.
    Returns (mem_available, swap_total, swap_free) as ints, taken from
    the rows at mem_available_index, swap_total_index and
    swap_free_index.
    """
    meminfo_file = fd['mi']
    meminfo_file.seek(0)
    # Splitting on ' kB\n' leaves 'Name:   value' in each element.
    rows = meminfo_file.read().decode().split(' kB\n')
    mem_available = int(rows[mem_available_index].split(':')[1])
    swap_total = int(rows[swap_total_index].split(':')[1])
    swap_free = int(rows[swap_free_index].split(':')[1])
    return (mem_available, swap_total, swap_free)
def check_mem_and_swap():
    """Read MemAvailable, SwapTotal and SwapFree from the open meminfo fd.

    The file object fd['mi'] is rewound and re-read on every call. When
    ZFS is set, the value returned by arcstats() is added to the
    available memory.
    Returns (mem_available, swap_total, swap_free).
    """
    meminfo_file = fd['mi']
    meminfo_file.seek(0)
    # Splitting on ' kB\n' leaves 'Name:   value' in each element.
    rows = meminfo_file.read().decode().split(' kB\n')
    available = int(rows[mem_available_index].split(':')[1])
    swap_total_kib = int(rows[swap_total_index].split(':')[1])
    swap_free_kib = int(rows[swap_free_index].split(':')[1])
    if ZFS:
        available = available + arcstats()
    return available, swap_total_kib, swap_free_kib
def meminfo():
    """Return a dict describing current memory and swap usage.

    The file object fd['mi'] is rewound and re-read on every call.

    Keys: total, used, free, available, shared, buffers, cache,
    swap_total, swap_used, swap_free. 'used' is
    mem_total - free - buffers - cached - sreclaimable and 'cache' is
    cached + sreclaimable.

    When ZFS is set, the value returned by arcstats() is added to
    'available', as check_mem_and_swap() does. The original computed
    this sum only after storing 'available', so it was discarded.
    """
    fd['mi'].seek(0)
    m_list = fd['mi'].read().decode().split(' kB\n')

    def field(index):
        # Each element looks like 'Name:   value'.
        return int(m_list[index].split(':')[1])

    mem_available = field(mem_available_index)
    mem_free = field(mem_free_index)
    swap_total = field(swap_total_index)
    swap_free = field(swap_free_index)
    buffers = field(buffers_index)
    cached = field(cached_index)
    sreclaimable = field(sreclaimable_index)
    shmem = field(shmem_index)

    if ZFS:
        # Apply the ARC adjustment before the value is stored.
        mem_available += arcstats()

    md = dict()
    md['total'] = mem_total
    md['used'] = mem_total - mem_free - buffers - cached - sreclaimable
    md['free'] = mem_free
    md['available'] = mem_available
    md['shared'] = shmem
    md['buffers'] = buffers
    md['cache'] = cached + sreclaimable
    md['swap_total'] = swap_total
    md['swap_used'] = swap_total - swap_free
    md['swap_free'] = swap_free
    return md
def memory_pressure():
    """Read the avg10/avg60/avg300 values from /proc/pressure/memory.

    Returns a 6-tuple of strings:
    (some_avg10, some_avg60, some_avg300,
     full_avg10, full_avg60, full_avg300).
    """
    # Same parsing as psi_file_mem_to_metrics(), with a fixed path.
    return psi_file_mem_to_metrics('/proc/pressure/memory')
def check_zram():
    """Find MemUsedZram (mem_used_total).

    Returns 0 when /sys/block/zram0/mem_limit does not exist. Otherwise
    every entry of /sys/block is probed and the values found are summed
    and divided by 1024: the third field of mm_stat when
    /sys/block/zram0/mm_stat exists, else the content of mem_used_total.
    Devices without the probed file are skipped.
    """
    if not os.path.exists('/sys/block/zram0/mem_limit'):
        return 0

    def from_mm_stat(raw):
        return int(raw.decode().split()[2])

    def from_mem_used_total(raw):
        return int(raw)

    if os.path.exists('/sys/block/zram0/mm_stat'):
        path_template = '/sys/block/{}/mm_stat'
        parse = from_mm_stat
    else:
        path_template = '/sys/block/{}/mem_used_total'
        parse = from_mem_used_total

    total = 0
    for dev in os.listdir('/sys/block'):
        try:
            with open(path_template.format(dev), 'rb', buffering=0) as f:
                total += parse(f.read())
        except FileNotFoundError:
            continue
    return total / 1024
def format_time(t):
    """
    Format a duration given in seconds as a short human-readable string.

    t: number of seconds (truncated to int)
    returns 'Ns' below one minute, 'Mmin Ss' below one hour and
    'Hh Mmin Ss' from one hour (3600 s inclusive) upwards.
    """
    t = int(t)
    if t < 60:
        return '{}s'.format(t)
    m, s = divmod(t, 60)
    # >= so that exactly one hour is printed as '1h 0min 0s', not '60min 0s'
    if t >= 3600:
        h, m = divmod(m, 60)
        return '{}h {}min {}s'.format(h, m, s)
    return '{}min {}s'.format(m, s)
def string_to_float_convert_test(string):
    """Return string converted to float, or None if it is not a float."""
    try:
        value = float(string)
    except ValueError:
        value = None
    return value
def string_to_int_convert_test(string):
    """Return string converted to int, or None if it is not an integer."""
    try:
        value = int(string)
    except ValueError:
        value = None
    return value
def conf_parse_string(param):
    """
    Fetch a string option from config_dict.

    param: config_dict key
    returns the stored value with surrounding whitespace stripped;
    a missing key is reported through missing_config_key()
    """
    if param not in config_dict:
        missing_config_key(param)
        return None
    return config_dict[param].strip()
def conf_parse_bool(param):
    """
    Fetch a boolean option from config_dict.

    param: config_dict key
    returns True for the exact value 'True', False for 'False';
    any other value is reported through invalid_config_key_value(),
    a missing key through missing_config_key()
    """
    if param not in config_dict:
        missing_config_key(param)
        return None
    value = {'True': True, 'False': False}.get(config_dict[param])
    if value is None:
        invalid_config_key_value(param)
    return value
def rline1(path):
    """
    Read the first line of the file at path.

    In text mode the line is returned with trailing whitespace removed
    (None for an empty file). If the file cannot be decoded, the first 999
    bytes are decoded as UTF-8 with undecodable bytes dropped and the text
    before the first newline is returned.
    """
    try:
        with open(path) as f:
            first = next(f, None)
            return None if first is None else first.rstrip()
    except UnicodeDecodeError:
        with open(path, 'rb') as f:
            head = f.read(999).decode('utf-8', 'ignore')
        return head.partition('\n')[0]
def kib_to_mib(num):
    """Return num (KiB) expressed in MiB, rounded to an integer."""
    mib = num / 1024.0
    return round(mib)
def percent(num):
    """Return the fraction num as a percentage rounded to one decimal."""
    as_percent = num * 100
    return round(as_percent, 1)
def just_percent_mem(num):
    """Return fraction num as a percent string right-aligned to width 4."""
    text = str(round(num * 100, 1))
    return '%4s' % text
def just_percent_swap(num):
    """Return fraction num as a percent string right-aligned to width 5."""
    text = str(round(num * 100, 1))
    return '%5s' % text
def human(num, lenth):
    """Return num (KiB) as a rounded MiB string right-aligned to lenth."""
    mib_text = str(round(num / 1024))
    return mib_text.rjust(lenth)
def is_alive(pid):
    """
    Check whether the process still holds resident memory.

    pid: PID as str (or anything usable in the /proc/<pid>/statm path)
    returns True if the second field of /proc/<pid>/statm is not '0',
    False otherwise, including the cases when the file cannot be read.
    """
    try:
        with open('/proc/{}/statm'.format(pid), 'rb', buffering=0) as f:
            rss = f.read().decode().split(' ')[1]
    except (FileNotFoundError, ProcessLookupError, NotADirectoryError,
            PermissionError):
        return False
    # Always return a real bool (an implicit None was returned before when
    # rss was '0').
    return rss != '0'
def alive_pid_list():
    """
    List the PIDs of processes that is_alive() reports as alive.

    Only /proc entries whose name starts with a decimal digit are
    considered. The daemon itself (self_pid) and PID 1 are never included.

    returns list of PID strings
    """
    # Filtering self and init up front avoids list.remove(), which raised
    # ValueError when self_pid was not in the list.
    excluded = (self_pid, '1')
    return [
        pid for pid in os.listdir('/proc')
        if pid[0].isdecimal() and pid not in excluded and is_alive(pid)
    ]
def pid_to_oom_score(pid):
    """Return oom_score of the process as int, or 0 if it is unavailable."""
    path = '/proc/{}/oom_score'.format(pid)
    try:
        with open(path, 'rb', buffering=0) as f:
            raw = f.read()
    except (FileNotFoundError, ProcessLookupError, NotADirectoryError):
        # the process is gone or the /proc entry is not a process directory
        return 0
    return int(raw)
def pid_to_oom_score_adj(pid):
    """Return oom_score_adj of the process as int, or 0 if unavailable."""
    path = '/proc/{}/oom_score_adj'.format(pid)
    try:
        with open(path, 'rb', buffering=0) as f:
            raw = f.read()
    except (FileNotFoundError, ProcessLookupError, NotADirectoryError):
        # the process is gone or the /proc entry is not a process directory
        return 0
    return int(raw)
def badness_pid_list():
    """
    Build the list of kill candidates with their badness.

    Walks the numeric entries of /proc, skipping the daemon itself
    (self_pid) and PID 1, and keeps processes whose oom_score is >= 1.

    returns list of (pid, badness) tuples; pid is a str
    """
    pid_b_list = []
    for pid in os.listdir('/proc'):
        # Filter cheap things first: no need to open oom_score for
        # non-process entries, for init or for ourselves.
        if not pid[0].isdecimal():
            continue
        if pid == self_pid or pid == '1':
            continue
        o = pid_to_oom_score(pid)
        if o < 1:
            continue
        b = pid_to_badness(pid, o)[0]
        if b is None:
            # find_victim() treats a None badness as "skip this process";
            # do the same here so that the result stays sortable.
            continue
        pid_b_list.append((pid, b))
    return pid_b_list
def fast_find_victim():
    """
    Find the process with the highest badness without printing the full
    task table.

    Logs how many candidates were found and a TOP list (at most 15 rows).

    returns (pid, victim_badness, victim_name, victim_id),
    or None when badness_pid_list() is empty
    """
    started = monotonic()
    candidates = badness_pid_list()
    real_proc_num = len(candidates)
    log('Found {} tasks with non-zero oom_score (except init and self) '
        'in {}ms'.format(real_proc_num, round((monotonic() - started) * 1000)))
    if real_proc_num == 0:
        return None

    # (pid, badness) tuples, highest badness first
    ranked = sorted(candidates, key=itemgetter(1), reverse=True)

    m0 = monotonic()
    top_n = min(15, real_proc_num)
    log('TOP-{} tasks by badness:'.format(top_n))
    log(' Name PID badness')
    log(' --------------- ------- -------')
    for p, badness in ranked[0:top_n]:
        b = str(badness)
        n = pid_to_name(p)
        log(' {} {} {}'.format(n.ljust(15), p.rjust(7), b.rjust(7)))

    pid, victim_badness = ranked[0][0], ranked[0][1]
    victim_id = get_victim_id(pid)
    victim_name = pid_to_name(pid)
    log('TOP printed in {}ms; process with highest badness:\n PID: {}, na'
        'me: {}, badness: {}'.format(
            round((monotonic() - m0) * 1000),
            pid,
            victim_name,
            victim_badness
        ))
    return pid, victim_badness, victim_name, victim_id
def find_victim(_print_proc_table):
    """
    Find the process with the highest badness.

    _print_proc_table: when false the work is delegated to
        fast_find_victim(); when true a table with one row per examined
        process is logged, optionally with one extra column selected by
        the extra_table_info setting.

    returns (pid, victim_badness, victim_name, victim_id), or None when
    no candidate process was found
    """
    if not _print_proc_table:
        return fast_find_victim()

    ft1 = monotonic()
    pid_list = alive_pid_list()
    pid_badness_list = []

    # extra_table_info value -> (column title, function producing the cell).
    # Unknown values (including 'None') give an empty column.
    extra_columns = {
        'cgroup_v1': ('CGroup_v1', pid_to_cgroup_v1),
        'cgroup_v2': ('CGroup_v2', pid_to_cgroup_v2),
        'cmdline': ('cmdline', pid_to_cmdline),
        'environ': ('environ', pid_to_environ),
        'realpath': ('realpath', pid_to_realpath),
        'cwd': ('cwd', pid_to_cwd),
    }
    extra_table_title, extra_table_func = extra_columns.get(
        extra_table_info, ('', None))

    hr = '#' * 107
    log('Tasks state (memory values in mebibytes):')
    log(hr)
    log('# PID PPID badness oom_score oom_score_adj e'
        'UID S VmSize VmRSS VmSwap Name {}'.format(
            extra_table_title))
    log('#------- ------- ------- --------- ------------- -------'
        '--- - ------ ----- ------ ---------------')

    for pid in pid_list:
        badness = pid_to_badness(pid, None)[0]
        if badness is None:
            continue

        oom_score = pid_to_oom_score(pid)
        oom_score_adj = pid_to_oom_score_adj(pid)

        # Read the status once: a second call could return None if the
        # process exits between the check and the unpacking.
        status = pid_to_status(pid)
        if status is None:
            continue
        name, state, ppid, uid, vm_size, vm_rss, vm_swap = status

        if extra_table_func is None:
            extra_table_line = ''
        else:
            extra_table_line = extra_table_func(pid)

        log('#{} {} {} {} {} {} {} {} {} {} {} {}'.format(
            pid.rjust(7),
            ppid.rjust(7),
            str(badness).rjust(7),
            str(oom_score).rjust(9),
            str(oom_score_adj).rjust(13),
            uid.rjust(10),
            state,
            str(vm_size).rjust(6),
            str(vm_rss).rjust(5),
            str(vm_swap).rjust(6),
            name.ljust(15),
            extra_table_line
        ))
        pid_badness_list.append((pid, badness))

    real_proc_num = len(pid_badness_list)

    if real_proc_num == 0:
        # Nothing to choose from: behave like fast_find_victim() instead of
        # failing with IndexError below.
        log(hr)
        log('Found {} tasks with non-zero VmRSS (except init and self)'.format(
            real_proc_num))
        return None

    # First tuple with the maximum badness (same choice as a stable
    # descending sort followed by [0]).
    pid, victim_badness = max(pid_badness_list, key=itemgetter(1))
    victim_id = get_victim_id(pid)
    victim_name = pid_to_name(pid)

    log(hr)
    log('Found {} tasks with non-zero VmRSS (except init and self)'.format(
        real_proc_num))
    log(
        'Process with highest badness (found in {}ms):\n PID: {}, Na'
        'me: {}, badness: {}'.format(
            round((monotonic() - ft1) * 1000),
            pid,
            victim_name,
            victim_badness
        )
    )
    return pid, victim_badness, victim_name, victim_id
def _victim_died(reason):
    """Log and count a victim that vanished while its info was collected."""
    x = 'The victim died in the search process: {}'.format(reason)
    log(x)
    update_stat_dict(x)
    print_stat_dict()


def find_victim_info(pid, victim_badness, name):
    """
    Collect a multi-line description of the victim for the log.

    pid: victim PID (str)
    victim_badness: badness value to print
    name: victim name to print

    returns the formatted description, or None if the victim disappeared
    (or its /proc files became unparsable) while the data was collected;
    in that case the event is logged and added to the statistics
    """
    status0 = monotonic()

    def status_mib(lines, index):
        # value lines look like "VmRSS:\t  1234 kB": take the number, -> MiB
        return kib_to_mib(int(lines[index].split('\t')[1][:-3]))

    try:
        with open('/proc/{}/status'.format(pid), 'rb', buffering=0) as f:
            f_list = f.read().decode('utf-8', 'ignore').split('\n')

        # The *_index values are line numbers inside /proc/<pid>/status.
        # Indexing directly turns a too short file into IndexError, which is
        # handled below, instead of leaving variables unassigned.
        state = f_list[state_index].split('\t')[1].rstrip()
        uid = f_list[uid_index].split('\t')[2]
        vm_size = status_mib(f_list, vm_size_index)
        vm_rss = status_mib(f_list, vm_rss_index)
        if detailed_rss:
            anon_rss = status_mib(f_list, anon_index)
            file_rss = status_mib(f_list, file_index)
            shmem_rss = status_mib(f_list, shmem_index)
        vm_swap = status_mib(f_list, vm_swap_index)

        cmdline = pid_to_cmdline(pid) if print_victim_cmdline else ''
        oom_score = pid_to_oom_score(pid)
        oom_score_adj = pid_to_oom_score_adj(pid)
    except (IndexError, ValueError, FileNotFoundError,
            ProcessLookupError) as e:
        _victim_died(type(e).__name__)
        return None

    try:
        realpath = pid_to_realpath(pid)
        cwd = pid_to_cwd(pid)
        nssid = pid_to_nssid(pid)
        victim_lifetime = format_time(uptime() - pid_to_starttime(pid))
        victim_cgroup_v1 = pid_to_cgroup_v1(pid)
        victim_cgroup_v2 = pid_to_cgroup_v2(pid)
    except FileNotFoundError:
        _victim_died('FileNotFoundError')
        return None

    ancestry = pid_to_ancestry(pid, max_victim_ancestry_depth)

    c1 = '\n cmdline: ' if print_victim_cmdline else ''

    if detailed_rss:
        detailed_rss_info = ' (Anon: {}, File: {}, Shmem: {})'.format(
            anon_rss,
            file_rss,
            shmem_rss)
    else:
        detailed_rss_info = ''

    victim_info = 'Victim status (found in {}ms):' \
        '\n PID: {}, name: {}, state: {}, EUID: {}, ' \
        'SID: {} ({}), lifetime: {}' \
        '\n badness: {}, oom_score: {}, oom_score_adj: {}' \
        '\n Vm, MiB: Size: {}, RSS: {}{}, Swap: {}' \
        '\n cgroup_v1: {}' \
        '\n cgroup_v2: {}' \
        '{}{}{}' \
        '\n exe realpath: {}' \
        '\n cwd realpath: {}'.format(
            round((monotonic() - status0) * 1000),
            pid,
            name,
            state,
            uid,
            nssid, pid_to_name(nssid),
            victim_lifetime,
            victim_badness,
            oom_score,
            oom_score_adj,
            vm_size,
            vm_rss,
            detailed_rss_info,
            vm_swap,
            victim_cgroup_v1,
            victim_cgroup_v2,
            ancestry,
            c1, cmdline,
            realpath,
            cwd
        )
    return victim_info
def check_mem_swap_ex():
    """
    Check: are the MemAvailable and SwapFree thresholds exceeded?

    Returns a 7-tuple:
        (threshold, mem_info, mem_available, hard_threshold_min_swap_kb,
         soft_threshold_min_swap_kb, swap_free, swap_total)
    threshold is SIGKILL, SIGTERM, 'WARN' or None; mem_info is a message
    for the log when threshold is SIGKILL or SIGTERM, otherwise None.
    """
    mem_available, swap_total, swap_free = check_mem_and_swap()

    # Swap thresholds set in percent are relative to the current SwapTotal;
    # absolute ones come from swap_kb_dict.
    if swap_kill_is_percent:
        hard_threshold_min_swap_kb = (
            swap_total * hard_threshold_min_swap_percent / 100.0)
    else:
        hard_threshold_min_swap_kb = swap_kb_dict['hard_threshold_min_swap_kb']

    if swap_term_is_percent:
        soft_threshold_min_swap_kb = (
            swap_total * soft_threshold_min_swap_percent / 100.0)
    else:
        soft_threshold_min_swap_kb = swap_kb_dict['soft_threshold_min_swap_kb']

    if swap_warn_is_percent:
        warning_threshold_min_swap_kb = (
            swap_total * warning_threshold_min_swap_percent / 100.0)
    else:
        warning_threshold_min_swap_kb = swap_kb_dict[
            'warning_threshold_min_swap_kb']

    # Thresholds as percent of SwapTotal for the messages; '-' when the
    # threshold is not below SwapTotal. (+ 0.1 avoids division by zero.)
    swap_sigkill_pc = (
        percent(hard_threshold_min_swap_kb / (swap_total + 0.1))
        if swap_total > hard_threshold_min_swap_kb else '-')
    swap_sigterm_pc = (
        percent(soft_threshold_min_swap_kb / (swap_total + 0.1))
        if swap_total > soft_threshold_min_swap_kb else '-')

    # Common tail of every returned tuple
    tail = (mem_available, hard_threshold_min_swap_kb,
            soft_threshold_min_swap_kb, swap_free, swap_total)

    hard_exceeded = (mem_available <= hard_threshold_min_mem_kb and
                     swap_free <= hard_threshold_min_swap_kb)
    if hard_exceeded:
        mem_info = (
            'Memory status that requires corrective actions:\n Mem'
            'Available [{} MiB, {} %] <= hard_threshold_min_mem [{} MiB'
            ', {} %]\n SwapFree [{} MiB, {} %] <= hard_threshold_m'
            'in_swap [{} MiB, {} %]').format(
                kib_to_mib(mem_available),
                percent(mem_available / mem_total),
                kib_to_mib(hard_threshold_min_mem_kb),
                round(hard_threshold_min_mem_percent, 1),
                kib_to_mib(swap_free),
                percent(swap_free / (swap_total + 0.1)),
                kib_to_mib(hard_threshold_min_swap_kb),
                swap_sigkill_pc)
        return (SIGKILL, mem_info) + tail

    soft_exceeded = (mem_available <= soft_threshold_min_mem_kb and
                     swap_free <= soft_threshold_min_swap_kb)
    if soft_exceeded:
        mem_info = (
            'Memory status that requires corrective actions:\n M'
            'emAvailable [{} MiB, {} %] <= soft_threshold_min_mem [{} MiB,'
            ' {} %]\n SwapFree [{} MiB, {} %] <= soft_threshold_min_swa'
            'p [{} MiB, {} %]').format(
                kib_to_mib(mem_available),
                percent(mem_available / mem_total),
                kib_to_mib(soft_threshold_min_mem_kb),
                round(soft_threshold_min_mem_percent, 1),
                kib_to_mib(swap_free),
                percent(swap_free / (swap_total + 0.1)),
                kib_to_mib(soft_threshold_min_swap_kb),
                swap_sigterm_pc)
        return (SIGTERM, mem_info) + tail

    if (low_memory_warnings_enabled and
            mem_available <= warning_threshold_min_mem_kb and
            swap_free <= warning_threshold_min_swap_kb + 0.1):
        return ('WARN', None) + tail

    return (None, None) + tail
def check_zram_ex():
    """
    Check: are the zram thresholds exceeded?

    A zram threshold only counts when the matching MemAvailable threshold
    is exceeded too. MemAvailable is taken from the module-level
    mem_available variable, not measured here.

    Returns (threshold, mem_info, mem_used_zram): threshold is SIGKILL,
    SIGTERM, 'WARN' or None; mem_info is a message for the log when
    threshold is SIGKILL or SIGTERM, otherwise None.
    """
    mem_used_zram = check_zram()

    ma_hard_threshold_exceded = mem_available <= hard_threshold_min_mem_kb
    ma_soft_threshold_exceded = mem_available <= soft_threshold_min_mem_kb
    ma_warning_threshold_exceded = (
        mem_available <= warning_threshold_min_mem_kb)

    if (mem_used_zram >= hard_threshold_max_zram_kb and
            ma_hard_threshold_exceded):
        mem_info = 'Memory status that requires corrective actions:\n MemAv' \
            'ailable [{} MiB, {} %] <= hard_threshold_min_mem [{} MiB' \
            ', {} %]\n MemUsedZram [{} MiB, {} %] >= hard_threshold_' \
            'max_zram [{} MiB, {} %]'.format(
                kib_to_mib(mem_available),
                percent(mem_available / mem_total),
                kib_to_mib(hard_threshold_min_mem_kb),
                round(hard_threshold_min_mem_percent, 1),
                kib_to_mib(mem_used_zram),
                percent(mem_used_zram / mem_total),
                kib_to_mib(hard_threshold_max_zram_kb),
                percent(hard_threshold_max_zram_kb / mem_total))
        return SIGKILL, mem_info, mem_used_zram

    if (mem_used_zram >= soft_threshold_max_zram_kb and
            ma_soft_threshold_exceded):
        # Unit label is "MiB" like in every other message (it was "M").
        mem_info = 'Memory status that requires corrective actions:\n MemA' \
            'vailable [{} MiB, {} %] <= soft_threshold_min_mem [{} M' \
            'iB, {} %]\n MemUsedZram [{} MiB, {} %] >= soft_thresho' \
            'ld_max_zram [{} MiB, {} %]'.format(
                kib_to_mib(mem_available),
                percent(mem_available / mem_total),
                kib_to_mib(soft_threshold_min_mem_kb),
                round(soft_threshold_min_mem_percent, 1),
                kib_to_mib(mem_used_zram),
                percent(mem_used_zram / mem_total),
                kib_to_mib(soft_threshold_max_zram_kb),
                percent(soft_threshold_max_zram_kb / mem_total))
        return SIGTERM, mem_info, mem_used_zram

    if low_memory_warnings_enabled:
        if (mem_used_zram >= warning_threshold_max_zram_kb and
                ma_warning_threshold_exceded):
            return 'WARN', None, mem_used_zram

    return None, None, mem_used_zram
def check_psi_ex(psi_t0, psi_kill_exceeded_timer, psi_term_exceeded_timer, x0,
                 mem_available):
    """
    Check: are the PSI thresholds exceeded for long enough?

    psi_t0: passed through unchanged
    psi_kill_exceeded_timer: seconds the hard PSI condition has been
        holding; a negative value means "not holding"
    psi_term_exceeded_timer: the same for the soft PSI condition
    x0: monotonic() timestamp of the previous measurement
    mem_available: current MemAvailable value, compared with the
        *_threshold_min_mem_kb settings

    Returns (threshold, mem_info, psi_t0, psi_kill_exceeded_timer,
    psi_term_exceeded_timer, x0): threshold is SIGKILL, SIGTERM, 'WARN' or
    None; mem_info is a message for the log when threshold is SIGKILL or
    SIGTERM, otherwise None; the timers and x0 are the updated values.
    """
    if mem_available <= hard_threshold_min_mem_kb:
        ma_hard_threshold_exceded = True
    else:
        ma_hard_threshold_exceded = False
    if mem_available <= soft_threshold_min_mem_kb:
        ma_soft_threshold_exceded = True
    else:
        ma_soft_threshold_exceded = False
    if mem_available <= warning_threshold_min_mem_kb:
        ma_warning_threshold_exceded = True
    else:
        ma_warning_threshold_exceded = False
    # Nothing to do (PSI is not even read) while no MemAvailable threshold
    # is exceeded, or when the module-level swap_total is 0.
    if not (ma_warning_threshold_exceded or ma_soft_threshold_exceded or
            ma_hard_threshold_exceded) or swap_total == 0:
        return (None, None,
                psi_t0, psi_kill_exceeded_timer,
                psi_term_exceeded_timer, x0)
    # Time passed since the previous measurement; added to the timers below
    delta0 = monotonic() - x0
    x0 = monotonic()
    psi_avg_value = find_psi_metrics_value(psi_path, psi_metrics)
    # Time since the last corrective action
    psi_post_action_delay_timer = monotonic() - last_action_dict['t']  # psi_t0
    if psi_post_action_delay_timer >= psi_post_action_delay:
        psi_post_action_delay_exceeded = True
    else:
        psi_post_action_delay_exceeded = False
    # Hard timer: starts at 0 when both conditions begin to hold, grows by
    # delta0 while they keep holding, is reset to a negative value otherwise.
    if psi_avg_value >= hard_threshold_max_psi:
        sigkill_psi_exceeded = True
        if ma_hard_threshold_exceded:
            if psi_kill_exceeded_timer < 0:
                psi_kill_exceeded_timer = 0
            else:
                psi_kill_exceeded_timer += delta0
        else:
            psi_kill_exceeded_timer = -0.0001
    else:
        sigkill_psi_exceeded = False
        psi_kill_exceeded_timer = -0.0001
    if debug_psi:
        log('-------------------------------------------------------------'
            '-----------')
        log('psi_post_action_delay_timer: {}, psi_post_action_delay_exceed'
            'ed: {}'.format(
                round(psi_post_action_delay_timer, 1),
                psi_post_action_delay_exceeded))
        log('mem_avail_hard_threshold_exceded: {}, hard_threshold_psi_exce'
            'eded: {}, hard_psi_excess_duration: {}'.format(
                ma_hard_threshold_exceded,
                sigkill_psi_exceeded,
                round(psi_kill_exceeded_timer, 1)
            ))
    if (sigkill_psi_exceeded and psi_kill_exceeded_timer >=
            psi_excess_duration and psi_post_action_delay_exceeded and
            ma_hard_threshold_exceded):
        mem_info = 'Memory status that requires corrective actions:\n MemAv' \
            'ailable [{} MiB, {} %] <= hard_threshold_min_mem [{} MiB' \
            ', {} %]\n PSI avg value ({}) >= hard_threshold_max_psi ' \
            '({})\n PSI avg value exceeded psi_excess_duration (valu' \
            'e={}s) for {}s'.format(
                kib_to_mib(mem_available),
                percent(mem_available / mem_total),
                kib_to_mib(hard_threshold_min_mem_kb),
                round(hard_threshold_min_mem_percent, 1),
                psi_avg_value,
                hard_threshold_max_psi,
                psi_excess_duration,
                round(psi_kill_exceeded_timer, 1)
            )
        return (SIGKILL, mem_info, psi_t0, psi_kill_exceeded_timer,
                psi_term_exceeded_timer, x0)
    # Soft timer: same rules as the hard timer, with the soft thresholds.
    # It is only updated when the hard branch above did not return.
    if psi_avg_value >= soft_threshold_max_psi:
        sigterm_psi_exceeded = True
        if ma_soft_threshold_exceded:
            if psi_term_exceeded_timer < 0:
                psi_term_exceeded_timer = 0
            else:
                psi_term_exceeded_timer += delta0
        else:
            psi_term_exceeded_timer = -0.0001
    else:
        sigterm_psi_exceeded = False
        psi_term_exceeded_timer = -0.0001
    if debug_psi:
        log('mem_avail_soft_threshold_exceded: {}, soft_threshold_psi_exce'
            'eded: {}, soft_psi_excess_duration: {}'.format(
                ma_soft_threshold_exceded,
                sigterm_psi_exceeded,
                round(psi_term_exceeded_timer, 1)
            ))
    if (sigterm_psi_exceeded and psi_term_exceeded_timer >=
            psi_excess_duration and psi_post_action_delay_exceeded and
            ma_soft_threshold_exceded):
        mem_info = 'Memory status that requires corrective actions:\n MemA' \
            'vailable [{} MiB, {} %] <= soft_threshold_min_mem [{} M' \
            'iB, {} %]\n PSI avg value ({}) >= soft_threshold_max_p' \
            'si ({})\n PSI avg value exceeded psi_excess_duration (' \
            'value={}s) for {}s'.format(
                kib_to_mib(mem_available),
                percent(mem_available / mem_total),
                kib_to_mib(soft_threshold_min_mem_kb),
                round(soft_threshold_min_mem_percent, 1),
                psi_avg_value,
                soft_threshold_max_psi,
                psi_excess_duration,
                round(psi_term_exceeded_timer, 1)
            )
        return (SIGTERM, mem_info, psi_t0, psi_kill_exceeded_timer,
                psi_term_exceeded_timer, x0)
    # The warning level has no duration requirement
    if low_memory_warnings_enabled:
        if (psi_avg_value >= warning_threshold_max_psi and
                ma_warning_threshold_exceded):
            return ('WARN', None, psi_t0, psi_kill_exceeded_timer,
                    psi_term_exceeded_timer, x0)
    return (None, None, psi_t0, psi_kill_exceeded_timer,
            psi_term_exceeded_timer, x0)
def is_victim_alive(victim_id):
    """
    Classify the current state of a victim.

    We do not have a reliable sign of the end of the release of memory:
    https://github.com/rfjakob/earlyoom/issues/128#issuecomment-507023717

    victim_id: str containing '_pid' followed by the PID
    returns
        0 - the victim is gone (its victim_id changed, or its state is
            neither 'R' nor 'Z')
        1 - is_alive() is true for the PID
        2 - not alive by is_alive(), state is 'R'
        3 - not alive by is_alive(), state is 'Z'
    """
    _starttime, pid = victim_id.split('_pid')
    if get_victim_id(pid) != victim_id:
        return 0
    if is_alive(pid):
        return 1
    return {'R': 2, 'Z': 3}.get(pid_to_state(pid), 0)
def implement_corrective_action(
        threshold,
        mem_info_list,
        psi_t0,
        psi_kill_exceeded_timer,
        psi_term_exceeded_timer,
        x0,
        psi_threshold,
        zram_threshold,
        zram_info,
        psi_info):
    """
    Pick a victim, re-check the memory state and apply a corrective action.

    Steps: forget victims from v_dict that are gone or became zombies;
    reuse a victim from v_dict whose victim_cache_time has not expired,
    otherwise search for a new one with find_victim(); run the threshold
    checks again; if a threshold is still exceeded and the victim badness
    is at least min_badness, either execute a matching soft action command
    or send the signal, then poll the victim until it dies, becomes a
    zombie or stops being waited for.

    threshold: replaced by the re-checked threshold before it is used
    mem_info_list: messages to log; rebuilt after the re-check
    psi_t0, psi_kill_exceeded_timer, psi_term_exceeded_timer, x0:
        PSI state forwarded to check_psi_ex() when CHECK_PSI is set
    psi_threshold, psi_info: used as is unless CHECK_PSI is set
    zram_threshold, zram_info: used as is unless CHECK_ZRAM is set

    Returns psi_t0.
    """
    log('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>'
        '>>>>>>>>>>>>>>')
    debug_corrective_action = True
    time0 = monotonic()
    # Collect first, pop afterwards: v_dict must not change while iterated
    nu = []
    for victim_id in v_dict:
        iva = is_victim_alive(victim_id)
        if iva == 0 or iva == 3:
            nu.append(victim_id)
    for i in nu:
        if debug_corrective_action:
            log('Remove {} from v_dict'.format(i))
        v_dict.pop(i)
    # Look for a previously handled victim that is still "cached";
    # the loop stops at the first one found.
    x = False
    cache_list = []
    for victim_id in v_dict:
        tx = v_dict[victim_id]['time']
        ddt = monotonic() - tx
        if ddt < victim_cache_time:
            if debug_corrective_action:
                log('victim_cache_time is not exceeded for {} ({} <'
                    ' {})'.format(victim_id, round(ddt, 3), victim_cache_time))
            x = True
            cache_list.append((victim_id, ddt))
            break
    if x:
        e = sorted(cache_list, key=itemgetter(1), reverse=False)
        cached_victim_id = e[0][0]
    for i in mem_info_list:
        log(i)
    if x:
        victim_id = cached_victim_id
        pid = victim_id.partition('_pid')[2]
        victim_badness = pid_to_badness(pid, None)[0]
        name = v_dict[victim_id]['name']
        log('New victim is cached victim {} ({})'.format(pid, name))
    else:
        # Snapshot /proc before and after the search: if some entry
        # disappeared meanwhile, give up for this round.
        s1 = set(os.listdir('/proc'))
        fff = find_victim(print_proc_table)
        # sleep(0.1)
        s2 = set(os.listdir('/proc'))
        dset = s1 - s2
        if len(dset) > 0:
            log('During the search for the victim, the processes were '
                'completed: {}'.format(dset))
            sleep(over_sleep)
            log('<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<'
                '<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<')
            return psi_t0
        if fff is None:
            # no victim found
            if debug_sleep:
                log('Sleep {}s'.format(over_sleep))
            sleep(over_sleep)
            log('<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<'
                '<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<')
            return psi_t0
        pid, victim_badness, name, victim_id = fff
    log('Recheck memory levels...')
    (masf_threshold, masf_info, mem_available, hard_threshold_min_swap_kb,
     soft_threshold_min_swap_kb, swap_free, swap_total) = check_mem_swap_ex()
    if CHECK_ZRAM:
        zram_threshold, zram_info, mem_used_zram = check_zram_ex()
    if CHECK_PSI:
        (psi_threshold, psi_info, psi_t0, psi_kill_exceeded_timer,
         psi_term_exceeded_timer, x0) = check_psi_ex(
            psi_t0, psi_kill_exceeded_timer, psi_term_exceeded_timer, x0,
            mem_available)
    # The strongest result of the three checks wins; the messages of every
    # check that asks for SIGKILL or SIGTERM are collected for the log.
    if (masf_threshold is SIGKILL or zram_threshold is SIGKILL or
            psi_threshold is SIGKILL):
        new_threshold = SIGKILL
        mem_info_list = []
        if masf_threshold is SIGKILL or masf_threshold is SIGTERM:
            mem_info_list.append(masf_info)
        if zram_threshold is SIGKILL or zram_threshold is SIGTERM:
            mem_info_list.append(zram_info)
        if psi_threshold is SIGKILL or psi_threshold is SIGTERM:
            mem_info_list.append(psi_info)
    elif (masf_threshold is SIGTERM or zram_threshold is SIGTERM or
          psi_threshold is SIGTERM):
        new_threshold = SIGTERM
        mem_info_list = []
        if masf_threshold is SIGKILL or masf_threshold is SIGTERM:
            mem_info_list.append(masf_info)
        if zram_threshold is SIGKILL or zram_threshold is SIGTERM:
            mem_info_list.append(zram_info)
        if psi_threshold is SIGKILL or psi_threshold is SIGTERM:
            mem_info_list.append(psi_info)
    else:
        log('Thresholds is not exceeded now')
        log('<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<'
            '<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<')
        return psi_t0
    for i in mem_info_list:
        log(i)
    # NOTE(review): new_threshold can only be SIGKILL or SIGTERM here (the
    # else branch above returns), so this check looks unreachable.
    if new_threshold is None or new_threshold == 'WARN':
        log('Thresholds is not exceeded now')
        log('<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<'
            '<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<')
        return psi_t0
    threshold = new_threshold
    vwd = None  # Victim Will Die
    # A victim that already got a soft action is either escalated to
    # SIGKILL (max_soft_exit_time passed) or left alone for now.
    if threshold is SIGTERM:
        if victim_id in v_dict:
            dt = monotonic() - v_dict[victim_id]['time']
            if dt > max_soft_exit_time:
                log('max_soft_exit_time (value={}s) is exceeded the victim:'
                    ' it will get SIGKILL'.format(
                        max_soft_exit_time))
                threshold = SIGKILL
            else:
                log('max_soft_exit_time is not exceeded ('
                    '{} < {}) for the victim'.format(round(
                        dt, 1), max_soft_exit_time))
                if debug_sleep:
                    log('Sleep {}s'.format(over_sleep))
                sleep(over_sleep)
                log('<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<'
                    '<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<')
                return psi_t0
    if victim_badness >= min_badness:
        if print_victim_status:
            victim_info = find_victim_info(pid, victim_badness, name)
            if victim_info is not None:
                log(victim_info)
            else:
                # find_victim_info() returns None when the victim vanished
                sleep(over_sleep)
                log('<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<'
                    '<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<')
                return psi_t0
        mid = meminfo()
        log('Memory info, MiB:')
        log(' total={}, used={}, free={}, available={}, shared={}, buffers'
            '={}, cache={},'.format(
                round(mem_total / 1024),
                round(mid['used'] / 1024),
                round(mid['free'] / 1024),
                round(mid['available'] / 1024),
                round(mid['shared'] / 1024),
                round(mid['buffers'] / 1024),
                round(mid['cache'] / 1024)
            ))
        log(' swap_total={}, swap_used={}, swap_free={}'.format(
            round(mid['swap_total'] / 1024),
            round(mid['swap_used'] / 1024),
            round(mid['swap_free'] / 1024)
        ))
        if psi_support:
            mp = memory_pressure()
            log('Memory pressure (system-wide):')
            log(' some avg10={} avg60={} avg300={}'.format(
                mp[0], mp[1], mp[2]
            ))
            log(' full avg10={} avg60={} avg300={}'.format(
                mp[3], mp[4], mp[5]
            ))
        # Soft actions: for SIGTERM only, the first entry of
        # soft_actions_list whose regexp matches the victim's name or
        # cgroup selects a command to run instead of sending the signal.
        soft_match = False
        if soft_actions and threshold is SIGTERM:
            name = pid_to_name(pid)
            cgroup_v1 = pid_to_cgroup_v1(pid)
            cgroup_v2 = pid_to_cgroup_v2(pid)
            # $SERVICE is the last cgroup path component when it ends with
            # '.service', otherwise an empty string
            if cgroup_v1 != '':
                cgroup_v1_tail = cgroup_v1.rpartition('/')[2]
                if cgroup_v1_tail.endswith('.service'):
                    service = cgroup_v1_tail
                else:
                    service = ''
            elif cgroup_v2 != '':
                cgroup_v2_tail = cgroup_v2.rpartition('/')[2]
                if cgroup_v2_tail.endswith('.service'):
                    service = cgroup_v2_tail
                else:
                    service = ''
            else:
                service = ''
            # each entry is (unit, regexp, command)
            for i in soft_actions_list:
                unit = i[0]
                if unit == 'name':
                    u = name
                elif unit == 'cgroup_v1':
                    u = cgroup_v1
                else:
                    u = cgroup_v2
                regexp = i[1]
                command = i[2]
                if search(regexp, u) is not None:
                    log("Regexp '{}' matches with {} '{}'".format(
                        regexp, unit, u))
                    soft_match = True
                    break
        start_action = monotonic()
        if soft_match:
            cmd = command.replace('$PID', pid).replace('$NAME', pid_to_name(
                pid)).replace('$SERVICE', service)
            preventing_oom_message = 'Implementing a corrective action:\n ' \
                'Executing the command: {}'.format(cmd)
            log(preventing_oom_message)
            err = start_thread(exe, cmd)
            if err == 1:
                key = 'Cannot execute the command in the new thread'
                update_stat_dict(key)
                log(key)
            else:
                update_stat_dict('Executing the command "{}"'.format(command))
                response_time = monotonic() - time0
                log('Total response time: {}ms'.format(round(
                    response_time * 1000)))
                print_stat_dict()
        else:
            preventing_oom_message = 'Implementing a corrective action:\n ' \
                'Sending {} to the victim'.format(
                    sig_dict[threshold])
            log(preventing_oom_message)
            # vwd becomes True when the victim is expected to disappear
            # (SIGKILL sent, or it is already gone), False when the signal
            # could not be delivered, and stays None after a sent SIGTERM.
            try:
                os.kill(int(pid), threshold)
                update_stat_dict(
                    '[ OK ] Sending {} to {}'.format(sig_dict[threshold], name)
                )
                response_time = monotonic() - time0
                send_result = 'OK; total response time: {}ms'.format(
                    round(response_time * 1000))
                log(send_result)
                if threshold is SIGKILL:
                    vwd = True
                print_stat_dict()
            except FileNotFoundError:
                vwd = True
                key = 'Cannot send a signal: FileNotFoundError'
                update_stat_dict(key)
                print_stat_dict()
                log(key)
            except ProcessLookupError:
                vwd = True
                key = 'Cannot send a signal: ProcessLookupError'
                update_stat_dict(key)
                print_stat_dict()
                log(key)
            except PermissionError:
                vwd = False
                key = 'Cannot send a signal: PermissionError'
                log(key)
                update_stat_dict(key)
                print_stat_dict()
                log('Sleep {}s'.format(post_soft_action_delay))
                sleep(post_soft_action_delay)
                # do not send signal twice!
        # Remember the victim (with the time of this action) unless it is
        # expected to die
        if not vwd:
            if victim_id not in v_dict:
                v_dict[victim_id] = dict()
            v_dict[victim_id]['time'] = monotonic()
            v_dict[victim_id]['name'] = name
        else:
            pass
        last_action_dict['t'] = kill_timestamp = monotonic()
        # the waiting time below is measured from the start of the action
        kill_timestamp = start_action
        # Poll the victim every 10 ms
        while True:
            sleep(0.01)
            d = monotonic() - kill_timestamp
            iva = is_victim_alive(victim_id)
            if iva == 0:
                log('The victim died in {}s'.format(round(d, 3)))
                if victim_id in v_dict:
                    v_dict.pop(victim_id)
                break
            elif iva == 1:
                # still alive: a victim that is expected to die gets 10 s
                # more than sensitivity_test_time
                if vwd and d > sensitivity_test_time + 10:
                    log('The victim doesn\'t respond on corrective action'
                        ' in {}s'.format(round(d, 3)))
                    break
                if not vwd and d > sensitivity_test_time:
                    log('The victim doesn\'t respond on corrective action'
                        ' in {}s'.format(round(d, 3)))
                    break
            elif iva == 2:
                # keep waiting without a timeout
                pass
            else:
                log('The victim became a zombie in {}s'.format(round(d, 3)))
                if victim_id in v_dict:
                    v_dict.pop(victim_id)
                sleep(post_zombie_delay)
                break
        mem_available, swap_total, swap_free = check_mem_and_swap()
        ma_mib = int(mem_available) / 1024.0
        sf_mib = int(swap_free) / 1024.0
        log('Memory status after implementing a corrective act'
            'ion:\n MemAvailable'
            ': {} MiB, SwapFree: {} MiB'.format(
                round(ma_mib, 1), round(sf_mib, 1)))
        if threshold is SIGKILL and post_kill_exe != '':
            cmd = post_kill_exe.replace('$PID', pid).replace(
                '$NAME', pid_to_name(pid))
            log('Execute post_kill_exe')
            start_thread(exe, cmd)
        if post_action_gui_notifications:
            if soft_match:
                send_notify_etc(pid, name, cmd)
            else:
                send_notify(threshold, name, pid)
    else:
        response_time = monotonic() - time0
        victim_badness_is_too_small = 'victim (PID: {}, Name: {}) badness ' \
            '({}) < min_badness ({}); nothing to do; response tim' \
            'e: {}ms'.format(
                pid, name,
                victim_badness,
                min_badness,
                round(response_time * 1000))
        log(victim_badness_is_too_small)
        # update stat_dict
        key = 'victim badness < min_badness'
        update_stat_dict(key)
        print_stat_dict()
    # vwd is still None when no signal was attempted or a SIGTERM was sent
    if vwd is None:
        if debug_sleep:
            log('Sleep {}s'.format(over_sleep))
        sleep(over_sleep)
    log('<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<'
        '<<<<<<<<<<<<<<<<<')
    return psi_t0
def sleep_after_check_mem():
    """
    Sleep until the next memory check.

    With stable_sleep the pause is always min_sleep. Otherwise it is the
    estimated time needed to consume the distance to the thresholds at the
    configured fill rates, limited to the range min_sleep..max_sleep.
    All inputs are module-level values.
    """
    if stable_sleep:
        if debug_sleep:
            log('Sleep {}s'.format(min_sleep))
        stdout.flush()
        sleep(min_sleep)
        return None

    # Distance to the nearest (i.e. larger) threshold, never negative
    mem_point = mem_available - max(
        soft_threshold_min_mem_kb, hard_threshold_min_mem_kb)
    swap_point = swap_free - max(
        soft_threshold_min_swap_kb, hard_threshold_min_swap_kb)
    swap_point = max(swap_point, 0)
    mem_point = max(mem_point, 0)

    t_mem = mem_point / fill_rate_mem
    t_swap = swap_point / fill_rate_swap
    t_mem_swap = t_mem + t_swap

    if CHECK_ZRAM:
        t_zram = max(
            (mem_total * 0.8 - mem_used_zram) / fill_rate_zram, 0)
        z = ', t_zram={}'.format(round(t_zram, 2))
        # the shorter of the two estimates
        t = min(t_mem_swap, t_mem + t_zram)
    else:
        z = ''
        t = t_mem_swap

    if t > max_sleep:
        t = max_sleep
    elif t < min_sleep:
        t = min_sleep

    if debug_sleep:
        log('Sleep {}s (t_mem={}, t_swap={}{})'.format(round(t, 2), round(
            t_mem, 2), round(t_swap, 2), z))
    stdout.flush()
    sleep(t)
def calculate_percent(arg_key):
    """Parse and validate a memory threshold taken from config_dict.

    The value must end with '%' (a share of mem_total) or with 'M' (MiB).
    A missing key is reported with missing_config_key(); an unparsable
    number, an unknown suffix or an out-of-range value is reported with
    invalid_config_key_value(). Both of these terminate the program.

    arg_key: str key for config_dict
    returns the tuple (mem_min_kb, mem_min_mb, mem_min_percent)
    """
    if arg_key not in config_dict:
        missing_config_key(arg_key)

    raw_value = config_dict[arg_key]

    if raw_value.endswith('%'):
        # drop the percent sign and convert the rest to float
        mem_min_percent = string_to_float_convert_test(
            raw_value[:-1].strip())
        if mem_min_percent is None:
            invalid_config_key_value(arg_key)
        # a valid percentage can be translated into KiB
        mem_min_kb = mem_min_percent / 100 * mem_total
        mem_min_mb = round(mem_min_kb / 1024)
    elif raw_value.endswith('M'):
        mem_min_mb = string_to_float_convert_test(raw_value[:-1].strip())
        if mem_min_mb is None:
            invalid_config_key_value(arg_key)
        mem_min_kb = mem_min_mb * 1024
        mem_min_percent = mem_min_kb / mem_total * 100
    else:
        invalid_config_key_value(arg_key)

    # key -> (lowest allowed KiB, highest allowed KiB);
    # keys not listed here are not range-checked
    allowed_range = {
        'soft_threshold_min_mem': (0, mem_total * 0.5),
        'hard_threshold_min_mem': (0, mem_total * 0.5),
        'soft_threshold_max_zram': (mem_total * 0.1, mem_total * 0.9),
        'hard_threshold_max_zram': (mem_total * 0.1, mem_total * 0.9),
        'warning_threshold_min_mem': (0, mem_total),
        'warning_threshold_max_zram': (0, mem_total),
    }

    if arg_key in allowed_range:
        lowest_kb, highest_kb = allowed_range[arg_key]
        if mem_min_kb > highest_kb or mem_min_kb < lowest_kb:
            invalid_config_key_value(arg_key)

    return mem_min_kb, mem_min_mb, mem_min_percent
###############################################################################

# Start of the script body: values needed before the command line is parsed.

# victims: {victim_id: {'time': timestamp, 'name': name}}
v_dict = dict()

# monotonic clock reading taken at startup
start_time = monotonic()

# Usage text, printed for -h/--help and after every command line error.
help_mess = """usage: nohang [-h|--help] [-v|--version] [-m|--memload]
[-c|--config CONFIG] [--check] [--monitor] [--tasks]
optional arguments:
-h, --help show this help message and exit
-v, --version show version of installed package and exit
-m, --memload consume memory until 40 MiB (MemAvailable + SwapFree)
remain free, and terminate the process
-c CONFIG, --config CONFIG
path to the config file. This should only be used
with one of the following options:
--monitor, --tasks, --check
--check check and show the configuration and exit. This should
only be used with -c/--config CONFIG option
--monitor start monitoring. This should only be used with
-c/--config CONFIG option
--tasks show tasks state and exit. This should only be used
with -c/--config CONFIG option"""
# Clock ticks per second and memory page size, as reported by sysconf.
SC_CLK_TCK = os.sysconf(os.sysconf_names['SC_CLK_TCK'])
SC_PAGESIZE = os.sysconf(os.sysconf_names['SC_PAGESIZE'])

# Common tail of the messages printed when the config cannot be read.
conf_err_mess = 'Invalid config. Exit.'

# Signals that get signal_handler installed before monitoring starts.
sig_list = [SIGTERM, SIGINT, SIGQUIT, SIGHUP]

# Signal -> printable name.
sig_dict = {
    SIGKILL: 'SIGKILL',
    SIGINT: 'SIGINT',
    SIGQUIT: 'SIGQUIT',
    SIGHUP: 'SIGHUP',
    SIGTERM: 'SIGTERM',
}

self_pid = str(os.getpid())
self_uid = os.geteuid()

# True when running with effective UID 0.
root = self_uid == 0

# 't' holds a monotonic timestamp; it starts at the moment of startup.
last_action_dict = {'t': monotonic()}

# will store corrective actions stat
stat_dict = {}

separate_log = False  # will be overwritten after parse config

cgroup_v1_index, cgroup_v2_index = find_cgroup_indexes()

pid_list = alive_pid_list()

# Set from the command line: --tasks and --check respectively.
print_proc_table_flag = False
check_config_flag = False
# Command line parsing. Accepted forms:
#   one option:    -h|--help, -v|--version, -m|--memload
#   three options: -c|--config CONFIG plus one of --monitor, --check,
#                  --tasks (in any order, CONFIG right after -c/--config)
# Anything else prints an error and the usage text and exits with status 1.


def _exit_with_usage_error(message='ERROR: invalid input\n'):
    """Print message followed by the usage text, then exit with status 1."""
    print(message)
    print(help_mess)
    exit(1)


cli_config_flags = ('-c', '--config')
cli_mode_flags = ('--monitor', '--check', '--tasks')

a = argv[1:]
la = len(a)

if la == 0:
    _exit_with_usage_error('ERROR: invalid input: missing CLI options\n')

if la == 1:
    if a[0] == '-h' or a[0] == '--help':
        print(help_mess)
        exit()
    if a[0] == '-v' or a[0] == '--version':
        print_version()
    if a[0] == '-m' or a[0] == '--memload':
        memload()
    # reached for an unknown option (or if the calls above return)
    _exit_with_usage_error()

if la == 2:
    _exit_with_usage_error()

if la == 3:
    if not any(flag in a for flag in cli_config_flags):
        _exit_with_usage_error()
    if not any(flag in a for flag in cli_mode_flags):
        _exit_with_usage_error()

    # When both spellings are present, '--config' takes precedence.
    if '--config' in a:
        aaa = a.index('--config')
    else:
        aaa = a.index('-c')

    try:
        config = a[aaa + 1]
    except IndexError:
        # -c/--config was the last argument: no path follows it
        _exit_with_usage_error()

    # A mode flag directly after -c/--config means the path is missing.
    # (The previous check compared against '--tasks:' and so let
    # '-c --tasks' through with '--tasks' as the config path.)
    if config in cli_mode_flags:
        _exit_with_usage_error()

    if '--check' in a:
        check_config_flag = True
    if '--tasks' in a:
        print_proc_table_flag = True

if la > 3:
    _exit_with_usage_error('ERROR: invalid CLI input: too many options\n')
# Read /proc/meminfo once to find mem_total and the line numbers of the
# fields that are looked up by position later (MemAvailable, MemFree,
# SwapTotal, SwapFree, Buffers, Cached, SReclaimable, Shmem).
with open('/proc/meminfo') as f:
    mem_list = f.readlines()

# field names, in file order: the text before the first ':' of every line
mem_list_names = [entry.split(':')[0] for entry in mem_list]

try:
    mem_available_index = mem_list_names.index('MemAvailable')
except ValueError:
    errprint('ERROR: your Linux kernel is too old, Linux 3.14+ required')
    # Stop here, as memload() does in the same situation; going on would
    # leave mem_available_index undefined.
    exit(1)

mem_free_index = mem_list_names.index('MemFree')
swap_total_index = mem_list_names.index('SwapTotal')
swap_free_index = mem_list_names.index('SwapFree')
buffers_index = mem_list_names.index('Buffers')
cached_index = mem_list_names.index('Cached')
sreclaimable_index = mem_list_names.index('SReclaimable')
shmem_index = mem_list_names.index('Shmem')

# The first line is taken as MemTotal; [:-4] cuts the trailing ' kB\n'.
mem_total = int(mem_list[0].split(':')[1][:-4])
# Learn the line numbers of the fields of /proc/*/status (needed to read
# VmRSS, VmSwap and others by position), using our own status file as the
# template.
with open('/proc/self/status') as file:
    status_list = file.readlines()

status_names = [entry.split(':')[0] for entry in status_list]

ppid_index = status_names.index('PPid')
vm_size_index = status_names.index('VmSize')
vm_rss_index = status_names.index('VmRSS')
vm_swap_index = status_names.index('VmSwap')
uid_index = status_names.index('Uid')
state_index = status_names.index('State')

# The RSS breakdown is optional: when any of the three fields is absent
# (the original note says "not Linux 4.5+"), detailed_rss is switched off.
try:
    anon_index = status_names.index('RssAnon')
    file_index = status_names.index('RssFile')
    # NOTE(review): this rebinds shmem_index, which was already set from the
    # 'Shmem' line of /proc/meminfo earlier in the script; confirm which of
    # the two meanings the users of shmem_index expect.
    shmem_index = status_names.index('RssShmem')
except ValueError:
    detailed_rss = False
else:
    detailed_rss = True

log('config: ' + config)
###############################################################################

# Parsing the config file. The result is:
#   config_dict               plain "key = value" options, values as str
#   badness_adj_re_*_list     (badness_adj, regexp) pairs, one list per
#                             @BADNESS_ADJ_RE_* directive
#   soft_actions_list         (match target, regexp, command) triples from
#                             the @SOFT_ACTION_RE_* directives

config_dict = dict()

badness_adj_re_name_list = []
badness_adj_re_cmdline_list = []
badness_adj_re_environ_list = []
badness_adj_re_uid_list = []
badness_adj_re_cgroup_v1_list = []
badness_adj_re_cgroup_v2_list = []
badness_adj_re_realpath_list = []
badness_adj_re_cwd_list = []

soft_actions_list = []

# separator for optional parameters (that starts with @)
opt_separator = '///'

# directive prefix -> match target stored in soft_actions_list
soft_action_directives = (
    ('@SOFT_ACTION_RE_NAME', 'name'),
    ('@SOFT_ACTION_RE_CGROUP_V1', 'cgroup_v1'),
    ('@SOFT_ACTION_RE_CGROUP_V2', 'cgroup_v2'),
)

# directive prefix -> list that collects its (badness_adj, regexp) pairs
badness_adj_directives = (
    ('@BADNESS_ADJ_RE_NAME', badness_adj_re_name_list),
    ('@BADNESS_ADJ_RE_CMDLINE', badness_adj_re_cmdline_list),
    ('@BADNESS_ADJ_RE_UID', badness_adj_re_uid_list),
    ('@BADNESS_ADJ_RE_CGROUP_V1', badness_adj_re_cgroup_v1_list),
    ('@BADNESS_ADJ_RE_CGROUP_V2', badness_adj_re_cgroup_v2_list),
    ('@BADNESS_ADJ_RE_REALPATH', badness_adj_re_realpath_list),
    ('@BADNESS_ADJ_RE_CWD', badness_adj_re_cwd_list),
    ('@BADNESS_ADJ_RE_ENVIRON', badness_adj_re_environ_list),
)

directive_prefixes = tuple(
    [prefix for prefix, target in soft_action_directives] +
    [prefix for prefix, rule_list in badness_adj_directives])

try:
    with open(config) as f:
        for line in f:

            # comments, empty lines and indented lines carry no settings
            if line.startswith(('#', '\n', '\t', ' ')):
                continue

            if not line.startswith(directive_prefixes):
                # plain "key = value" option; every key may occur only once
                parts = line.partition('=')
                key = parts[0].strip()
                value = parts[2].strip()
                if key in config_dict:
                    log('ERROR: config key duplication: {}'.format(key))
                    exit(1)
                config_dict[key] = value
                continue

            # Directive lines are never stored in config_dict. Earlier the
            # @BADNESS_ADJ_RE_* lines were stored there as well, so two
            # rules with the same text before the first '=' were rejected
            # as a duplicated key.

            # @SOFT_ACTION_RE_<TARGET> regexp /// command
            for prefix, target in soft_action_directives:
                if line.startswith(prefix):
                    parts = line.partition(prefix)[2].partition(
                        opt_separator)
                    reg_exp = parts[0].strip()
                    valid_re(reg_exp)
                    command = parts[2].strip()
                    soft_actions_list.append((target, reg_exp, command))
                    break

            # @BADNESS_ADJ_RE_<TARGET> badness_adj /// regexp
            for prefix, rule_list in badness_adj_directives:
                if line.startswith(prefix):
                    parts = line.partition(prefix)[2].strip(
                        ' \n').partition(opt_separator)
                    badness_adj = parts[0].strip(' ')
                    reg_exp = parts[2].strip(' ')
                    valid_re(reg_exp)
                    rule_list.append((badness_adj, reg_exp))
                    break

except PermissionError:
    errprint('PermissionError', conf_err_mess)
    exit(1)
except UnicodeDecodeError:
    errprint('UnicodeDecodeError', conf_err_mess)
    exit(1)
except IsADirectoryError:
    errprint('IsADirectoryError', conf_err_mess)
    exit(1)
except IndexError:
    errprint('IndexError', conf_err_mess)
    exit(1)
except FileNotFoundError:
    errprint('FileNotFoundError', conf_err_mess)
    exit(1)
# A kind of matching is switched on only when the config contained at least
# one rule of that kind.
regex_matching = bool(badness_adj_re_name_list)
re_match_cmdline = bool(badness_adj_re_cmdline_list)
re_match_uid = bool(badness_adj_re_uid_list)
re_match_environ = bool(badness_adj_re_environ_list)
re_match_realpath = bool(badness_adj_re_realpath_list)
re_match_cwd = bool(badness_adj_re_cwd_list)
re_match_cgroup_v1 = bool(badness_adj_re_cgroup_v1_list)
re_match_cgroup_v2 = bool(badness_adj_re_cgroup_v2_list)

soft_actions = bool(soft_actions_list)
###############################################################################

# extracting parameters from the dictionary
# check for all necessary parameters
# validation of all parameters

# Boolean options. conf_parse_bool() is defined outside this section;
# presumably it reports a missing or non-boolean key itself (TODO confirm),
# so the order of the calls decides which problem is reported first.
debug_psi = conf_parse_bool('debug_psi')
print_statistics = conf_parse_bool('print_statistics')
print_proc_table = conf_parse_bool('print_proc_table')
print_victim_status = conf_parse_bool('print_victim_status')
print_victim_cmdline = conf_parse_bool('print_victim_cmdline')
print_config_at_startup = conf_parse_bool('print_config_at_startup')
print_mem_check_results = conf_parse_bool('print_mem_check_results')
debug_sleep = conf_parse_bool('debug_sleep')
hide_corrective_action_type = conf_parse_bool('hide_corrective_action_type')
low_memory_warnings_enabled = conf_parse_bool('low_memory_warnings_enabled')
post_action_gui_notifications = conf_parse_bool(
    'post_action_gui_notifications')
debug_threading = conf_parse_bool('debug_threading')

psi_checking_enabled = conf_parse_bool('psi_checking_enabled')

# PSI is ignored when it is disabled in the config, or when the probe of
# /proc/pressure/memory below raises any exception.
ignore_psi = not psi_checking_enabled

if psi_checking_enabled:
    try:
        psi_file_mem_to_metrics('/proc/pressure/memory')
    except Exception as e:
        print('WARNING: PSI metrics are not provided by the kernel: {}'.format(
            e))
        ignore_psi = True

zram_checking_enabled = conf_parse_bool('zram_checking_enabled')
ignore_zram = not zram_checking_enabled

debug_gui_notifications = conf_parse_bool('debug_gui_notifications')
ignore_positive_oom_score_adj = conf_parse_bool(
    'ignore_positive_oom_score_adj')

# Memory and zram thresholds. calculate_percent() returns every threshold
# in three forms: KiB, MiB and percent of mem_total.
(soft_threshold_min_mem_kb, soft_threshold_min_mem_mb,
 soft_threshold_min_mem_percent) = calculate_percent('soft_threshold_min_mem')
(hard_threshold_min_mem_kb, hard_threshold_min_mem_mb,
 hard_threshold_min_mem_percent) = calculate_percent('hard_threshold_min_mem')
(soft_threshold_max_zram_kb, soft_threshold_max_zram_mb,
 soft_threshold_max_zram_percent) = calculate_percent(
    'soft_threshold_max_zram')
(hard_threshold_max_zram_kb, hard_threshold_max_zram_mb,
 hard_threshold_max_zram_percent) = calculate_percent(
    'hard_threshold_max_zram')
(warning_threshold_min_mem_kb, warning_threshold_min_mem_mb,
 warning_threshold_min_mem_percent) = calculate_percent(
    'warning_threshold_min_mem')
(warning_threshold_max_zram_kb, warning_threshold_max_zram_mb,
 warning_threshold_max_zram_percent) = calculate_percent(
    'warning_threshold_max_zram')
# Numeric and string options. Every option is mandatory. The helpers below
# report a missing key with missing_config_key() and a bad value with
# invalid_config_key_value(); both terminate the program. Options are
# checked in a fixed order, so the first problem found is the one reported.


def _conf_str(key):
    """Return the raw string stored for key, or report the key as missing."""
    if key not in config_dict:
        missing_config_key(key)
    return config_dict[key]


def _conf_number(key, convert, lowest, highest=None):
    """Return convert(config_dict[key]) after a range check.

    convert is one of the string_to_*_convert_test functions, which give
    None for text that is not a number. The value is rejected when it is
    None, below lowest or (if highest is given) above highest.
    """
    value = convert(_conf_str(key))
    if value is None or value < lowest:
        invalid_config_key_value(key)
    if highest is not None and value > highest:
        invalid_config_key_value(key)
    return value


def _conf_float(key, lowest, highest=None):
    """Return the float option key, limited to lowest..highest."""
    return _conf_number(key, string_to_float_convert_test, lowest, highest)


def _conf_int(key, lowest):
    """Return the integer option key, which must not be below lowest."""
    return _conf_number(key, string_to_int_convert_test, lowest)


post_zombie_delay = _conf_float('post_zombie_delay', 0)
victim_cache_time = _conf_float('victim_cache_time', 0)
env_cache_time = _conf_float('env_cache_time', 0)
exe_timeout = _conf_float('exe_timeout', 0.1)

fill_rate_mem = _conf_float('fill_rate_mem', 100)
fill_rate_swap = _conf_float('fill_rate_swap', 100)
fill_rate_zram = _conf_float('fill_rate_zram', 100)

# Swap thresholds stay strings here; they are converted later by
# get_swap_threshold_tuple().
soft_threshold_min_swap = _conf_str('soft_threshold_min_swap')
hard_threshold_min_swap = _conf_str('hard_threshold_min_swap')

post_soft_action_delay = _conf_float('post_soft_action_delay', 0.1)
psi_post_action_delay = _conf_float('psi_post_action_delay', 10)

hard_threshold_max_psi = _conf_float('hard_threshold_max_psi', 1, 100)
soft_threshold_max_psi = _conf_float('soft_threshold_max_psi', 1, 100)
warning_threshold_max_psi = _conf_float('warning_threshold_max_psi', 1, 100)

min_badness = _conf_int('min_badness', 1)
min_post_warning_delay = _conf_float('min_post_warning_delay', 1)

warning_threshold_min_swap = _conf_str('warning_threshold_min_swap')

# The former check tested min_badness for None here, so a non-integer
# max_victim_ancestry_depth ended in a TypeError on the "< 1" comparison.
max_victim_ancestry_depth = _conf_int('max_victim_ancestry_depth', 1)

max_soft_exit_time = _conf_float('max_soft_exit_time', 0.1)

post_kill_exe = _conf_str('post_kill_exe')

psi_path = _conf_str('psi_path')
if not ignore_psi:
    # an unusable psi_path only produces a warning
    try:
        psi_file_mem_to_metrics(psi_path)
    except Exception as e:
        errprint('WARNING: invalid psi_path "{}": {}'.format(
            psi_path, e))

psi_metrics = _conf_str('psi_metrics')
valid_metrics = {
    'some_avg10', 'some_avg60', 'some_avg300',
    'full_avg10', 'full_avg60', 'full_avg300'}
if psi_metrics not in valid_metrics:
    invalid_config_key_value('psi_metrics')

# an empty warning_exe means that no command is configured
warning_exe = _conf_str('warning_exe')
check_warning_exe = warning_exe != ''

extra_table_info = _conf_str('extra_table_info')
valid_eti = {'None', 'cwd', 'realpath',
             'cgroup_v1', 'cgroup_v2', 'cmdline', 'environ'}
if extra_table_info not in valid_eti:
    invalid_config_key_value('extra_table_info')

separate_log = conf_parse_bool('separate_log')

if separate_log:
    # Log into /var/log/nohang/nohang.log as well. Failures to prepare the
    # directory or the file are reported, but do not stop the program.
    import logging

    log_dir = '/var/log/nohang'
    logfile = log_dir + '/nohang.log'

    try:
        os.mkdir(log_dir)
    except FileExistsError:
        pass
    except PermissionError:
        errprint('ERROR: cannot create {}'.format(log_dir))

    try:
        os.chmod(log_dir, mode=0o750)
    except FileNotFoundError:
        errprint('ERROR: file not found: {}'.format(log_dir))
    except PermissionError:
        errprint('ERROR: permission denied: {}'.format(log_dir))

    try:
        logging.basicConfig(
            filename=logfile,
            level=logging.INFO,
            format="%(asctime)s: %(message)s")
    except FileNotFoundError:
        errprint('ERROR: file not found: {}'.format(logfile))
    except PermissionError:
        errprint('ERROR: permission denied: {}'.format(logfile))

min_mem_report_interval = _conf_float('min_mem_report_interval', 0)
psi_excess_duration = _conf_float('psi_excess_duration', 0)

# min_sleep may not exceed max_sleep
max_sleep = _conf_float('max_sleep', 0.01)
min_sleep = _conf_float('min_sleep', 0.01, max_sleep)

over_sleep = min_sleep
sensitivity_test_time = over_sleep / 4

# equal limits mean a fixed check interval
stable_sleep = max_sleep == min_sleep
# --tasks mode: check permissions and print the process table.
# func_print_proc_table() is defined outside this section; presumably it
# ends the program after printing (the usage text says "show tasks state
# and exit") -- TODO confirm.
if print_proc_table_flag:
    check_permissions()
    func_print_proc_table()

# These modules are imported only when at least one feature that is
# switched on here is configured (warnings, notifications, warning_exe,
# soft actions or post_kill_exe).
if (low_memory_warnings_enabled or
        post_action_gui_notifications or
        check_warning_exe or
        soft_actions or
        post_kill_exe != ''):
    import threading
    import shlex
    from subprocess import Popen, TimeoutExpired

psi_support = os.path.exists(psi_path)

# Get KiB levels if it's possible.
# Each tuple is read below as (value, is_percent).
soft_threshold_min_swap_tuple = get_swap_threshold_tuple(
    soft_threshold_min_swap, 'soft_threshold_min_swap')
hard_threshold_min_swap_tuple = get_swap_threshold_tuple(
    hard_threshold_min_swap, 'hard_threshold_min_swap')
warning_threshold_min_swap_tuple = get_swap_threshold_tuple(
    warning_threshold_min_swap, 'warning_threshold_min_swap')

# Collects the swap thresholds that were given as absolute sizes.
swap_kb_dict = dict()

# For every threshold only one of the *_percent / *_kb names is bound here,
# depending on how the value was written in the config. The main loop later
# rebinds the soft and hard *_kb names from check_mem_swap_ex().
swap_term_is_percent = soft_threshold_min_swap_tuple[1]
if swap_term_is_percent:
    soft_threshold_min_swap_percent = soft_threshold_min_swap_tuple[0]
else:
    soft_threshold_min_swap_kb = soft_threshold_min_swap_tuple[0]
    swap_kb_dict['soft_threshold_min_swap_kb'] = soft_threshold_min_swap_kb

swap_kill_is_percent = hard_threshold_min_swap_tuple[1]
if swap_kill_is_percent:
    hard_threshold_min_swap_percent = hard_threshold_min_swap_tuple[0]
else:
    hard_threshold_min_swap_kb = hard_threshold_min_swap_tuple[0]
    swap_kb_dict['hard_threshold_min_swap_kb'] = hard_threshold_min_swap_kb

swap_warn_is_percent = warning_threshold_min_swap_tuple[1]
if swap_warn_is_percent:
    warning_threshold_min_swap_percent = warning_threshold_min_swap_tuple[0]
else:
    warning_threshold_min_swap_kb = warning_threshold_min_swap_tuple[0]
    swap_kb_dict[
        'warning_threshold_min_swap_kb'] = warning_threshold_min_swap_kb
# Show the effective configuration when asked by the config or by --check.
# check_config() is defined outside this section; presumably it exits in
# --check mode -- TODO confirm.
if print_config_at_startup or check_config_flag:
    check_config()

# for calculating the column width when printing mem and zram
mem_len = len(str(round(mem_total / 1024.0)))

if post_action_gui_notifications:
    # signal -> word used in notifications
    notify_sig_dict = {SIGKILL: 'Killing',
                       SIGTERM: 'Terminating'}

# convert rates from MiB/s to KiB/s
fill_rate_mem = fill_rate_mem * 1024
fill_rate_swap = fill_rate_swap * 1024
fill_rate_zram = fill_rate_zram * 1024

# State of the low memory warning timer used in the main loop.
warn_time_now = 0
warn_time_delta = 1000  # ?
warn_timer = 0

mlockall()

check_permissions()

psi_avg_string = ''  # will be overwritten if PSI monitoring enabled

mem_used_zram = 0

if print_mem_check_results:

    # to find delta mem
    wt2 = 0
    new_mem = 0

    # init mem report interval
    report0 = 0

# handle signals
for i in sig_list:
    signal(i, signal_handler)

x0 = monotonic()
delta0 = 0

threshold = None
mem_info = None

# PSI is checked only when the PSI file exists and PSI is not ignored.
CHECK_PSI = False
if psi_support and not ignore_psi:
    CHECK_PSI = True

# Initial values for the state that is passed to and returned from
# check_psi_ex() in the main loop.
psi_kill_exceeded_timer = psi_term_exceeded_timer = -0.0001
psi_t0 = monotonic()
psi_threshold = zram_threshold = zram_info = psi_info = None

CHECK_ZRAM = not ignore_zram

log('Monitoring has started!')

stdout.flush()

# Prefixes of environment variables.
# NOTE(review): presumably searched for in /proc/*/environ when sending
# GUI notifications; the users are outside this section.
display_env = 'DISPLAY='
dbus_env = 'DBUS_SESSION_BUS_ADDRESS='
user_env = 'USER='

envd = dict()
envd['list_with_envs'] = envd['t'] = None

cmd_num_dict = dict()
cmd_num_dict['cmd_num'] = 0

# /proc/meminfo is opened once, unbuffered, and kept open in fd['mi'].
fd = dict()
fd['mi'] = open('/proc/meminfo', 'rb', buffering=0)

arcstats_path = '/proc/spl/kstat/zfs/arcstats'
# arcstats_path = './arcstats'

# ZFS ARC statistics are taken into account only when this file exists.
ZFS = os.path.exists(arcstats_path)

if ZFS:
    try:
        # find indexes
        # (line numbers of c_min, size, arc_meta_used and arc_meta_min;
        # an index stays undefined when its line is not found)
        with open(arcstats_path, 'rb') as f:
            a_list = f.read().decode().split('\n')
        for n, line in enumerate(a_list):
            if line.startswith('c_min '):
                c_min_index = n
            elif line.startswith('size '):
                size_index = n
            elif line.startswith('arc_meta_used '):
                arc_meta_used_index = n
            elif line.startswith('arc_meta_min '):
                arc_meta_min_index = n
            else:
                continue
    except Exception as e:
        log(e)

# Wall clock and CPU time reference points taken right before the loop.
m0 = monotonic()
pt0 = process_time()
# Main monitoring loop. Every pass:
#   1. checks memory and swap, and optionally zram and PSI;
#   2. optionally logs a memory report;
#   3. runs a corrective action when any check asks for SIGKILL or SIGTERM
#      and starts the next pass at once;
#   4. otherwise optionally sends a low memory warning and sleeps.
while True:

    (masf_threshold, masf_info, mem_available, hard_threshold_min_swap_kb,
     soft_threshold_min_swap_kb, swap_free, swap_total) = check_mem_swap_ex()

    if CHECK_ZRAM:
        zram_threshold, zram_info, mem_used_zram = check_zram_ex()

    if CHECK_PSI:
        (psi_threshold, psi_info, psi_t0, psi_kill_exceeded_timer,
         psi_term_exceeded_timer, x0) = check_psi_ex(
            psi_t0, psi_kill_exceeded_timer, psi_term_exceeded_timer, x0,
            mem_available)

    if print_mem_check_results:

        if CHECK_PSI:
            psi_avg_value = find_psi_metrics_value(psi_path, psi_metrics)
            # print(psi_avg_value)
            if monotonic() - psi_t0 >= psi_post_action_delay:
                psi_post_action_delay_exceeded = True
            else:
                psi_post_action_delay_exceeded = False

            if print_mem_check_results:
                psi_avg_string = 'PSI: {} | '.format(
                    str(psi_avg_value).rjust(6))

        wt1 = monotonic()

        # change of MemAvailable + SwapFree since the last report, in KiB
        delta = (mem_available + swap_free) - new_mem

        t_cycle = wt1 - wt2

        report_delta = wt1 - report0

        # Report at most once per min_mem_report_interval seconds.
        if report_delta >= min_mem_report_interval:
            mem_report = True
            new_mem = mem_available + swap_free
            report0 = wt1
        else:
            mem_report = False

        wt2 = monotonic()

        if mem_report:

            # rate of change in MiB/s over the time since the last report
            speed = delta / 1024.0 / report_delta
            speed_info = ' | dMem: {} M/s'.format(
                str(round(speed)).rjust(5)
            )

            # Calculate 'swap-column' width
            swap_len = len(str(round(swap_total / 1024.0)))

            # Output available mem sizes
            # (the swap and zram columns appear only when they are in use)
            if swap_total == 0 and mem_used_zram == 0:
                log('{}MemAvail: {} M, {} %{}'.format(
                    psi_avg_string,
                    human(mem_available, mem_len),
                    just_percent_mem(mem_available / mem_total),
                    speed_info
                )
                )

            elif swap_total > 0 and mem_used_zram == 0:
                log('{}MemAvail: {} M, {} % | SwapFree: {} M, {} %{}'.format(
                    psi_avg_string,
                    human(mem_available, mem_len),
                    just_percent_mem(mem_available / mem_total),
                    human(swap_free, swap_len),
                    just_percent_swap(swap_free / (swap_total + 0.1)),
                    speed_info
                )
                )

            else:
                # + 0.1 keeps the division valid when swap_total is 0
                log('{}MemAvail: {} M, {} % | SwapFree: {} M, {} % | Mem'
                    'UsedZram: {} M, {} %{}'.format(
                        psi_avg_string,
                        human(mem_available, mem_len),
                        just_percent_mem(mem_available / mem_total),
                        human(swap_free, swap_len),
                        just_percent_swap(swap_free / (swap_total + 0.1)),
                        human(mem_used_zram, mem_len),
                        just_percent_mem(mem_used_zram / mem_total),
                        speed_info
                    )
                    )

    # SIGKILL is tested before SIGTERM, so it wins when different checks
    # ask for different signals.
    if (masf_threshold == SIGKILL or zram_threshold == SIGKILL or
            psi_threshold == SIGKILL):

        threshold = SIGKILL
        # pass along the info of every check that produced one
        mem_info_list = []

        if masf_info is not None:
            mem_info_list.append(masf_info)

        if zram_info is not None:
            mem_info_list.append(zram_info)

        if psi_info is not None:
            mem_info_list.append(psi_info)

        psi_t0 = implement_corrective_action(
            threshold,
            mem_info_list,
            psi_t0,
            psi_kill_exceeded_timer,
            psi_term_exceeded_timer,
            x0, psi_threshold, zram_threshold, zram_info, psi_info)
        # check again at once, without sleeping
        continue

    if (masf_threshold == SIGTERM or zram_threshold == SIGTERM or
            psi_threshold == SIGTERM):

        threshold = SIGTERM
        mem_info_list = []

        if masf_info is not None:
            mem_info_list.append(masf_info)

        if zram_info is not None:
            mem_info_list.append(zram_info)

        if psi_info is not None:
            mem_info_list.append(psi_info)

        psi_t0 = implement_corrective_action(
            threshold,
            mem_info_list,
            psi_t0,
            psi_kill_exceeded_timer,
            psi_term_exceeded_timer,
            x0, psi_threshold, zram_threshold, zram_info, psi_info)
        continue

    if low_memory_warnings_enabled:

        if (masf_threshold == 'WARN' or zram_threshold == 'WARN' or
                psi_threshold == 'WARN'):

            # warn_timer grows by the time since the previous pass that
            # was in the WARN state; a warning is sent once it exceeds
            # min_post_warning_delay, and the timer starts over.
            warn_time_delta = monotonic() - warn_time_now
            warn_time_now = monotonic()
            warn_timer += warn_time_delta
            if warn_timer > min_post_warning_delay:

                send_notify_warn()

                warn_timer = 0

    sleep_after_check_mem()