nohang/n4

1767 lines
53 KiB
Python
Executable File
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""A daemon that prevents OOM in Linux systems."""
import os
from time import sleep, time
from operator import itemgetter
from sys import stdout
from signal import SIGKILL, SIGTERM, SIGSTOP, SIGCONT
# Moment the daemon was started; used when reporting statistics.
start_time = time()

# Printable names of the signals the daemon can send.
sig_dict = {
    SIGKILL: 'SIGKILL',
    SIGTERM: 'SIGTERM',
}

# Identity of this process.
self_pid = str(os.getpid())
self_uid = os.geteuid()
root = self_uid == 0

# Timeout handed to Popen.wait() when a notification helper is spawned.
wait_time = 3

# todo: make config option
max_sleep_time = 2

# todo: make config option
min_sleep_time = 0.1

notify_helper_path = '/usr/sbin/nohang_notify_helper'

# Pressure stall information is optional: remember whether the file exists.
psi_path = '/proc/pressure/memory'
psi_support = os.path.exists(psi_path)

# Horizontal rule used in the log output.
HR = '~' * 79

# todo: make config option
print_total_stat = True

# Switches for pausing other processes around corrective actions/warnings.
stop_cont = False
stop_cont_warn = False

# print(os.path.realpath('/proc/29758/exe'))
##########################################################################
# define functions
def pid_to_state(pid):
    """
    Get the state character of a process from /proc/<pid>/stat.
    pid: str pid of required process
    returns the character that follows ') ' after the last ')' of the line
    """
    stat_line = rline1('/proc/' + pid + '/stat')
    # Split on the LAST ')' so that a ')' inside the parenthesised command
    # name cannot confuse the parsing; the remainder starts with a space.
    after_comm = stat_line.rpartition(')')[2]
    return after_comm[1]
def stop():
    """
    Send SIGSTOP to non-root processes whose oom_score is greater than 9.

    PID 1 and the daemon itself are skipped, as are processes that disappear
    while they are being inspected.
    returns list of str pids that were selected for stopping
    """
    t1 = time()
    t2 = time()
    stopped_list = []
    for pid in os.listdir('/proc')[::-1]:
        # only entries whose names start with a digit, except /proc/1/ and
        # the daemon itself. Strings are compared with ==; comparing with
        # `is` tests object identity, which is an implementation detail and
        # triggers a SyntaxWarning on Python 3.8+.
        if not pid[0].isdecimal() or pid == '1' or pid == self_pid:
            continue
        try:
            oom_score_r = int(rline1('/proc/' + pid + '/oom_score'))
            if oom_score_r > 9:
                uid_r = pid_to_uid(pid)
                if uid_r != '0':
                    stopped_list.append(pid)
                    print('Send SIGSTOP to {}, {}, {}...'.format(
                        pid, pid_to_name(pid), pid_to_cmdline(pid)[:40]))
                    os.kill(int(pid), SIGSTOP)
                    t2 = time()
        except (FileNotFoundError, ProcessLookupError):
            # the process exited in the meantime
            continue
    print('Stop time:', t2 - t1)
    stdout.flush()
    return stopped_list
def cont(stopped_list):
    """
    Send SIGCONT to every pid in stopped_list and print the elapsed time.
    stopped_list: list of str pids
    """
    print()
    print('Continue stopped processes...')
    started = time()
    for stopped_pid in stopped_list:
        print('Send SIGCONT to', [stopped_pid], pid_to_name(stopped_pid))
        try:
            os.kill(int(stopped_pid), SIGCONT)
        except (FileNotFoundError, ProcessLookupError):
            # the process is gone, there is nothing to resume
            pass
    finished = time()
    print('All cont time: ', finished - started)
def update_stat_dict_and_print(key):
    """
    Count one more occurrence of key in stat_dict and, when
    print_total_stat is set, print all counters collected so far.
    key: str description of the corrective action
    """
    stat_dict[key] = stat_dict.get(key, 0) + 1
    if not print_total_stat:
        return
    header = '{}\n\033[4mThe following corrective actions have been implemented in the last {}:\033[0m'.format(
        HR, format_time(time() - start_time))
    rows = ['\n- {}: {}'.format(action, count)
            for action, count in stat_dict.items()]
    print(header + ''.join(rows))
def psi_mem_some_avg_total():
    """
    Read the value after the last '=' on the first line of psi_path.
    returns float, or None when psi_support is false
    """
    if not psi_support:
        return None
    first_line = rline1(psi_path)
    return float(first_line.rpartition('=')[2])
def psi_mem_some_avg10():
    """
    Read the second space-separated field of the first line of psi_path
    and return the part after its '=' as float.
    """
    fields = rline1(psi_path).split(' ')
    return float(fields[1].split('=')[1])
def check_mem():
    """
    Find mem_available.
    returns int value of the third line of /proc/meminfo (MemAvailable),
    or None if the file has fewer than three lines
    """
    # MemAvailable sits at index 2 of /proc/meminfo (the start-up code checks
    # exactly that); the first line is MemTotal, so reading only the first
    # line would report the wrong figure.
    with open('/proc/meminfo') as f:
        for n, line in enumerate(f):
            if n == 2:
                return int(line.split(':')[1].strip(' kB\n'))
def check_mem_and_swap():
    """
    Find mem_available, swap_total, swap_free.
    returns tuple of int read from /proc/meminfo: line 2 and the lines at
    swap_total_index and swap_free_index
    """
    with open('/proc/meminfo') as f:
        for n, line in enumerate(f):
            # Line numbers are compared with ==: identity (`is`) on ints only
            # works by accident of CPython's small-integer cache.
            if n == 2:
                mem_available = int(line.split(':')[1].strip(' kB\n'))
            elif n == swap_total_index:
                swap_total = int(line.split(':')[1].strip(' kB\n'))
            elif n == swap_free_index:
                swap_free = int(line.split(':')[1].strip(' kB\n'))
                # nothing below SwapFree is needed
                break
    return mem_available, swap_total, swap_free
def check_zram():
    """
    Find MemUsedZram.
    returns float: summed mem_used_total of all zram devices plus an
    estimated overhead proportional to their disksize, divided by 1024
    """
    # Means that setting zram disksize = 1 GiB reduces the available memory
    # by 0.0042 GiB.
    # Found experimentally, needs to be refined with different kernels and
    # architectures.
    # On small disksizes (up to a gigabyte) it may be higher, up to 0.0045.
    # The author of the zram module states that ZRAM_DISKSIZE_FACTOR should
    # be 0.001:
    # ("zram uses about 0.1% of the size of the disk"
    # - https://www.kernel.org/doc/Documentation/blockdev/zram.txt),
    # but this statement contradicts the experimental data.
    # ZRAM_DISKSIZE_FACTOR = deltaMemAvailable / disksize
    # found experimentally
    ZRAM_DISKSIZE_FACTOR = 0.0042

    total_disksize = 0
    total_mem_used = 0
    for device in os.listdir('/sys/block'):
        if not device.startswith('zram'):
            continue
        disksize, mem_used_total = zram_stat(device)
        total_disksize += int(disksize)
        total_mem_used += int(mem_used_total)
    return (total_mem_used + total_disksize * ZRAM_DISKSIZE_FACTOR) / 1024.0
def format_time(t):
    """
    Format a duration in seconds as 'S sec', 'M min S sec' or
    'H h M min S sec'; the fractional part is dropped.
    """
    total = int(t)
    if total < 60:
        return '{} sec'.format(total)
    if total < 3600:
        minutes, seconds = divmod(total, 60)
        return '{} min {} sec'.format(minutes, seconds)
    hours, remainder = divmod(total, 3600)
    minutes, seconds = divmod(remainder, 60)
    return '{} h {} min {} sec'.format(hours, minutes, seconds)
def string_to_float_convert_test(string):
    """Try to interpret a string value as a float; None if it is not one."""
    try:
        number = float(string)
    except ValueError:
        number = None
    return number
def string_to_int_convert_test(string):
    """Try to interpret a string value as an integer; None if it is not one."""
    try:
        number = int(string)
    except ValueError:
        number = None
    return number
def conf_parse_string(param):
    """
    Get string parameters from the config dict.
    param: config_dict key
    returns config_dict[param].strip(); exits if the key is absent
    """
    if param not in config_dict:
        print('All the necessary parameters must be in the config')
        print('There is no "{}" parameter in the config'.format(param))
        exit()
    return config_dict[param].strip()
def conf_parse_bool(param):
    """
    Get bool parameters from the config_dict.
    param: config_dict key
    returns bool
    Exits the program when the parameter is missing or when its value is
    neither 'True' nor 'False'.
    """
    if param not in config_dict:
        print('All the necessary parameters must be in the config')
        # Report the key: no value has been read on this path.
        print('There is no "{}" parameter in the config'.format(param))
        exit()
    param_str = config_dict[param]
    if param_str == 'True':
        return True
    if param_str == 'False':
        return False
    # Name the offending parameter rather than echoing its value.
    print('Invalid value of the "{}" parameter.'.format(param))
    print('Valid values are True and False.')
    print('Exit')
    exit()
def rline1(path):
    """
    Read the 1st line from path.
    returns the line without its last character (normally the newline),
    or None if the file is empty
    """
    with open(path) as f:
        first_line = next(f, None)
    if first_line is None:
        return None
    return first_line[:-1]
def kib_to_mib(num):
    """Convert a KiB value to MiB, rounded to an integer."""
    mib = num / 1024.0
    return round(mib)
def percent(num):
    """Interpret num as a fraction and return it as a percentage (1 decimal)."""
    scaled = num * 100
    return round(scaled, 1)
def just_percent_mem(num):
    """Convert num to percent and right-justify it to a width of 4."""
    text = str(round(num * 100, 1))
    return '{:>4}'.format(text)
def just_percent_swap(num):
    """Convert num to percent and right-justify it to a width of 5."""
    text = str(round(num * 100, 1))
    return '{:>5}'.format(text)
def human(num, lenth):
    """Convert a KiB value to rounded MiB, right-aligned to lenth columns."""
    mib_text = str(round(num / 1024))
    return mib_text.rjust(lenth, ' ')
def zram_stat(zram_id):
    """
    Get zram state.
    zram_id: str zram block-device id
    returns tuple of str (disksize, mem_used_total), values in bytes;
    ('0', '0') if the device has no disksize file or its disksize is 0
    """
    device_dir = '/sys/block/' + zram_id
    try:
        disksize = rline1(device_dir + '/disksize')
    except FileNotFoundError:
        return '0', '0'
    # rline1() returns a str without the newline, so the comparison must be
    # against '0'; comparing with the list ['0\n'] could never be true.
    if disksize == '0':
        return '0', '0'
    try:
        # split() without arguments drops the empty fields produced by the
        # padding between the columns of mm_stat.
        mem_used_total = rline1(device_dir + '/mm_stat').split()[2]
    except FileNotFoundError:
        # fall back to the separate mem_used_total file
        mem_used_total = rline1(device_dir + '/mem_used_total')
    return disksize, mem_used_total  # BYTES, str
def pid_to_name(pid):
    """
    Get process name by pid.
    pid: str pid of required process
    returns string process_name, '' if the process does not exist
    """
    status_path = '/proc/' + pid + '/status'
    try:
        with open(status_path) as status:
            # skip the first 6 characters of the file
            # (presumably the 'Name:' label and the tab after it)
            status.seek(6)
            for first_line in status:
                return first_line[:-1]
    except (FileNotFoundError, ProcessLookupError):
        return ''
    except UnicodeDecodeError:
        # the name is not valid UTF-8: read raw bytes and drop what cannot
        # be decoded
        with open(status_path, 'rb') as status:
            status.seek(6)
            raw_name = status.read(15)
        return raw_name.decode('utf-8', 'ignore').partition('\n')[0]
def pid_to_cmdline(pid):
    """
    Get process cmdline by pid.
    pid: str pid of required process
    returns string cmdline with NUL separators replaced by spaces
    """
    with open('/proc/' + pid + '/cmdline') as cmdline_file:
        raw = cmdline_file.read()
    return raw.replace('\x00', ' ').rstrip()
def pid_to_environ(pid):
    """
    Get process environment by pid.
    pid: str pid of required process
    returns string environ with NUL separators replaced by newlines
    """
    with open('/proc/' + pid + '/environ') as environ_file:
        raw = environ_file.read()
    return raw.replace('\x00', '\n').rstrip()
def pid_to_uid(pid):
    """
    Return euid.
    pid: str pid of required process
    returns str: third tab-separated field of the status line at uid_index,
    or None if the file has no such line
    """
    status_path = '/proc/' + pid + '/status'
    try:
        with open(status_path) as f:
            for n, line in enumerate(f):
                # == instead of `is`: identity comparison of ints only works
                # thanks to CPython's small-integer cache
                if n == uid_index:
                    return line.split('\t')[2]
    except UnicodeDecodeError:
        # the file contains bytes that are not valid UTF-8 (e.g. in the
        # process name): decode leniently and index the line directly
        with open(status_path, 'rb') as f:
            f_list = f.read().decode('utf-8', 'ignore').split('\n')
        return f_list[uid_index].split('\t')[2]
def notify_send_wait(title, body):
    """
    GUI notifications with UID != 0.
    Runs notify-send and waits at most wait_time for it; the child is
    killed if the timeout expires.
    """
    command = ['notify-send', '--icon=dialog-warning', title, body]
    with Popen(command) as notifier:
        try:
            notifier.wait(timeout=wait_time)
        except TimeoutExpired:
            notifier.kill()
            print('TimeoutExpired: notify-send {} {}'.format(title, body))
def notify_helper(title, body):
    """
    GUI notification with UID = 0.
    Runs the helper at notify_helper_path and waits at most wait_time for
    it; the child is killed if the timeout expires.
    """
    command = [notify_helper_path, title, body]
    with Popen(command) as helper:
        try:
            helper.wait(timeout=wait_time)
        except TimeoutExpired:
            helper.kill()
            message = 'TimeoutExpired: nohang_notify_helper: {} {}'.format(
                title, body)
            print(message)
def send_notify_warn():
    """
    Look for process with maximum 'badness' and warn user with notification.
    (implement Low memory warnings)
    """
    if stop_cont_warn:
        stopped_list = stop()

    # find process with max badness
    pid = fattest()[0]
    name = pid_to_name(pid)

    # Fractions shown in the (currently disabled) detailed title: available
    # memory, then free swap, then zram usage, depending on what is in use.
    if mem_used_zram > 0:
        shares = (
            mem_available / mem_total,
            swap_free / (swap_total + 0.1),
            mem_used_zram / mem_total,
        )
    elif swap_free > 0:
        shares = (
            mem_available / mem_total,
            swap_free / (swap_total + 0.1),
        )
    else:
        shares = (mem_available / mem_total,)
    low_mem_percent = ' '.join(
        '{}%'.format(round(share * 100)) for share in shares)

    # title = 'Low memory: {}'.format(low_mem_percent)
    title = 'Low memory'

    # symbol '&' can break notifications in some themes,
    # therefore it is replaced by '*'
    safe_name = name.replace('&', '*')
    body = 'Hog: <b>{}</b> [{}]'.format(safe_name, pid)

    # root: notify all active users with the special script;
    # regular user: notify the user that runs this nohang
    notify = notify_helper if root else notify_send_wait
    notify(title, body)

    if stop_cont_warn:
        cont(stopped_list)
def send_notify(signal, name, pid):
    """
    Notify about OOM prevention.
    signal: key for notify_sig_dict
    name: str process name
    pid: str process pid
    """
    title = 'Hang prevention'

    # symbol '&' can break notifications in some themes,
    # therefore it is replaced by '*'
    safe_name = name.replace('&', '*')
    body = '<b>{} {}</b> [{}]'.format(notify_sig_dict[signal], safe_name, pid)

    # root: notify all active users through the helper;
    # regular user: notify the user that runs this nohang
    notify = notify_helper if root else notify_send_wait
    notify(title, body)
def send_notify_etc(pid, name, command):
    """
    Notify about OOM prevention done by running a command.
    command: str command that will be executed
    name: str process name
    pid: str process pid
    """
    title = 'Hang prevention'

    # '&' is replaced by '*' in both the name and the command
    safe_name = name.replace('&', '*')
    safe_command = command.replace('&', '*')
    body = 'Victim is process <b>{}</b> [{}]\nExecute the command:\n<b>{}</b>'.format(
        safe_name, pid, safe_command)

    # root: notify all active users through the helper;
    # regular user: notify the user that runs this nohang
    notify = notify_helper if root else notify_send_wait
    notify(title, body)
def sleep_after_send_signal(signal):
    """
    Sleeping after signal was sent.
    signal: sent signal; SIGKILL uses min_delay_after_sigkill, anything
    else uses min_delay_after_sigterm
    """
    if signal is SIGKILL:
        delay = min_delay_after_sigkill
    else:
        delay = min_delay_after_sigterm
    if print_sleep_periods:
        print(' sleep', delay)
    sleep(delay)
def fattest():
    """
    Find the process with highest badness.
    returns tuple (pid, badness): str pid and int oom_score of that process
    raises IndexError if no candidate process was found
    -> find_mem_hog() or find_victim()
    """
    pid_badness_list = []
    for pid in os.listdir('/proc'):
        # only entries whose names start with a digit, except /proc/1/ and
        # the daemon itself. Strings are compared with ==; `is` would test
        # object identity, which is an implementation detail.
        if not pid[0].isdecimal() or pid == '1' or pid == self_pid:
            continue
        # skip entries whose exe link cannot be resolved
        if not os.path.exists('/proc/' + pid + '/exe'):
            continue
        try:
            badness = int(rline1('/proc/' + pid + '/oom_score'))
        except (FileNotFoundError, ProcessLookupError):
            # the process exited while it was being inspected
            continue
        pid_badness_list.append((pid, badness))

    if not pid_badness_list:
        # same exception type that indexing an empty sorted list produced
        raise IndexError('no candidate processes found in /proc')

    # max() returns the first of several equally bad processes, exactly as
    # taking element 0 of a stable descending sort would.
    pid, victim_badness = max(pid_badness_list, key=itemgetter(1))
    return pid, victim_badness
# Debug/benchmark harness: time a single fattest() scan, print the elapsed
# time and the result, then terminate the program.
# NOTE(review): because of the exit() call nothing below this point in the
# file is executed -- confirm this is intended before shipping.
t0 = time()
x = fattest()
t1 = time()
print(t1 - t0)
print(x)
exit()
def _read_status_lines(pid):
    """Return the lines of /proc/<pid>/status with line ends removed."""
    path = '/proc/' + pid + '/status'
    try:
        with open(path) as f:
            return f.read().split('\n')
    except UnicodeDecodeError:
        # the file contains bytes that are not valid UTF-8: decode leniently
        with open(path, 'rb') as f:
            return f.read().decode('utf-8', 'ignore').split('\n')


def _status_mib(status_lines, index):
    """Convert the 'Key:<TAB><number> kB' status line at index to MiB."""
    return kib_to_mib(int(status_lines[index].split('\t')[1][:-3]))


def _victim_died(reason):
    """Report that the victim vanished while it was being inspected."""
    message = 'The victim died in the search process: ' + reason
    print(mem_info)
    print(message)
    update_stat_dict_and_print(message)


def _describe_victim(pid, name, victim_badness):
    """Build the coloured multi-line description of the victim process."""
    status_lines = _read_status_lines(pid)
    # NOTE(review): this takes the first number of the Uid line while
    # pid_to_uid() takes the second one -- confirm which is intended.
    uid = status_lines[uid_index].split('\t')[1]
    vm_size = _status_mib(status_lines, vm_size_index)
    vm_rss = _status_mib(status_lines, vm_rss_index)
    vm_swap = _status_mib(status_lines, vm_swap_index)

    with open('/proc/' + pid + '/cmdline') as file:
        cmdline = file.readlines()[0].replace('\x00', ' ')
    oom_score = rline1('/proc/' + pid + '/oom_score')
    oom_score_adj = rline1('/proc/' + pid + '/oom_score_adj')

    len_vm = len(str(vm_size))

    if detailed_rss:
        anon_rss = _status_mib(status_lines, anon_index)
        file_rss = _status_mib(status_lines, file_index)
        shmem_rss = _status_mib(status_lines, shmem_index)
        environ = pid_to_environ(pid)
        return '\033[4mFound a process with highest badness:\033[0m' \
            '\n Name: \033[33m{}\033[0m' \
            '\n PID: \033[33m{}\033[0m' \
            '\n UID: \033[33m{}\033[0m' \
            '\n badness: \033[33m{}\033[0m, ' \
            'oom_score: \033[33m{}\033[0m, ' \
            'oom_score_adj: \033[33m{}\033[0m' \
            '\n VmSize: \033[33m{}\033[0m MiB' \
            '\n VmRSS: \033[33m{}\033[0m MiB (' \
            'Anon: \033[33m{}\033[0m MiB, ' \
            'File: \033[33m{}\033[0m MiB, ' \
            'Shmem: \033[33m{}\033[0m MiB)' \
            '\n VmSwap: \033[33m{}\033[0m MiB' \
            '\n environ:\n\033[33m{}\033[0m' \
            '\n cmdline: \033[33m{}\033[0m'.format(
                name,
                pid,
                uid,
                victim_badness,
                oom_score,
                oom_score_adj,
                vm_size,
                str(vm_rss).rjust(len_vm),
                anon_rss,
                file_rss,
                shmem_rss,
                str(vm_swap).rjust(len_vm),
                environ,
                cmdline
            )

    return '\033[4mFound a process with highest badness:\033[0m' \
        '\n Name: \033[33m{}\033[0m' \
        '\n PID: \033[33m{}\033[0m' \
        '\n UID: \033[33m{}\033[0m' \
        '\n Badness: \033[33m{}\033[0m, ' \
        'oom_score: \033[33m{}\033[0m, ' \
        'oom_score_adj: \033[33m{}\033[0m' \
        '\n VmSize: \033[33m{}\033[0m MiB' \
        '\n VmRSS: \033[33m{}\033[0m MiB' \
        '\n VmSwap: \033[33m{}\033[0m MiB' \
        '\n CmdLine: \033[33m{}\033[0m'.format(
            name,
            pid,
            uid,
            victim_badness,
            oom_score,
            oom_score_adj,
            vm_size,
            str(vm_rss).rjust(len_vm),
            str(vm_swap).rjust(len_vm),
            cmdline)


def _correct_victim(signal):
    """
    Pick the victim and apply the corrective action to it.
    returns False if the victim died while being inspected, True otherwise
    """
    pid, victim_badness = fattest()
    name = pid_to_name(pid)

    if victim_badness < min_badness:
        response_time = time() - time0
        print(mem_info)
        victim_badness_is_too_small = 'victim badness {} < min_b' \
            'adness {}; nothing to do; response time: {} ms'.format(
                victim_badness,
                min_badness,
                round(response_time * 1000))
        print(victim_badness_is_too_small)
        update_stat_dict_and_print('victim badness < min_badness')
        return True

    # Collect VmRSS, VmSwap, cmdline etc. of the victim. Every read can fail
    # if the process exits in the meantime.
    try:
        victim_info = _describe_victim(pid, name, victim_badness)
    except FileNotFoundError:
        _victim_died('FileNotFoundError')
        return False
    except ProcessLookupError:
        _victim_died('ProcessLookupError')
        return False
    except IndexError:
        _victim_died('IndexError')
        return False
    except ValueError:
        _victim_died('ValueError')
        return False

    if execute_the_command and signal is SIGTERM and name in etc_dict:
        command = etc_dict[name]
        # Resolve the placeholders once, so that the command that is shown
        # is exactly the command that was executed.
        resolved_command = command.replace(
            '$PID', pid).replace('$NAME', pid_to_name(pid))
        exit_status = os.system(resolved_command)
        if exit_status == 0:
            exit_status = '\033[32m0\033[0m'
        else:
            exit_status = '\033[31m{}\033[0m'.format(exit_status)
        response_time = time() - time0
        etc_info = '{}' \
            '\n\033[4mImplement corrective action:\033[0m\n Run the command: \033[4m{}\033[0m' \
            '\n Exit status: {}; response time: {} ms'.format(
                victim_info, resolved_command, exit_status,
                round(response_time * 1000))
        print(mem_info)
        print(etc_info)
        key = "Run the command '\033[35m{}\033[0m'".format(command)
        update_stat_dict_and_print(key)
        if gui_notifications:
            send_notify_etc(pid, name, resolved_command)
        return True

    try:
        m = check_mem_and_swap()
        ma = round(int(m[0]) / 1024.0)
        sf = round(int(m[2]) / 1024.0)
        print('\nMemory status before sending a signal:\nMemA'
              'v: {} MiB, SwFree: {} MiB'.format(ma, sf))
        if stop_cont:
            # the victim was paused with the others: wake it up so that it
            # can act on the signal
            os.kill(int(pid), SIGCONT)
        os.kill(int(pid), signal)
        response_time = time() - time0
        send_result = '\033[32mOK\033[0m; response time: {} ms'.format(
            round(response_time * 1000))
        key = 'Send \033[35m{}\033[0m to \033[35m{}\033[0m'.format(
            sig_dict[signal], name)
        if gui_notifications:
            send_notify(signal, name, pid)
    except FileNotFoundError:
        response_time = time() - time0
        send_result = 'no such process; response time: {} ms'.format(
            round(response_time * 1000))
        key = 'The victim died in the search process: FileNotFoundError'
    except ProcessLookupError:
        response_time = time() - time0
        send_result = 'no such process; response time: {} ms'.format(
            round(response_time * 1000))
        key = 'The victim died in the search process: ProcessLookupError'

    # Built after the try block so that it exists on the failure paths too.
    preventing_oom_message = '{}' \
        '\n\033[4mImplement a corrective action:\033[0m\n ' \
        'Sending \033[4m{}\033[0m to the victim; {}'.format(
            victim_info, sig_dict[signal], send_result)

    print(mem_info)
    try:
        print(pid_to_state(pid))
    except (FileNotFoundError, ProcessLookupError, IndexError):
        # the victim is already gone, so there is no state to show
        pass
    print(preventing_oom_message)
    update_stat_dict_and_print(key)
    return True


def find_victim_and_send_signal(signal):
    """
    Find victim with highest badness and send SIGTERM/SIGKILL
    -> implement_corrective_action()

    signal: signal to send to the victim
    returns None; the sleep after the action is skipped if the victim died
    while it was being inspected
    """
    stopped_list = stop() if stop_cont else []
    try:
        if not _correct_victim(signal):
            return None
    finally:
        # Resume whatever was paused on every path, including the early
        # return above and unexpected errors.
        if stop_cont:
            cont(stopped_list)
    sleep_after_send_signal(signal)
def sleep_after_check_mem():
    """
    Sleep for a period computed from the remaining memory/swap/zram
    headroom and the configured rates, limited to the range
    [min_sleep_time, max_sleep_time].
    """
    # headroom above the larger of the SIGTERM/SIGKILL thresholds
    mem_point = mem_available - max(mem_min_sigkill_kb, mem_min_sigterm_kb)
    swap_point = swap_free - max(swap_min_sigkill_kb, swap_min_sigterm_kb)

    t_mem = mem_point / rate_mem
    t_swap = swap_point / rate_swap
    t_zram = (mem_total * 0.9 - mem_used_zram) / rate_zram

    # the shorter of the two combined estimates wins
    t = min(t_mem + t_swap, t_mem + t_zram)

    if t > max_sleep_time:
        t = max_sleep_time
    elif t < min_sleep_time:
        t = min_sleep_time

    try:
        if print_sleep_periods:
            print('sleep', round(t, 2))
            # ' (t_mem={}, t_swap={}, t_zram={})'.format(
            # round(t_mem, 2),
            # round(t_swap, 2),
            # round(t_zram, 2)))
        stdout.flush()
        sleep(t)
    except KeyboardInterrupt:
        exit()
def calculate_percent(arg_key):
    """
    Parse a memory threshold from the config dict.

    The value must end with '%' (share of mem_total) or 'M' (MiB).
    arg_key: str key for config_dict
    returns tuple (mem_min_kb, mem_min_mb, mem_min_percent)
    Exits the program if the key is missing or the value is invalid.
    """
    if arg_key not in config_dict:
        print('{} not in config\nExit'.format(arg_key))
        # actually exit, as the message says; falling through would only
        # crash on the unbound return values
        exit()

    mem_min = config_dict[arg_key]

    if mem_min.endswith('%'):
        # truncate the percent sign, then 'float test'
        mem_min_percent = string_to_float_convert_test(mem_min[:-1].strip())
        if mem_min_percent is None:
            print('Invalid {} value, not float\nExit'.format(arg_key))
            exit()
        # Final validations...
        if mem_min_percent < 0 or mem_min_percent > 100:
            print(
                '{}, as percents value, out of range [0; 100]\nExit'.format(arg_key))
            exit()
        # the percentage is clean and valid, translate it into KiB
        mem_min_kb = mem_min_percent / 100 * mem_total
        mem_min_mb = round(mem_min_kb / 1024)
    elif mem_min.endswith('M'):
        mem_min_mb = string_to_float_convert_test(mem_min[:-1].strip())
        if mem_min_mb is None:
            print('Invalid {} value, not float\nExit'.format(arg_key))
            exit()
        mem_min_kb = mem_min_mb * 1024
        if mem_min_kb > mem_total:
            print(
                '{} value can not be greater then MemTotal ({} MiB)\nExit'.format(
                    arg_key, round(
                        mem_total / 1024)))
            exit()
        mem_min_percent = mem_min_kb / mem_total * 100
    else:
        print('Invalid {} units in config.\n Exit'.format(arg_key))
        exit()

    return mem_min_kb, mem_min_mb, mem_min_percent
##########################################################################
# find mem_total
# find positions of SwapFree and SwapTotal in /proc/meminfo
with open('/proc/meminfo') as f:
    mem_list = f.readlines()

# field names of /proc/meminfo, in file order
mem_list_names = [entry.split(':')[0] for entry in mem_list]

if mem_list_names[2] != 'MemAvailable':
    print('Your Linux kernel is too old, Linux 3.14+ requied\nExit')
    exit()

swap_total_index = mem_list_names.index('SwapTotal')
swap_free_index = swap_total_index + 1

mem_total = int(mem_list[0].split(':')[1].strip(' kB\n'))

# Get names from /proc/*/status to be able to get VmRSS and VmSwap values
with open('/proc/self/status') as file:
    status_list = file.readlines()

# field names of /proc/self/status, in file order
status_names = [entry.split(':')[0] for entry in status_list]

ppid_index = status_names.index('PPid')
vm_size_index = status_names.index('VmSize')
vm_rss_index = status_names.index('VmRSS')
vm_swap_index = status_names.index('VmSwap')
uid_index = status_names.index('Uid')

# The per-type RSS fields are optional; remember whether they exist.
try:
    anon_index = status_names.index('RssAnon')
    file_index = status_names.index('RssFile')
    shmem_index = status_names.index('RssShmem')
except ValueError:
    detailed_rss = False
    # print('It is not Linux 4.5+')
else:
    detailed_rss = True
    # print(detailed_rss, 'detailed_rss')
##########################################################################
# Configurations
# (disabled: cd = os.getcwd())
config = '/etc/nohang/nohang.conf'
# config = 'nohang.conf'
print('Config:', config)

# Text printed after the exception name when the config cannot be read.
conf_err_mess = 'Invalid config. Exit.'

##########################################################################
# parsing the config with obtaining the parameters dictionary
# conf_parameters_dict
# conf_restart_dict

# dictionary with config options
config_dict = dict()
processname_re_list = []
cmdline_re_list = []
uid_re_list = []

# dictionary with names and commands for the parameter
# execute_the_command
# a list is needed here too, not a dictionary
etc_dict = dict()

# will store corrective actions stat
stat_dict = dict()

try:
    with open(config) as f:
        for line in f:
            # comments, empty lines and indented lines carry no settings
            if line.startswith(('#', '\n', '\t', ' ')):
                continue

            if line.startswith('$ETC'):
                a = line[4:].split('///')
                etc_name = a[0].strip()
                # a line without '///' raises IndexError, handled below
                etc_command = a[1].strip()
                if len(etc_name) > 15:
                    print('Invalid config, the length of the process '
                          'name must not exceed 15 characters\nExit')
                    exit()
                etc_dict[etc_name] = etc_command

            # NEED VALIDATION!
            elif line.startswith('@PROCESSNAME_RE'):
                a = line.partition('@PROCESSNAME_RE')[
                    2].strip(' \n').partition('///')
                processname_re_list.append((a[0].strip(' '), a[2].strip(' ')))

            elif line.startswith('@CMDLINE_RE'):
                a = line.partition('@CMDLINE_RE')[2].strip(
                    ' \n').partition('///')
                cmdline_re_list.append((a[0].strip(' '), a[2].strip(' ')))

            elif line.startswith('@UID_RE'):
                a = line.partition('@UID_RE')[2].strip(' \n').partition('///')
                uid_re_list.append((a[0].strip(' '), a[2].strip(' ')))

            else:
                # plain 'key = value' option; the '@...' lines above are
                # deliberately kept out of config_dict
                a = line.partition('=')
                config_dict[a[0].strip()] = a[2].strip()

except PermissionError:
    print('PermissionError', conf_err_mess)
    exit()
except UnicodeDecodeError:
    print('UnicodeDecodeError', conf_err_mess)
    exit()
except IsADirectoryError:
    print('IsADirectoryError', conf_err_mess)
    exit()
except IndexError:
    print('IndexError', conf_err_mess)
    exit()
except FileNotFoundError:
    print('FileNotFoundError', conf_err_mess)
    exit()
# print(processname_re_list)
# print(cmdline_re_list)
# print(uid_re_list)
##########################################################################
# extracting parameters from the dictionary
# check for all necessary parameters
# validation of all parameters
def _config_value(name):
    """Return the raw config string for name; exit if it is not set."""
    if name not in config_dict:
        print('{} not in config\nExit'.format(name))
        exit()
    return config_dict[name]


def _config_number(name, convert, type_name, out_of_range, range_message):
    """
    Read a numeric option; exit with a message if it is missing, is not
    convertible by convert, or out_of_range(number) is true.
    """
    number = convert(_config_value(name))
    if number is None:
        print('Invalid {} value, not {}\nExit'.format(name, type_name))
        exit()
    if out_of_range(number):
        print(range_message)
        exit()
    return number


# boolean switches
print_config = conf_parse_bool('print_config')
print_mem_check_results = conf_parse_bool('print_mem_check_results')
print_sleep_periods = conf_parse_bool('print_sleep_periods')
gui_low_memory_warnings = conf_parse_bool('gui_low_memory_warnings')
gui_notifications = conf_parse_bool('gui_notifications')
decrease_oom_score_adj = conf_parse_bool('decrease_oom_score_adj')
execute_the_command = conf_parse_bool('execute_the_command')
ignore_psi = conf_parse_bool('ignore_psi')
regex_matching = conf_parse_bool('regex_matching')
re_match_cmdline = conf_parse_bool('re_match_cmdline')
re_match_uid = conf_parse_bool('re_match_uid')

# the regex machinery is imported only when some matching is enabled
if regex_matching or re_match_cmdline or re_match_uid:
    from re import search
    import sre_constants

# memory and zram thresholds: (KiB, MiB, percent) triples
(mem_min_sigterm_kb,
 mem_min_sigterm_mb,
 mem_min_sigterm_percent) = calculate_percent('mem_min_sigterm')
(mem_min_sigkill_kb,
 mem_min_sigkill_mb,
 mem_min_sigkill_percent) = calculate_percent('mem_min_sigkill')
(zram_max_sigterm_kb,
 zram_max_sigterm_mb,
 zram_max_sigterm_percent) = calculate_percent('zram_max_sigterm')
(zram_max_sigkill_kb,
 zram_max_sigkill_mb,
 zram_max_sigkill_percent) = calculate_percent('zram_max_sigkill')
(mem_min_warnings_kb,
 mem_min_warnings_mb,
 mem_min_warnings_percent) = calculate_percent('mem_min_warnings')
(zram_max_warnings_kb,
 zram_max_warnings_mb,
 zram_max_warnings_percent) = calculate_percent('zram_max_warnings')

# rates: strictly positive floats
rate_mem = _config_number(
    'rate_mem', string_to_float_convert_test, 'float',
    lambda v: v <= 0, 'rate_mem MUST be > 0\nExit')
rate_swap = _config_number(
    'rate_swap', string_to_float_convert_test, 'float',
    lambda v: v <= 0, 'rate_swap MUST be > 0\nExit')
rate_zram = _config_number(
    'rate_zram', string_to_float_convert_test, 'float',
    lambda v: v <= 0, 'rate_zram MUST be > 0\nExit')

# swap thresholds are kept as strings here and parsed later
swap_min_sigterm = _config_value('swap_min_sigterm')
swap_min_sigkill = _config_value('swap_min_sigkill')

# delays: floats that must not be negative
min_delay_after_sigterm = _config_number(
    'min_delay_after_sigterm', string_to_float_convert_test, 'float',
    lambda v: v < 0, 'min_delay_after_sigterm must be positiv\nExit')
min_delay_after_sigkill = _config_number(
    'min_delay_after_sigkill', string_to_float_convert_test, 'float',
    lambda v: v < 0, 'min_delay_after_sigkill must be positive\nExit')
psi_avg10_sleep_time = _config_number(
    'psi_avg10_sleep_time', string_to_float_convert_test, 'float',
    lambda v: v < 0, 'psi_avg10_sleep_time must be positive\nExit')

# PSI thresholds: floats within [0; 100]
sigkill_psi_avg10 = _config_number(
    'sigkill_psi_avg10', string_to_float_convert_test, 'float',
    lambda v: v < 0 or v > 100,
    'sigkill_psi_avg10 must be in the range [0; 100]\nExit')
sigterm_psi_avg10 = _config_number(
    'sigterm_psi_avg10', string_to_float_convert_test, 'float',
    lambda v: v < 0 or v > 100,
    'sigterm_psi_avg10 must be in the range [0; 100]\nExit')

# badness limits: integers within [0; 1000]
min_badness = _config_number(
    'min_badness', string_to_int_convert_test, 'integer',
    lambda v: v < 0 or v > 1000, 'Invalud min_badness value\nExit')
oom_score_adj_max = _config_number(
    'oom_score_adj_max', string_to_int_convert_test, 'integer',
    lambda v: v < 0 or v > 1000, 'Invalid oom_score_adj_max value\nExit')

min_time_between_warnings = _config_number(
    'min_time_between_warnings', string_to_float_convert_test, 'float',
    lambda v: v < 1 or v > 300,
    'min_time_between_warnings value out of range [1; 300]\nExit')

swap_min_warnings = _config_value('swap_min_warnings')
##########################################################################
# Get KiB levels if it's possible.
# Take the KiB value; if it is not given in KiB, it is a percentage. For a
# percentage, the KiB value is computed later from SwapTotal and the percent.
def get_swap_threshold_tuple(string):
    """Parse a swap threshold taken from the config.

    Accepted forms:
        '<number>%' - a percentage, must lie within [0; 100];
        '<number>M' - an absolute amount in MiB, must not be negative.

    Returns a tuple (value, is_percent): (percent, True) for the '%' form,
    (amount converted to KiB, False) for the 'M' form.

    On any invalid input (unknown unit, non-numeric, NaN/infinite or
    out-of-range number) a message is printed and exit() is called.
    """
    from math import isfinite

    number = string[:-1]

    if string.endswith('%'):
        if string_to_float_convert_test(number) is None:
            print('somewhere swap unit is not float_%')
            exit()
        value = float(number.strip())
        # Chained comparison so that NaN (false against both bounds) is
        # rejected too instead of being returned as a valid percentage.
        if not 0 <= value <= 100:
            print('invalid value, must be from the range[0; 100] %')
            exit()
        return value, True

    if string.endswith('M'):
        if string_to_float_convert_test(number) is None:
            print('somewhere swap unit is not float_M')
            exit()
        value = float(number.strip()) * 1024  # MiB -> KiB
        # NaN and infinity parse as floats, but comparing against them gives
        # a constant result, which would silently break the threshold.
        if not isfinite(value):
            print('invalid unit in config (value is not a finite number)')
            exit()
        if value < 0:
            print('invalid unit in config (negative value)')
            exit()
        return value, False

    print('Invalid config file. There are invalid units somewhere\nExit')
    exit()
# Parse the three swap thresholds. Each parsed pair is (value, is_percent).
swap_min_sigterm_tuple = get_swap_threshold_tuple(swap_min_sigterm)
swap_min_sigkill_tuple = get_swap_threshold_tuple(swap_min_sigkill)
swap_min_warnings_tuple = get_swap_threshold_tuple(swap_min_warnings)

# For every threshold exactly one of the two names gets defined:
# *_percent when the config used '%', *_kb when it used an absolute amount.

# SIGTERM threshold
swap_term_is_percent = swap_min_sigterm_tuple[1]
if not swap_term_is_percent:
    swap_min_sigterm_kb = swap_min_sigterm_tuple[0]
else:
    swap_min_sigterm_percent = swap_min_sigterm_tuple[0]

# SIGKILL threshold
swap_kill_is_percent = swap_min_sigkill_tuple[1]
if not swap_kill_is_percent:
    swap_min_sigkill_kb = swap_min_sigkill_tuple[0]
else:
    swap_min_sigkill_percent = swap_min_sigkill_tuple[0]

# Low-memory warning threshold
swap_warn_is_percent = swap_min_warnings_tuple[1]
if not swap_warn_is_percent:
    swap_min_warnings_kb = swap_min_warnings_tuple[0]
else:
    swap_min_warnings_percent = swap_min_warnings_tuple[0]
##########################################################################
# Optionally echo the effective settings, grouped under numbered headings.
if print_config:
    print(
        '\n1. Memory levels to respond to as an OOM threat\n[displaying these options need fix]\n')
    print('mem_min_sigterm: {} MiB, {} %'.format(
        round(mem_min_sigterm_mb), round(mem_min_sigterm_percent, 1)))
    print('mem_min_sigkill: {} MiB, {} %'.format(
        round(mem_min_sigkill_mb), round(mem_min_sigkill_percent, 1)))
    # The swap levels are shown as the raw config strings.
    print('swap_min_sigterm: {}'.format(swap_min_sigterm))
    print('swap_min_sigkill: {}'.format(swap_min_sigkill))
    print('zram_max_sigterm: {} MiB, {} %'.format(
        round(zram_max_sigterm_mb), round(zram_max_sigterm_percent, 1)))
    print('zram_max_sigkill: {} MiB, {} %'.format(
        round(zram_max_sigkill_mb), round(zram_max_sigkill_percent, 1)))

    print('\n2. The frequency of checking the level of available memory (and CPU usage)\n')
    print('rate_mem: {}'.format(rate_mem))
    print('rate_swap: {}'.format(rate_swap))
    print('rate_zram: {}'.format(rate_zram))

    print('\n3. The prevention of killing innocent victims\n')
    print('min_delay_after_sigterm: {}'.format(min_delay_after_sigterm))
    print('min_delay_after_sigkill: {}'.format(min_delay_after_sigkill))
    print('min_badness: {}'.format(min_badness))
    # NOTE: an "(OK)" suffix is not needed when the value is False.
    print('decrease_oom_score_adj: {}'.format(
        decrease_oom_score_adj
    ))
    # oom_score_adj_max is only shown when decrease_oom_score_adj is on.
    if decrease_oom_score_adj:
        print('oom_score_adj_max: {}'.format(oom_score_adj_max))

    # Heading typo fixed ("ir" -> "or").
    print('\n4. Impact on the badness of processes via matching their'
          ' names, cmdlines or UIDs with regular expressions\n')
    print('(todo)')

    print('\n5. The execution of a specific command instead of sending the\nSIGTERM signal\n')
    print('execute_the_command: {}'.format(execute_the_command))
    if execute_the_command:
        # One row per etc_dict entry: key padded to 15 columns, then value.
        print('\nPROCESS NAME COMMAND TO EXECUTE')
        for key in etc_dict:
            print('{} {}'.format(key.ljust(15), etc_dict[key]))

    print('\n6. GUI notifications:\n- OOM prevention results and\n- low memory warnings\n')
    print('gui_notifications: {}'.format(gui_notifications))
    print('gui_low_memory_warnings: {}'.format(gui_low_memory_warnings))
    # Warning levels are only shown when low-memory warnings are enabled.
    if gui_low_memory_warnings:
        print('min_time_between_warnings: {}'.format(min_time_between_warnings))
        print('mem_min_warnings: {} MiB, {} %'.format(
            round(mem_min_warnings_mb), round(mem_min_warnings_percent, 1)))
        print('swap_min_warnings: {}'.format(swap_min_warnings))
        print('zram_max_warnings: {} MiB, {} %'.format(
            round(zram_max_warnings_mb), round(zram_max_warnings_percent, 1)))

    print('\n7. Output verbosity\n')
    print('print_config: {}'.format(print_config))
    print('print_mem_check_results: {}'.format(print_mem_check_results))
    print('print_sleep_periods: {}\n'.format(print_sleep_periods))
##########################################################################
# Width of the memory and zram columns in the status line: the number of
# digits in mem_total / 1024, rounded.
mem_len = len(str(round(mem_total / 1024.0)))

# subprocess and the signal-to-verb table are only set up when some kind of
# GUI notification is enabled.
if gui_notifications or gui_low_memory_warnings:
    from subprocess import Popen, TimeoutExpired
    notify_sig_dict = {
        SIGKILL: 'Killing',
        SIGTERM: 'Terminating',
    }

# Scale the three rate_* settings by 1048576 (2 ** 20).
rate_mem *= 1048576
rate_swap *= 1048576
rate_zram *= 1048576

# Initial state of the low-memory warning timer.
warn_time_now = warn_timer = 0
warn_time_delta = 1000

print('Monitoring started!')
stdout.flush()
# Short aliases for the PSI settings.
psi_min_sleep_time_after_action = psi_avg10_sleep_time
sigkill_psi = sigkill_psi_avg10
sigterm_psi = sigterm_psi_avg10
##########################################################################
# With PSI in use, both "last action" timestamps start psi_avg10_sleep_time
# ahead of the current time.
if not ignore_psi and psi_support:
    kill_psi_t0 = psi_avg10_sleep_time + time()
    term_psi_t0 = psi_avg10_sleep_time + time()

# Prefix for the status line; empty by default.
avg_value = ''
# Main monitoring loop. Every pass samples PSI (when enabled), memory, swap
# and zram, then acts on the first exceeded threshold. The branches that send
# a signal do not sleep here; the warning and idle branches call
# sleep_after_check_mem().
while True:

    # PSI THRESHOLDS
    if psi_support and not ignore_psi:
        avg10 = psi_mem_some_avg10()

        if print_mem_check_results:
            avg_value = 'PSI mem some avg10: {} | '.format(str(avg10).rjust(6))

        # An action is allowed only once psi_min_sleep_time_after_action has
        # elapsed since the matching timestamp.
        # NOTE(review): time0 and mem_info are presumably read as globals by
        # find_victim_and_send_signal() - confirm against its definition.
        if avg10 >= sigkill_psi and \
                time() - kill_psi_t0 >= psi_min_sleep_time_after_action:
            time0 = time()
            mem_info = 'avg ({}) >= sigkill_psi ({})'.format(
                avg10, sigkill_psi)
            find_victim_and_send_signal(SIGKILL)
            kill_psi_t0 = time()
        elif avg10 >= sigterm_psi and \
                time() - term_psi_t0 >= psi_min_sleep_time_after_action:
            time0 = time()
            mem_info = 'avg ({}) >= sigterm_psi ({})'.format(
                avg10, sigterm_psi)
            find_victim_and_send_signal(SIGTERM)
            term_psi_t0 = time()

    mem_available, swap_total, swap_free = check_mem_and_swap()

    # The checks below work in KiB. Absolute thresholds were converted when
    # the config was parsed; percentage thresholds are converted here because
    # they depend on the SwapTotal value that has just been read.
    if swap_kill_is_percent:
        swap_min_sigkill_kb = swap_total * swap_min_sigkill_percent / 100.0

    if swap_term_is_percent:
        swap_min_sigterm_kb = swap_total * swap_min_sigterm_percent / 100.0

    if swap_warn_is_percent:
        swap_min_warnings_kb = swap_total * swap_min_warnings_percent / 100.0

    mem_used_zram = check_zram()

    if print_mem_check_results:

        # Calculate 'swap-column' width
        swap_len = len(str(round(swap_total / 1024.0)))

        # Output available mem sizes
        # (swap_total + 0.1 avoids a division by zero when there is no swap)
        if swap_total == 0 and mem_used_zram == 0:
            print('{}MemAvail: {} M, {} %'.format(
                avg_value,
                human(mem_available, mem_len),
                just_percent_mem(mem_available / mem_total)))

        elif swap_total > 0 and mem_used_zram == 0:
            print('{}MemAvail: {} M, {} % | SwapFree: {} M, {} %'.format(
                avg_value,
                human(mem_available, mem_len),
                just_percent_mem(mem_available / mem_total),
                human(swap_free, swap_len),
                just_percent_swap(swap_free / (swap_total + 0.1))))

        else:
            print('{}MemAvail: {} M, {} % | SwapFree: {} M, {} % | Mem'
                  'UsedZram: {} M, {} %'.format(
                      avg_value,
                      human(mem_available, mem_len),
                      just_percent_mem(mem_available / mem_total),
                      human(swap_free, swap_len),
                      just_percent_swap(swap_free / (swap_total + 0.1)),
                      human(mem_used_zram, mem_len),
                      just_percent_mem(mem_used_zram / mem_total)))

    # Percentages shown in mem_info. A '-' is used when the threshold is not
    # below SwapTotal, e.g. an absolute swap_min_sigkill with SwapTotal = 0.
    if swap_total > swap_min_sigkill_kb:
        swap_sigkill_pc = percent(swap_min_sigkill_kb / (swap_total + 0.1))
    else:
        swap_sigkill_pc = '-'

    if swap_total > swap_min_sigterm_kb:
        swap_sigterm_pc = percent(swap_min_sigterm_kb / (swap_total + 0.1))
    else:
        # TODO: print it like this: SwapTotal = 0, ignore swapspace
        swap_sigterm_pc = '-'

    # TODO: the values above only serve the mem_info text; rework all of it.

    # The decision chain starts here.

    # MEM SWAP KILL
    if mem_available <= mem_min_sigkill_kb and \
            swap_free <= swap_min_sigkill_kb:
        time0 = time()

        mem_info = (
            '{}\n\033[4mMemory status that requires corrective actions:'
            '\033[0m\n MemAvailable [{} MiB, {} %] <= mem_min_sigkill '
            '[{} MiB, {} %]\n SwapFree [{} MiB, {} %] <= swap_min_sigkill '
            '[{} MiB, {} %]'
        ).format(
            HR,
            kib_to_mib(mem_available),
            percent(mem_available / mem_total),
            kib_to_mib(mem_min_sigkill_kb),
            percent(mem_min_sigkill_kb / mem_total),
            kib_to_mib(swap_free),
            percent(swap_free / (swap_total + 0.1)),
            kib_to_mib(swap_min_sigkill_kb),
            swap_sigkill_pc)

        find_victim_and_send_signal(SIGKILL)
        # Any corrective action also restarts both PSI timers.
        kill_psi_t0 = time()
        term_psi_t0 = time()

    # ZRAM KILL
    elif mem_used_zram >= zram_max_sigkill_kb:
        time0 = time()

        mem_info = (
            '{}\n\033[4mMemory status that requires corrective actions:'
            '\033[0m\n MemUsedZram [{} MiB, {} %] >= zram_max_sigkill '
            '[{} MiB, {} %]'
        ).format(
            HR,
            kib_to_mib(mem_used_zram),
            percent(mem_used_zram / mem_total),
            kib_to_mib(zram_max_sigkill_kb),
            percent(zram_max_sigkill_kb / mem_total))

        find_victim_and_send_signal(SIGKILL)
        kill_psi_t0 = time()
        term_psi_t0 = time()

    # MEM SWAP TERM
    elif mem_available <= mem_min_sigterm_kb and \
            swap_free <= swap_min_sigterm_kb:
        time0 = time()

        mem_info = (
            '{}\n\033[4mMemory status that requires corrective actions:'
            '\033[0m\n MemAvailable [{} MiB, {} %] <= mem_min_sigterm '
            '[{} MiB, {} %]\n SwapFree [{} MiB, {} %] <= swap_min_sigterm '
            '[{} MiB, {} %]'
        ).format(
            HR,
            kib_to_mib(mem_available),
            percent(mem_available / mem_total),
            kib_to_mib(mem_min_sigterm_kb),
            # TODO: decide where rounding belongs; this was previously
            # percent(mem_min_sigterm_kb / mem_total)
            round(mem_min_sigterm_percent, 1),
            kib_to_mib(swap_free),
            percent(swap_free / (swap_total + 0.1)),
            kib_to_mib(swap_min_sigterm_kb),
            swap_sigterm_pc)

        find_victim_and_send_signal(SIGTERM)
        kill_psi_t0 = time()
        term_psi_t0 = time()

    # ZRAM TERM
    elif mem_used_zram >= zram_max_sigterm_kb:
        time0 = time()

        # Unit label is 'MiB' here as in the three sibling messages.
        mem_info = (
            '{}\n\033[4mMemory status that requires corrective actions:'
            '\033[0m\n MemUsedZram [{} MiB, {} %] >= zram_max_sigterm '
            '[{} MiB, {} %]'
        ).format(
            HR,
            kib_to_mib(mem_used_zram),
            percent(mem_used_zram / mem_total),
            kib_to_mib(zram_max_sigterm_kb),
            percent(zram_max_sigterm_kb / mem_total))

        find_victim_and_send_signal(SIGTERM)
        kill_psi_t0 = time()
        term_psi_t0 = time()

    # LOW MEMORY WARNINGS
    elif gui_low_memory_warnings:

        # Parentheses only spell out the precedence "and" already had.
        if (mem_available <= mem_min_warnings_kb and
                swap_free <= swap_min_warnings_kb + 0.1) or \
                mem_used_zram >= zram_max_warnings_kb:
            # Add the time since the previous low-memory pass to the timer.
            warn_time_delta = time() - warn_time_now
            warn_time_now = time()
            warn_timer += warn_time_delta
            if warn_timer > min_time_between_warnings:
                t0 = time()
                send_notify_warn()
                print(time() - t0, 'send notify warning time')
                warn_timer = 0
        sleep_after_check_mem()

    # SLEEP BETWEEN MEM CHECKS
    else:
        sleep_after_check_mem()