#!/usr/bin/env python3
"""A daemon that prevents OOM in Linux systems."""
import os
from time import sleep, time
from operator import itemgetter
from sys import stdout
from signal import SIGKILL, SIGTERM, SIGSTOP, SIGCONT

# Daemon start moment; the statistics report measures its period from here.
start_time = time()

# Printable names of the signals that can be sent to a victim.
sig_dict = {SIGKILL: 'SIGKILL',
            SIGTERM: 'SIGTERM'}

self_pid = str(os.getpid())

self_uid = os.geteuid()

# True when the daemon runs with effective UID 0.
root = self_uid == 0

# Timeout, in seconds, given to the notification commands.
wait_time = 3

# todo: make config option
max_sleep_time = 2

# todo: make config option
min_sleep_time = 0.1

notify_helper_path = '/usr/sbin/nohang_notify_helper'

psi_path = '/proc/pressure/memory'

psi_support = os.path.exists(psi_path)

HR = '~' * 79

# todo: make config option
print_total_stat = True

# When True, find_victim_and_send_signal() / send_notify_warn() freeze
# other processes with stop() while they work and resume them with cont().
stop_cont = False
stop_cont_warn = False


##########################################################################

# define functions


def pid_to_state(pid):
    """
    Get the one-letter process state from /proc/<pid>/stat.

    pid: str pid of required process
    returns the character that follows ') ' in the stat line
    """
    return rline1('/proc/' + pid + '/stat').rpartition(')')[2][1]


def stop():
    """
    Send SIGSTOP to every non-root process whose oom_score is above 9.

    PID 1 and the daemon itself are never touched.
    returns list of str pids that were actually stopped
    """
    t1 = time()
    t2 = time()
    stopped_list = []
    for pid in os.listdir('/proc')[::-1]:
        # only directories whose names consist only of numbers, except /proc/1/
        if not pid[0].isdecimal() or pid == '1' or pid == self_pid:
            continue
        try:
            if int(rline1('/proc/' + pid + '/oom_score')) <= 9:
                continue
            if pid_to_uid(pid) == '0':
                continue
            print('Send SIGSTOP to {}, {}, {}...'.format(
                pid, pid_to_name(pid), pid_to_cmdline(pid)[:40]))
            os.kill(int(pid), SIGSTOP)
        except (FileNotFoundError, ProcessLookupError, PermissionError):
            # the process is gone or may not be signalled: skip it
            continue
        # remember the pid only once the signal has really been delivered
        stopped_list.append(pid)
        t2 = time()
    print('Stop time:', t2 - t1)
    stdout.flush()
    return stopped_list


def cont(stopped_list):
    """
    Send SIGCONT to every process stopped by stop().

    stopped_list: list of str pids returned by stop()
    """
    print()
    print('Continue stopped processes...')
    t1 = time()
    for pid in stopped_list:
        print('Send SIGCONT to', [pid], pid_to_name(pid))
        try:
            os.kill(int(pid), SIGCONT)
        except (FileNotFoundError, ProcessLookupError, PermissionError):
            continue
    t2 = time()
    print('All cont time: ', t2 - t1)
def update_stat_dict_and_print(key):
    """
    Count one more corrective action and optionally print the totals.

    key: str description of the corrective action, used as stat_dict key
    The summary is printed only when print_total_stat is True.
    """
    stat_dict[key] = stat_dict.get(key, 0) + 1

    if print_total_stat:
        lines = ['{}\n\033[4mThe following corrective actions have been implemented in the last {}:\033[0m'.format(
            HR, format_time(time() - start_time))]
        lines.extend(
            '- {}: {}'.format(action, count)
            for action, count in stat_dict.items())
        print('\n'.join(lines))


def psi_mem_some_avg_total():
    """
    Read the 'total' field of the first line of the PSI memory file.

    returns float, or None when the kernel has no PSI support
    """
    if psi_support:
        return float(rline1(psi_path).rpartition('=')[2])
    return None


def psi_mem_some_avg10():
    """Read the 'avg10' field of the first line of the PSI memory file."""
    return float(rline1(psi_path).split(' ')[1].split('=')[1])


def check_mem(path='/proc/meminfo'):
    """
    Find mem_available.

    path: meminfo-formatted file to read
    returns int MemAvailable value in KiB
    raises ValueError if the file has no MemAvailable line
    """
    with open(path) as f:
        for line in f:
            # look the field up by name: the first line is MemTotal
            if line.startswith('MemAvailable:'):
                return int(line.split(':')[1].strip(' kB\n'))
    raise ValueError('MemAvailable not found in ' + path)


def check_mem_and_swap(path='/proc/meminfo'):
    """
    Find mem_available, swap_total, swap_free.

    path: meminfo-formatted file to read
    returns tuple of int KiB values (MemAvailable, SwapTotal, SwapFree)
    raises KeyError if one of the fields is missing
    """
    wanted = ('MemAvailable', 'SwapTotal', 'SwapFree')
    found = {}
    with open(path) as f:
        for line in f:
            name, _, value = line.partition(':')
            if name in wanted:
                found[name] = int(value.strip(' kB\n'))
                if len(found) == len(wanted):
                    # nothing else is needed from the rest of the file
                    break
    return found['MemAvailable'], found['SwapTotal'], found['SwapFree']


def check_zram():
    """
    Find MemUsedZram.

    Sums disksize and mem_used_total over all /sys/block/zram* devices
    (zram_stat() reports them in bytes) and returns the estimate
    divided by 1024.
    """
    disksize_sum = 0
    mem_used_total_sum = 0

    for dev in os.listdir('/sys/block'):
        if dev.startswith('zram'):
            stat = zram_stat(dev)
            disksize_sum += int(stat[0])
            mem_used_total_sum += int(stat[1])

    # Means that setting zram disksize = 1 GiB reduces available memory
    # by 0.0042 GiB.
    # Found experimentally, needs to be refined on different kernels and
    # architectures.
    # For small disksizes (up to a gigabyte) it may be bigger, up to 0.0045.
    # The author of the zram module states that the factor should be 0.001:
    # ("zram uses about 0.1% of the size of the disk"
    # - https://www.kernel.org/doc/Documentation/blockdev/zram.txt),
    # but this statement contradicts the experimental data.
    # ZRAM_DISKSIZE_FACTOR = deltaMemAvailable / disksize
    ZRAM_DISKSIZE_FACTOR = 0.0042

    return (mem_used_total_sum + disksize_sum * ZRAM_DISKSIZE_FACTOR) / 1024.0
def format_time(t):
    """
    Render a duration as 'S sec', 'M min S sec' or 'H h M min S sec'.

    t: number of seconds (the fractional part is dropped)
    """
    t = int(t)
    if t < 60:
        return '{} sec'.format(t)
    m, s = divmod(t, 60)
    if t < 3600:
        return '{} min {} sec'.format(m, s)
    h, m = divmod(m, 60)
    return '{} h {} min {} sec'.format(h, m, s)


def string_to_float_convert_test(string):
    """Try to interpret string values as floats; return None on failure."""
    try:
        return float(string)
    except (TypeError, ValueError):
        return None


def string_to_int_convert_test(string):
    """Try to interpret string values as integers; return None on failure."""
    try:
        return int(string)
    except (TypeError, ValueError):
        return None


def conf_parse_string(param):
    """
    Get string parameters from the config dict.

    param: config_dict key
    returns config_dict[param].strip(); exits if the key is absent
    """
    if param in config_dict:
        return config_dict[param].strip()
    print('All the necessary parameters must be in the config')
    print('There is no "{}" parameter in the config'.format(param))
    exit()


def conf_parse_bool(param):
    """
    Get bool parameters from the config_dict.

    param: config_dict key
    returns bool; exits if the key is absent or the value is neither
    'True' nor 'False'
    """
    if param not in config_dict:
        print('All the necessary parameters must be in the config')
        # report the parameter name: no value exists in this branch
        print('There is no "{}" parameter in the config'.format(param))
        exit()

    param_str = config_dict[param]
    if param_str == 'True':
        return True
    if param_str == 'False':
        return False
    print('Invalid value of the "{}" parameter.'.format(param))
    print('Valid values are True and False.')
    print('Exit')
    exit()
def rline1(path):
    """
    Read 1st line from path.

    returns the line without its trailing newline, None for an empty file
    """
    with open(path) as f:
        for line in f:
            # drop the newline only if present, never a real character
            return line[:-1] if line.endswith('\n') else line


def kib_to_mib(num):
    """Convert KiB values to MiB values."""
    return round(num / 1024.0)


def percent(num):
    """Interpret num as percentage."""
    return round(num * 100, 1)


def just_percent_mem(num):
    """Convert num to percent and right-justify it to 4 characters."""
    return str(round(num * 100, 1)).rjust(4, ' ')


def just_percent_swap(num):
    """Convert num to percent and right-justify it to 5 characters."""
    return str(round(num * 100, 1)).rjust(5, ' ')


def human(num, lenth):
    """Convert KiB values to MiB values with right alignment."""
    return str(round(num / 1024)).rjust(lenth, ' ')


def zram_stat(zram_id, sys_block='/sys/block'):
    """
    Get zram state.

    zram_id: str zram block-device id
    sys_block: directory that holds the block-device entries
    returns str disksize, str mem_used_total (both read as-is from sysfs);
    ('0', '0') when the device is absent or its disksize is 0
    """
    dev_dir = sys_block + '/' + zram_id
    try:
        disksize = rline1(dev_dir + '/disksize')
    except FileNotFoundError:
        return '0', '0'
    if disksize == '0':
        return '0', '0'
    try:
        # third whitespace-separated field of mm_stat
        mem_used_total = rline1(dev_dir + '/mm_stat').split()[2]
    except FileNotFoundError:
        mem_used_total = rline1(dev_dir + '/mem_used_total')
    return disksize, mem_used_total


def pid_to_name(pid):
    """
    Get process name by pid.

    pid: str pid of required process
    returns string process_name, '' if the process does not exist
    """
    try:
        # binary read + lenient decode: the name may hold invalid UTF-8
        with open('/proc/' + pid + '/status', 'rb') as f:
            first_line = f.readline()
    except (FileNotFoundError, ProcessLookupError):
        return ''
    # skip the 6 leading bytes, as the original f.seek(6) did
    return first_line[6:].decode('utf-8', 'ignore').rstrip('\n')
def pid_to_cmdline(pid):
    """
    Get process cmdline by pid.

    pid: str pid of required process
    returns string cmdline with NUL separators replaced by spaces
    """
    # arguments are arbitrary bytes: undecodable ones are dropped
    with open('/proc/' + pid + '/cmdline', errors='ignore') as f:
        return f.read().replace('\x00', ' ').rstrip()


def pid_to_environ(pid):
    """
    Get process environment by pid.

    pid: str pid of required process
    returns string environ, one variable per line
    """
    with open('/proc/' + pid + '/environ', errors='ignore') as f:
        return f.read().replace('\x00', '\n').rstrip()


def pid_to_uid(pid):
    """
    Return euid.

    pid: str pid of required process
    returns str effective UID (second number of the 'Uid:' line of
    /proc/<pid>/status), None if the line is absent
    """
    with open('/proc/' + pid + '/status', 'rb') as f:
        for raw_line in f:
            if raw_line.startswith(b'Uid:'):
                return raw_line.decode('utf-8', 'ignore').split('\t')[2]
    return None


def _run_notifier(argv, label, title, body):
    """Run a notification command, killing it after wait_time seconds."""
    from subprocess import Popen, TimeoutExpired
    try:
        with Popen(argv) as proc:
            try:
                proc.wait(timeout=wait_time)
            except TimeoutExpired:
                proc.kill()
                print('TimeoutExpired: {} {} {}'.format(label, title, body))
    except OSError as error:
        # e.g. the command is not installed; a notification failure
        # must not take the daemon down
        print('Cannot run {}: {}'.format(argv[0], error))


def notify_send_wait(title, body):
    """GUI notifications with UID != 0"""
    _run_notifier(
        ['notify-send', '--icon=dialog-warning', title, body],
        'notify-send', title, body)


def notify_helper(title, body):
    """GUI notification with UID = 0"""
    _run_notifier(
        [notify_helper_path, title, body],
        'nohang_notify_helper:', title, body)


def send_notify_warn():
    """
    Look for process with maximum 'badness' and warn user with notification.
    (implement Low memory warnings)
    """
    stopped_list = stop() if stop_cont_warn else None
    try:
        # find process with max badness
        pid = fattest()[0]
        name = pid_to_name(pid)

        title = 'Low memory'
        # symbol '&' can break notifications in some themes,
        # therefore it is replaced by '*'
        body = 'Hog: {} [{}]'.format(name.replace('&', '*'), pid)

        if root:
            # If nohang was started by root
            # send notification to all active users with special script
            notify_helper(title, body)
        else:
            # Or by regular user
            # send notification to user that runs this nohang
            notify_send_wait(title, body)
    finally:
        # never leave the frozen processes stopped
        if stopped_list is not None:
            cont(stopped_list)
def _deliver_notification(title, body):
    """Route a notification through the root helper or notify-send."""
    if root:
        # started by root: the helper script notifies the active users
        notify_helper(title, body)
    else:
        # send notification to user that runs this nohang
        notify_send_wait(title, body)


def send_notify(signal, name, pid):
    """
    Notify about OOM preventing.

    signal: key for notify_sig_dict
    name: str process name
    pid: str process pid
    """
    # symbol '&' can break notifications in some themes,
    # therefore it is replaced by '*'
    safe_name = name.replace('&', '*')
    _deliver_notification(
        'Hang prevention',
        '{} {} [{}]'.format(notify_sig_dict[signal], safe_name, pid))


def send_notify_etc(pid, name, command):
    """
    Notify about OOM preventing done by running a command.

    command: str command that will be executed
    name: str process name
    pid: str process pid
    """
    safe_name = name.replace('&', '*')
    safe_command = command.replace('&', '*')
    _deliver_notification(
        'Hang prevention',
        'Victim is process {} [{}]\nExecute the command:\n{}'.format(
            safe_name, pid, safe_command))
def sleep_after_send_signal(signal):
    """
    Sleeping after signal was sent.

    signal: sent signal
    """
    # compare by value: the signal may arrive as a plain int
    if signal == SIGKILL:
        delay = min_delay_after_sigkill
    else:
        delay = min_delay_after_sigterm
    if print_sleep_periods:
        print(' sleep', delay)
    sleep(delay)


def fattest(proc_root='/proc'):
    """
    Find the process with highest badness (oom_score).

    PID 1, the daemon itself and entries without a resolvable 'exe' link
    are skipped.

    proc_root: directory with the per-process entries
    returns tuple (str pid, int badness); ('', -1) when no candidate was
    found, which callers treat as "badness too small"
    -> find_mem_hog() or find_victim()
    """
    pid_badness_list = []

    for pid in os.listdir(proc_root):
        # only directories whose names consist only of numbers, except /proc/1/
        if not pid[0].isdecimal() or pid == '1' or pid == self_pid:
            continue

        proc_dir = proc_root + '/' + pid
        if not os.path.exists(proc_dir + '/exe'):
            continue

        try:
            badness = int(rline1(proc_dir + '/oom_score'))
        except (FileNotFoundError, ProcessLookupError):
            # the process exited while /proc was being scanned
            continue

        pid_badness_list.append((pid, badness))

    if not pid_badness_list:
        return '', -1

    # max() keeps the first of equal maxima, like the stable sort it replaces
    return max(pid_badness_list, key=itemgetter(1))
def _read_victim_details(pid):
    """
    Collect the values reported about a victim from /proc/<pid>/.

    returns dict with keys uid, vm_size, vm_rss, vm_swap (MiB values),
    cmdline, oom_score, oom_score_adj and, when detailed_rss is set,
    anon_rss, file_rss, shmem_rss
    raises FileNotFoundError/ProcessLookupError when the process is gone,
    KeyError/IndexError/ValueError when a field is missing or malformed
    """
    proc_dir = '/proc/' + pid

    # binary read + lenient decode: the Name field may hold invalid UTF-8
    with open(proc_dir + '/status', 'rb') as f:
        status_text = f.read().decode('utf-8', 'ignore')

    fields = {}
    for line in status_text.split('\n'):
        key, sep, value = line.partition(':')
        if sep:
            fields[key] = value.split()

    def mib(key):
        return kib_to_mib(int(fields[key][0]))

    details = {
        # second number of the Uid line: the effective UID,
        # the same one pid_to_uid() reports
        'uid': fields['Uid'][1],
        'vm_size': mib('VmSize'),
        'vm_rss': mib('VmRSS'),
        'vm_swap': mib('VmSwap'),
    }
    if detailed_rss:
        details['anon_rss'] = mib('RssAnon')
        details['file_rss'] = mib('RssFile')
        details['shmem_rss'] = mib('RssShmem')

    with open(proc_dir + '/cmdline', errors='ignore') as f:
        # an empty cmdline raises IndexError, reported as a dead victim
        details['cmdline'] = f.readlines()[0].replace('\x00', ' ')

    details['oom_score'] = rline1(proc_dir + '/oom_score')
    details['oom_score_adj'] = rline1(proc_dir + '/oom_score_adj')
    return details


def _format_victim_info(pid, name, victim_badness, details):
    """Build the colored multi-line description of the chosen victim."""
    len_vm = len(str(details['vm_size']))
    vm_rss = str(details['vm_rss']).rjust(len_vm)
    vm_swap = str(details['vm_swap']).rjust(len_vm)

    if detailed_rss:
        try:
            environ = pid_to_environ(pid)
        except (OSError, UnicodeDecodeError):
            # environ is only informational; do not fail because of it
            environ = ''
        return (
            '\033[4mFound a process with highest badness:\033[0m'
            '\n Name: \033[33m{}\033[0m'
            '\n PID: \033[33m{}\033[0m'
            '\n UID: \033[33m{}\033[0m'
            '\n badness: \033[33m{}\033[0m, '
            'oom_score: \033[33m{}\033[0m, '
            'oom_score_adj: \033[33m{}\033[0m'
            '\n VmSize: \033[33m{}\033[0m MiB'
            '\n VmRSS: \033[33m{}\033[0m MiB ('
            'Anon: \033[33m{}\033[0m MiB, '
            'File: \033[33m{}\033[0m MiB, '
            'Shmem: \033[33m{}\033[0m MiB)'
            '\n VmSwap: \033[33m{}\033[0m MiB'
            '\n environ:\n\033[33m{}\033[0m'
            '\n cmdline: \033[33m{}\033[0m'
        ).format(
            name, pid, details['uid'], victim_badness,
            details['oom_score'], details['oom_score_adj'],
            details['vm_size'], vm_rss,
            details['anon_rss'], details['file_rss'], details['shmem_rss'],
            vm_swap, environ, details['cmdline'])

    return (
        '\033[4mFound a process with highest badness:\033[0m'
        '\n Name: \033[33m{}\033[0m'
        '\n PID: \033[33m{}\033[0m'
        '\n UID: \033[33m{}\033[0m'
        '\n Badness: \033[33m{}\033[0m, '
        'oom_score: \033[33m{}\033[0m, '
        'oom_score_adj: \033[33m{}\033[0m'
        '\n VmSize: \033[33m{}\033[0m MiB'
        '\n VmRSS: \033[33m{}\033[0m MiB'
        '\n VmSwap: \033[33m{}\033[0m MiB'
        '\n CmdLine: \033[33m{}\033[0m'
    ).format(
        name, pid, details['uid'], victim_badness,
        details['oom_score'], details['oom_score_adj'],
        details['vm_size'], vm_rss, vm_swap, details['cmdline'])


def _handle_victim(signal):
    """
    Pick the victim and apply the corrective action.

    returns False when the victim vanished while it was being inspected
    (the caller then skips the post-signal sleep), True otherwise
    """
    pid, victim_badness = fattest()
    name = pid_to_name(pid)

    if victim_badness < min_badness:
        response_time = time() - time0
        print(mem_info)
        print('victim badness {} < min_badness {}; nothing to do; '
              'response time: {} ms'.format(
                  victim_badness, min_badness, round(response_time * 1000)))
        update_stat_dict_and_print('victim badness < min_badness')
        return True

    # Get VmRSS and VmSwap and cmdline of victim process
    try:
        details = _read_victim_details(pid)
    except (FileNotFoundError, ProcessLookupError,
            IndexError, ValueError, KeyError) as error:
        reason = 'The victim died in the search process: ' + \
            type(error).__name__
        print(mem_info)
        print(reason)
        update_stat_dict_and_print(reason)
        return False

    victim_info = _format_victim_info(pid, name, victim_badness, details)

    if execute_the_command and signal == SIGTERM and name in etc_dict:
        command = etc_dict[name]
        # Substitute the name that was matched against etc_dict instead of
        # re-reading it from /proc: a process can rename itself, and the
        # text ends up in a shell command.
        real_command = command.replace('$PID', pid).replace('$NAME', name)
        exit_status = os.system(real_command)
        if exit_status == 0:
            exit_status = '\033[32m0\033[0m'
        else:
            exit_status = '\033[31m{}\033[0m'.format(exit_status)
        response_time = time() - time0
        etc_info = (
            '{}'
            '\n\033[4mImplement corrective action:\033[0m\n Run the command: \033[4m{}\033[0m'
            '\n Exit status: {}; response time: {} ms'
        ).format(victim_info, real_command, exit_status,
                 round(response_time * 1000))
        print(mem_info)
        print(etc_info)
        update_stat_dict_and_print(
            "Run the command '\033[35m{}\033[0m'".format(command))
        if gui_notifications:
            send_notify_etc(pid, name, real_command)
        return True

    try:
        m = check_mem_and_swap()
        ma = round(int(m[0]) / 1024.0)
        sf = round(int(m[2]) / 1024.0)
        print('\nMemory status before sending a signal:\nMemAv: {} MiB, '
              'SwFree: {} MiB'.format(ma, sf))
        if stop_cont:
            # a stopped process cannot act on SIGTERM: wake it up first
            os.kill(int(pid), SIGCONT)
        os.kill(int(pid), signal)
        response_time = time() - time0
        send_result = '\033[32mOK\033[0m; response time: {} ms'.format(
            round(response_time * 1000))
        key = 'Send \033[35m{}\033[0m to \033[35m{}\033[0m'.format(
            sig_dict[signal], name)
        if gui_notifications:
            send_notify(signal, name, pid)
    except (FileNotFoundError, ProcessLookupError) as error:
        response_time = time() - time0
        send_result = 'no such process; response time: {} ms'.format(
            round(response_time * 1000))
        key = 'The victim died in the search process: ' + \
            type(error).__name__

    # built after the try so that it exists on the failure path as well
    preventing_oom_message = (
        '{}'
        '\n\033[4mImplement a corrective action:\033[0m\n '
        'Sending \033[4m{}\033[0m to the victim; {}'
    ).format(victim_info, sig_dict[signal], send_result)

    print(mem_info)
    try:
        print(pid_to_state(pid))
    except (FileNotFoundError, ProcessLookupError, IndexError):
        # the victim is already gone: there is no state to show
        pass
    print(preventing_oom_message)
    update_stat_dict_and_print(key)
    return True


def find_victim_and_send_signal(signal):
    """
    Find victim with highest badness and send SIGTERM/SIGKILL.

    signal: signal to send to the victim
    Reads the module globals time0 and mem_info, which are not defined in
    the visible part of the file (presumably set by the caller).
    -> implement_corrective_action()
    """
    stopped_list = stop() if stop_cont else None
    try:
        victim_handled = _handle_victim(signal)
    finally:
        # resume the frozen processes on every path, early exits included
        if stopped_list is not None:
            cont(stopped_list)

    if victim_handled:
        sleep_after_send_signal(signal)
def sleep_after_check_mem():
    """Specify sleep times depends on rates and available memory.

    The sleep time is the smaller of the estimated times to reach the
    memory+swap and memory+zram thresholds, clamped to
    [min_sleep_time, max_sleep_time].
    """
    # distance to the nearest (i.e. larger) threshold
    mem_point = mem_available - max(mem_min_sigkill_kb, mem_min_sigterm_kb)
    swap_point = swap_free - max(swap_min_sigkill_kb, swap_min_sigterm_kb)

    t_mem = mem_point / rate_mem
    t_swap = swap_point / rate_swap
    t_zram = (mem_total * 0.9 - mem_used_zram) / rate_zram

    t = min(t_mem + t_swap, t_mem + t_zram)

    if t > max_sleep_time:
        t = max_sleep_time
    elif t < min_sleep_time:
        t = min_sleep_time

    try:
        if print_sleep_periods:
            print('sleep', round(t, 2))
        stdout.flush()
        sleep(t)
    except KeyboardInterrupt:
        exit()


def calculate_percent(arg_key):
    """
    Parse a memory threshold from the config dict.

    The value must end with '%' (share of MemTotal) or 'M' (MiB).

    arg_key: str key for config_dict
    returns tuple (value in KiB, value in MiB, value in percent);
    exits when the key is absent or the value is invalid
    """
    if arg_key not in config_dict:
        print('{} not in config\nExit'.format(arg_key))
        exit()

    mem_min = config_dict[arg_key]

    if mem_min.endswith('%'):
        # truncate percents, so we have a number
        mem_min_percent = string_to_float_convert_test(mem_min[:-1].strip())
        if mem_min_percent is None:
            print('Invalid {} value, not float\nExit'.format(arg_key))
            exit()
        if mem_min_percent < 0 or mem_min_percent > 100:
            print(
                '{}, as percents value, out of range [0; 100]\nExit'.format(arg_key))
            exit()
        mem_min_kb = mem_min_percent / 100 * mem_total
        mem_min_mb = round(mem_min_kb / 1024)

    elif mem_min.endswith('M'):
        mem_min_mb = string_to_float_convert_test(mem_min[:-1].strip())
        if mem_min_mb is None:
            print('Invalid {} value, not float\nExit'.format(arg_key))
            exit()
        mem_min_kb = mem_min_mb * 1024
        if mem_min_kb > mem_total:
            print(
                '{} value can not be greater then MemTotal ({} MiB)\nExit'.format(
                    arg_key, round(mem_total / 1024)))
            exit()
        mem_min_percent = mem_min_kb / mem_total * 100

    else:
        # really exit here: nothing sensible can be returned
        print('Invalid {} units in config.\n Exit'.format(arg_key))
        exit()

    return mem_min_kb, mem_min_mb, mem_min_percent
##########################################################################

# find mem_total
# find positions of SwapFree and SwapTotal in /proc/meminfo

with open('/proc/meminfo') as f:
    mem_list = f.readlines()

mem_list_names = [s.split(':')[0] for s in mem_list]

if mem_list_names[2] != 'MemAvailable':
    print('Your Linux kernel is too old, Linux 3.14+ requied\nExit')
    exit()

swap_total_index = mem_list_names.index('SwapTotal')
swap_free_index = swap_total_index + 1

mem_total = int(mem_list[0].split(':')[1].strip(' kB\n'))


# Get names from /proc/*/status to be able to get VmRSS and VmSwap values

with open('/proc/self/status') as file:
    status_list = file.readlines()

status_names = [s.split(':')[0] for s in status_list]

ppid_index = status_names.index('PPid')
vm_size_index = status_names.index('VmSize')
vm_rss_index = status_names.index('VmRSS')
vm_swap_index = status_names.index('VmSwap')
uid_index = status_names.index('Uid')

# RssAnon/RssFile/RssShmem are absent from the status file of older
# kernels; detailed_rss tells whether they can be reported.
try:
    anon_index = status_names.index('RssAnon')
    file_index = status_names.index('RssFile')
    shmem_index = status_names.index('RssShmem')
    detailed_rss = True
except ValueError:
    detailed_rss = False


##########################################################################

config = '/etc/nohang/nohang.conf'

# config = 'nohang.conf'

print('Config:', config)

# printed after the exception name when the config cannot be read
conf_err_mess = 'Invalid config. Exit.'


##########################################################################

# parsing the config with obtaining the parameters dictionary

# dictionary with config options
config_dict = dict()

processname_re_list = []
cmdline_re_list = []
uid_re_list = []

# dictionary with names and commands for the parameter
# execute_the_command
# (a list would fit here better than a dict, too)
etc_dict = dict()

# will store corrective actions stat
stat_dict = dict()


def _parse_re_line(line, prefix):
    """Split '<prefix> left /// right' into the tuple (left, right)."""
    left, _, right = line.partition(prefix)[2].strip(' \n').partition('///')
    return left.strip(' '), right.strip(' ')


try:
    with open(config) as f:
        for line in f:
            if line.startswith('$ETC'):
                a = line[4:].split('///')
                etc_name = a[0].strip()
                etc_command = a[1].strip()
                if len(etc_name) > 15:
                    print('Invalid config, the length of the process '
                          'name must not exceed 15 characters\nExit')
                    exit()
                etc_dict[etc_name] = etc_command

            # NEED VALIDATION!
            elif line.startswith('@PROCESSNAME_RE'):
                processname_re_list.append(
                    _parse_re_line(line, '@PROCESSNAME_RE'))
            elif line.startswith('@CMDLINE_RE'):
                cmdline_re_list.append(_parse_re_line(line, '@CMDLINE_RE'))
            elif line.startswith('@UID_RE'):
                uid_re_list.append(_parse_re_line(line, '@UID_RE'))

            # everything else that is not a comment, a blank line or an
            # indented line is a 'key = value' option
            elif not line.startswith(('#', '\n', '\t', ' ')):
                a = line.partition('=')
                config_dict[a[0].strip()] = a[2].strip()

except (PermissionError, UnicodeDecodeError, IsADirectoryError,
        IndexError, FileNotFoundError) as error:
    print(type(error).__name__, conf_err_mess)
    exit()


##########################################################################

# extracting parameters from the dictionary
# check for all necessary parameters
# validation of all parameters


def _conf_value(name):
    """Return the raw config string for name, or exit if it is absent."""
    if name not in config_dict:
        print('{} not in config\nExit'.format(name))
        exit()
    return config_dict[name]


def _conf_number(name, convert, type_name, is_invalid, invalid_message):
    """
    Return the validated numeric value of a mandatory parameter.

    convert: function returning the number or None (string_to_*_convert_test)
    type_name: 'float' or 'integer', used in the error message
    is_invalid: predicate rejecting out-of-range values
    invalid_message: message printed when is_invalid(value) is true
    Exits when the parameter is absent, not a number or out of range.
    """
    value = convert(_conf_value(name))
    if value is None:
        print('Invalid {} value, not {}\nExit'.format(name, type_name))
        exit()
    if is_invalid(value):
        print(invalid_message)
        exit()
    return value


def _not_positive(value):
    return value <= 0


def _negative(value):
    return value < 0


def _outside(low, high):
    """Build a predicate that is true for values outside [low; high]."""
    return lambda value: value < low or value > high


print_config = conf_parse_bool('print_config')
print_mem_check_results = conf_parse_bool('print_mem_check_results')
print_sleep_periods = conf_parse_bool('print_sleep_periods')
gui_low_memory_warnings = conf_parse_bool('gui_low_memory_warnings')
gui_notifications = conf_parse_bool('gui_notifications')
decrease_oom_score_adj = conf_parse_bool('decrease_oom_score_adj')
execute_the_command = conf_parse_bool('execute_the_command')
ignore_psi = conf_parse_bool('ignore_psi')

regex_matching = conf_parse_bool('regex_matching')
re_match_cmdline = conf_parse_bool('re_match_cmdline')
re_match_uid = conf_parse_bool('re_match_uid')

if regex_matching or re_match_cmdline or re_match_uid:
    from re import search
    import sre_constants

mem_min_sigterm_kb, mem_min_sigterm_mb, mem_min_sigterm_percent = calculate_percent(
    'mem_min_sigterm')
mem_min_sigkill_kb, mem_min_sigkill_mb, mem_min_sigkill_percent = calculate_percent(
    'mem_min_sigkill')

zram_max_sigterm_kb, zram_max_sigterm_mb, zram_max_sigterm_percent = calculate_percent(
    'zram_max_sigterm')
zram_max_sigkill_kb, zram_max_sigkill_mb, zram_max_sigkill_percent = calculate_percent(
    'zram_max_sigkill')

mem_min_warnings_kb, mem_min_warnings_mb, mem_min_warnings_percent = calculate_percent(
    'mem_min_warnings')
zram_max_warnings_kb, zram_max_warnings_mb, zram_max_warnings_percent = calculate_percent(
    'zram_max_warnings')

rate_mem = _conf_number(
    'rate_mem', string_to_float_convert_test, 'float',
    _not_positive, 'rate_mem MUST be > 0\nExit')
rate_swap = _conf_number(
    'rate_swap', string_to_float_convert_test, 'float',
    _not_positive, 'rate_swap MUST be > 0\nExit')
rate_zram = _conf_number(
    'rate_zram', string_to_float_convert_test, 'float',
    _not_positive, 'rate_zram MUST be > 0\nExit')

# units of the swap thresholds are validated by get_swap_threshold_tuple()
swap_min_sigterm = _conf_value('swap_min_sigterm')
swap_min_sigkill = _conf_value('swap_min_sigkill')

min_delay_after_sigterm = _conf_number(
    'min_delay_after_sigterm', string_to_float_convert_test, 'float',
    _negative, 'min_delay_after_sigterm must be positiv\nExit')
min_delay_after_sigkill = _conf_number(
    'min_delay_after_sigkill', string_to_float_convert_test, 'float',
    _negative, 'min_delay_after_sigkill must be positive\nExit')

psi_avg10_sleep_time = _conf_number(
    'psi_avg10_sleep_time', string_to_float_convert_test, 'float',
    _negative, 'psi_avg10_sleep_time must be positive\nExit')
sigkill_psi_avg10 = _conf_number(
    'sigkill_psi_avg10', string_to_float_convert_test, 'float',
    _outside(0, 100),
    'sigkill_psi_avg10 must be in the range [0; 100]\nExit')
sigterm_psi_avg10 = _conf_number(
    'sigterm_psi_avg10', string_to_float_convert_test, 'float',
    _outside(0, 100),
    'sigterm_psi_avg10 must be in the range [0; 100]\nExit')

min_badness = _conf_number(
    'min_badness', string_to_int_convert_test, 'integer',
    _outside(0, 1000), 'Invalud min_badness value\nExit')
oom_score_adj_max = _conf_number(
    'oom_score_adj_max', string_to_int_convert_test, 'integer',
    _outside(0, 1000), 'Invalid oom_score_adj_max value\nExit')

min_time_between_warnings = _conf_number(
    'min_time_between_warnings', string_to_float_convert_test, 'float',
    _outside(1, 300),
    'min_time_between_warnings value out of range [1; 300]\nExit')

swap_min_warnings = _conf_value('swap_min_warnings')


##########################################################################

# Get KiB levels if it's possible.
# A threshold is either an absolute value or a percentage; percentages are
# turned into KiB later, once SwapTotal is known.


def get_swap_threshold_tuple(string):
    """
    Parse a swap threshold given as 'N %' or 'N M'.

    string: raw config value
    returns (percent value, True) for '%' units or
    (value in KiB, False) for 'M' (MiB) units;
    exits on invalid numbers, ranges or units
    """
    if string.endswith('%'):
        value = string_to_float_convert_test(string[:-1])
        if value is None:
            print('somewhere swap unit is not float_%')
            exit()
        if value < 0 or value > 100:
            print('invalid value, must be from the range[0; 100] %')
            exit()
        return value, True

    if string.endswith('M'):
        value = string_to_float_convert_test(string[:-1])
        if value is None:
            print('somewhere swap unit is not float_M')
            exit()
        value *= 1024
        if value < 0:
            print('invalid unit in config (negative value)')
            exit()
        return value, False

    print('Invalid config file. There are invalid units somewhere\nExit')
    exit()
There are invalid units somewhere\nExit') exit() swap_min_sigterm_tuple = get_swap_threshold_tuple(swap_min_sigterm) swap_min_sigkill_tuple = get_swap_threshold_tuple(swap_min_sigkill) swap_min_warnings_tuple = get_swap_threshold_tuple(swap_min_warnings) swap_term_is_percent = swap_min_sigterm_tuple[1] if swap_term_is_percent: swap_min_sigterm_percent = swap_min_sigterm_tuple[0] else: swap_min_sigterm_kb = swap_min_sigterm_tuple[0] swap_kill_is_percent = swap_min_sigkill_tuple[1] if swap_kill_is_percent: swap_min_sigkill_percent = swap_min_sigkill_tuple[0] else: swap_min_sigkill_kb = swap_min_sigkill_tuple[0] swap_warn_is_percent = swap_min_warnings_tuple[1] if swap_warn_is_percent: swap_min_warnings_percent = swap_min_warnings_tuple[0] else: swap_min_warnings_kb = swap_min_warnings_tuple[0] ########################################################################## if print_config: print( '\n1. Memory levels to respond to as an OOM threat\n[displaying these options need fix]\n') print('mem_min_sigterm: {} MiB, {} %'.format( round(mem_min_sigterm_mb), round(mem_min_sigterm_percent, 1))) print('mem_min_sigkill: {} MiB, {} %'.format( round(mem_min_sigkill_mb), round(mem_min_sigkill_percent, 1))) print('swap_min_sigterm: {}'.format(swap_min_sigterm)) print('swap_min_sigkill: {}'.format(swap_min_sigkill)) print('zram_max_sigterm: {} MiB, {} %'.format( round(zram_max_sigterm_mb), round(zram_max_sigterm_percent, 1))) print('zram_max_sigkill: {} MiB, {} %'.format( round(zram_max_sigkill_mb), round(zram_max_sigkill_percent, 1))) print('\n2. The frequency of checking the level of available memory (and CPU usage)\n') print('rate_mem: {}'.format(rate_mem)) print('rate_swap: {}'.format(rate_swap)) print('rate_zram: {}'.format(rate_zram)) print('\n3. 
The prevention of killing innocent victims\n') print('min_delay_after_sigterm: {}'.format(min_delay_after_sigterm)) print('min_delay_after_sigkill: {}'.format(min_delay_after_sigkill)) print('min_badness: {}'.format(min_badness)) # False (OK) - OK не нужен когда фолс print('decrease_oom_score_adj: {}'.format( decrease_oom_score_adj )) if decrease_oom_score_adj: print('oom_score_adj_max: {}'.format(oom_score_adj_max)) print('\n4. Impact on the badness of processes via matching their' ' names, cmdlines ir UIDs with regular expressions\n') print('(todo)') print('\n5. The execution of a specific command instead of sending the\nSIGTERM signal\n') print('execute_the_command: {}'.format(execute_the_command)) if execute_the_command: print('\nPROCESS NAME COMMAND TO EXECUTE') for key in etc_dict: print('{} {}'.format(key.ljust(15), etc_dict[key])) print('\n6. GUI notifications:\n- OOM prevention results and\n- low memory warnings\n') print('gui_notifications: {}'.format(gui_notifications)) print('gui_low_memory_warnings: {}'.format(gui_low_memory_warnings)) if gui_low_memory_warnings: print('min_time_between_warnings: {}'.format(min_time_between_warnings)) print('mem_min_warnings: {} MiB, {} %'.format( round(mem_min_warnings_mb), round(mem_min_warnings_percent, 1))) print('swap_min_warnings: {}'.format(swap_min_warnings)) print('zram_max_warnings: {} MiB, {} %'.format( round(zram_max_warnings_mb), round(zram_max_warnings_percent, 1))) print('\n7. 
Output verbosity\n') print('print_config: {}'.format(print_config)) print('print_mem_check_results: {}'.format(print_mem_check_results)) print('print_sleep_periods: {}\n'.format(print_sleep_periods)) ########################################################################## # for calculating the column width when printing mem and zram mem_len = len(str(round(mem_total / 1024.0))) if gui_notifications or gui_low_memory_warnings: from subprocess import Popen, TimeoutExpired notify_sig_dict = {SIGKILL: 'Killing', SIGTERM: 'Terminating'} rate_mem = rate_mem * 1048576 rate_swap = rate_swap * 1048576 rate_zram = rate_zram * 1048576 warn_time_now = 0 warn_time_delta = 1000 warn_timer = 0 print('Monitoring started!') stdout.flush() sigterm_psi = sigterm_psi_avg10 sigkill_psi = sigkill_psi_avg10 psi_min_sleep_time_after_action = psi_avg10_sleep_time ########################################################################## if psi_support and not ignore_psi: kill_psi_t0 = time() + psi_avg10_sleep_time term_psi_t0 = time() + psi_avg10_sleep_time avg_value = '' while True: if psi_support and not ignore_psi: avg10 = psi_mem_some_avg10() if print_mem_check_results: avg_value = 'PSI mem some avg10: {} | '.format(str(avg10).rjust(6)) if avg10 >= sigkill_psi and time() - kill_psi_t0 >= psi_min_sleep_time_after_action: time0 = time() mem_info = 'avg ({}) > sigkill_psi ({})'.format(avg10, sigkill_psi) find_victim_and_send_signal(SIGKILL) kill_psi_t0 = time() elif avg10 >= sigterm_psi and time() - term_psi_t0 >= psi_min_sleep_time_after_action: time0 = time() mem_info = 'avg ({}) > sigterm_psi ({})'.format(avg10, sigterm_psi) find_victim_and_send_signal(SIGTERM) term_psi_t0 = time() else: pass mem_available, swap_total, swap_free = check_mem_and_swap() # если метры - получаем киб выше и сразу. см. 
# if swap_min_sigkill is set in percent if swap_kill_is_percent: swap_min_sigkill_kb = swap_total * swap_min_sigkill_percent / 100.0 if swap_term_is_percent: swap_min_sigterm_kb = swap_total * swap_min_sigterm_percent / 100.0 if swap_warn_is_percent: swap_min_warnings_kb = swap_total * swap_min_warnings_percent / 100.0 # в общем случае для работы нужны килобайты. Если в процентах задано - # находим КБ тут, после получения своптотал. mem_used_zram = check_zram() if print_mem_check_results: # Calculate 'swap-column' width swap_len = len(str(round(swap_total / 1024.0))) # Output avialable mem sizes if swap_total == 0 and mem_used_zram == 0: print('{}MemAvail: {} M, {} %'.format( avg_value, human(mem_available, mem_len), just_percent_mem(mem_available / mem_total))) elif swap_total > 0 and mem_used_zram == 0: print('{}MemAvail: {} M, {} % | SwapFree: {} M, {} %'.format( avg_value, human(mem_available, mem_len), just_percent_mem(mem_available / mem_total), human(swap_free, swap_len), just_percent_swap(swap_free / (swap_total + 0.1)))) else: print('{}MemAvail: {} M, {} % | SwapFree: {} M, {} % | Mem' 'UsedZram: {} M, {} %'.format( avg_value, human(mem_available, mem_len), just_percent_mem(mem_available / mem_total), human(swap_free, swap_len), just_percent_swap(swap_free / (swap_total + 0.1)), human(mem_used_zram, mem_len), just_percent_mem(mem_used_zram / mem_total))) # если swap_min_sigkill задан в абсолютной величине и Swap_total = 0 if swap_total > swap_min_sigkill_kb: # If swap_min_sigkill is absolute swap_sigkill_pc = percent(swap_min_sigkill_kb / (swap_total + 0.1)) else: swap_sigkill_pc = '-' if swap_total > swap_min_sigterm_kb: swap_sigterm_pc = percent(swap_min_sigterm_kb / (swap_total + 0.1)) else: # печатать так: SwapTotal = 0, ignore swapspace swap_sigterm_pc = '-' # это для печати меминфо. Все переработать нахрен. 
# далее пошло ветвление # MEM SWAP KILL if mem_available <= mem_min_sigkill_kb and \ swap_free <= swap_min_sigkill_kb: time0 = time() mem_info = '{}\n\033[4mMemory status that r' \ 'equires corrective actions:' \ '\033[0m\n MemAvailable [{} MiB, {} %] <= mem_min_sig' \ 'kill [{} MiB, {} %]\n SwapFree [{} MiB, {} %] <= swa' \ 'p_min_sigkill [{} MiB, {} %]'.format( HR, kib_to_mib(mem_available), percent(mem_available / mem_total), kib_to_mib(mem_min_sigkill_kb), percent(mem_min_sigkill_kb / mem_total), kib_to_mib(swap_free), percent(swap_free / (swap_total + 0.1)), kib_to_mib(swap_min_sigkill_kb), swap_sigkill_pc) find_victim_and_send_signal(SIGKILL) kill_psi_t0 = time() term_psi_t0 = time() # ZRAM KILL elif mem_used_zram >= zram_max_sigkill_kb: time0 = time() mem_info = '{}\n\033[4mMemory statu' \ 's that requires corrective actions:' \ '\033[0m\n MemUsedZram [{} MiB, {} %] >= zram_max_sig' \ 'kill [{} MiB, {} %]'.format( HR, kib_to_mib(mem_used_zram), percent(mem_used_zram / mem_total), kib_to_mib(zram_max_sigkill_kb), percent(zram_max_sigkill_kb / mem_total)) find_victim_and_send_signal(SIGKILL) kill_psi_t0 = time() term_psi_t0 = time() # MEM SWAP TERM elif mem_available <= mem_min_sigterm_kb and \ swap_free <= swap_min_sigterm_kb: time0 = time() mem_info = '{}\n\033[4mMemory status tha' \ 't requires corrective actions:' \ '\033[0m\n MemAvailable [{} MiB, {} %] <= mem_min_sig' \ 'term [{} MiB, {} %]\n SwapFree [{} MiB, {} %] <= swa' \ 'p_min_sigterm [{} MiB, {} %]'.format( HR, kib_to_mib(mem_available), percent(mem_available / mem_total), kib_to_mib(mem_min_sigterm_kb), # percent(mem_min_sigterm_kb / mem_total), # ОКРУГЛЯТЬ НА МЕСТЕ ВЫШЕ (или не выше, хз) round(mem_min_sigterm_percent, 1), kib_to_mib(swap_free), percent(swap_free / (swap_total + 0.1)), kib_to_mib(swap_min_sigterm_kb), swap_sigterm_pc) find_victim_and_send_signal(SIGTERM) kill_psi_t0 = time() term_psi_t0 = time() # ZRAM TERM elif mem_used_zram >= zram_max_sigterm_kb: time0 = time() mem_info = 
'{}\n\033[4mMemory status that r' \ 'equires corrective actions:' \ '\033[0m\n MemUsedZram [{} MiB, {} %] >= ' \ 'zram_max_sigterm [{} M, {} %]'.format( HR, kib_to_mib(mem_used_zram), percent(mem_used_zram / mem_total), kib_to_mib(zram_max_sigterm_kb), percent(zram_max_sigterm_kb / mem_total)) find_victim_and_send_signal(SIGTERM) kill_psi_t0 = time() term_psi_t0 = time() # LOW MEMORY WARNINGS elif gui_low_memory_warnings: if mem_available <= mem_min_warnings_kb and \ swap_free <= swap_min_warnings_kb + 0.1 or \ mem_used_zram >= zram_max_warnings_kb: warn_time_delta = time() - warn_time_now warn_time_now = time() warn_timer += warn_time_delta if warn_timer > min_time_between_warnings: t0 = time() send_notify_warn() print(time() - t0, 'send notify warning time') warn_timer = 0 sleep_after_check_mem() # SLEEP BETWEEN MEM CHECKS else: sleep_after_check_mem()