#!/usr/bin/env python3 """A daemon that prevents OOM in Linux systems.""" import os from ctypes import CDLL from time import sleep, time from operator import itemgetter from sys import stdout, stderr, argv, exit from re import search from sre_constants import error as invalid_re from signal import signal, SIGKILL, SIGTERM, SIGINT, SIGQUIT, SIGHUP ########################################################################## # define functions def check_config(): """ """ log('#' * 79) log('0. Common zram settings') log(' zram_checking_enabled: {}'.format(zram_checking_enabled)) log('1. Thresholds below which a signal should be sent to the victim') log(' soft_threshold_min_mem: {} MiB, {} %'.format(round(soft_threshold_min_mem_mb), round(soft_threshold_min_mem_percent, 1))) log(' hard_threshold_min_mem: {} MiB, {} %'.format(round(hard_threshold_min_mem_mb), round(hard_threshold_min_mem_percent, 1))) log(' soft_threshold_min_swap: {}'.format(soft_threshold_min_swap)) log(' hard_threshold_min_swap: {}'.format(hard_threshold_min_swap)) log(' soft_threshold_max_zram: {} MiB, {} %'.format(round(soft_threshold_max_zram_mb), round(soft_threshold_max_zram_percent, 1))) log(' hard_threshold_max_zram: {} MiB, {} %'.format(round(hard_threshold_max_zram_mb), round(hard_threshold_max_zram_percent, 1))) log('2. Response on PSI memory metrics') log(' psi_checking_enabled: {}'.format(psi_checking_enabled)) log(' psi_path: {}'.format(psi_path)) log(' psi_metrics: {}'.format(psi_metrics)) log(' soft_threshold_max_psi: {}'.format(soft_threshold_max_psi)) log(' hard_threshold_max_psi: {}'.format(hard_threshold_max_psi)) log(' psi_excess_duration: {} sec'.format(psi_excess_duration)) log(' psi_post_action_delay: {} sec'.format(psi_post_action_delay)) log('3. The frequency of checking the amount of available memory') log(' fill_rate_mem: {}'.format(fill_rate_mem)) log(' fill_rate_swap: {}'.format(fill_rate_swap)) log(' fill_rate_zram: {}'.format(fill_rate_zram)) log(' max_sleep: {} sec'.format(max_sleep)) log(' min_sleep: {} sec'.format(min_sleep)) log(' over_sleep: {} sec'.format(over_sleep)) log('4. The prevention of killing innocent victims') log(' min_badness: {}'.format(min_badness)) log(' post_soft_action_delay: {} sec'.format(post_soft_action_delay)) log(' post_zombie_delay: {} sec'.format(post_zombie_delay)) log(' victim_cache_time: {} sec'.format(victim_cache_time)) log(' ignore_positive_oom_score_adj: {}'.format(ignore_positive_oom_score_adj)) log('5. Impact on the badness of processes') log('5.1. Matching process names with RE patterns') if len(badness_adj_re_name_list) > 0: log(' regexp: badness_adj:') for i in badness_adj_re_name_list: log(' {} {}'.format(i[1], i[0])) else: log(' (not set)') log('5.2. Matching CGroup_v1-line with RE patterns') if len(badness_adj_re_cgroup_v1_list) > 0: log(' regexp: badness_adj:') for i in badness_adj_re_cgroup_v1_list: log(' {} {}'.format(i[1], i[0])) else: log(' (not set)') log('5.3. Matching CGroup_v2-line with RE patterns') if len(badness_adj_re_cgroup_v2_list) > 0: log(' regexp: badness_adj:') for i in badness_adj_re_cgroup_v1_list: log(' {} {}'.format(i[1], i[0])) else: log(' (not set)') log('5.4. Matching eUIDs with RE patterns') if len(badness_adj_re_cgroup_v2_list) > 0: log(' regexp: badness_adj:') for i in badness_adj_re_uid_list: log(' {} {}'.format(i[1], i[0])) else: log(' (not set)') log('5.5. Matching realpath with RE patterns') if len(badness_adj_re_cgroup_v2_list) > 0: log(' regexp: badness_adj:') for i in badness_adj_re_realpath_list: log(' {} {}'.format(i[1], i[0])) else: log(' (not set)') log('5.6. Matching cmdlines with RE patterns') if len(badness_adj_re_cgroup_v2_list) > 0: log(' regexp: badness_adj:') for i in badness_adj_re_cmdline_list: log(' {} {}'.format(i[1], i[0])) else: log(' (not set)') log('5.7. Matching environ with RE patterns') if len(badness_adj_re_cgroup_v2_list) > 0: log(' regexp: badness_adj:') for i in badness_adj_re_environ_list: log(' {} {}'.format(i[1], i[0])) else: log(' (not set)') log('6. Customize corrective actions') if len(soft_actions_list) > 0: log(' Match by: regexp: command: ') for i in soft_actions_list: log(' {} {} {}'.format(i[0], i[1], i[2])) else: log(' (not set)') log('7. GUI notifications') log(' post_action_gui_notifications: {}'.format(post_action_gui_notifications)) log(' low_memory_warnings_enabled: {}'.format(low_memory_warnings_enabled)) log(' warning_exe: {}'.format(warning_exe)) log(' warning_threshold_min_mem: {} MiB, {} %'.format(round(warning_threshold_min_mem_mb), round(warning_threshold_min_mem_percent, 1))) log(' warning_threshold_min_swap: {}'.format(warning_threshold_min_swap)) log(' warning_threshold_max_zram: {} MiB, {} %'.format(round(warning_threshold_max_zram_mb), round(warning_threshold_max_zram_percent, 1))) log(' warning_threshold_max_psi: {}'.format(warning_threshold_max_psi)) log(' min_post_warning_delay: {} sec'.format(min_post_warning_delay)) log('8. Verbosity') log(' print_config_at_startup: {}'.format(print_config_at_startup)) log(' print_mem_check_results: {}'.format(print_mem_check_results)) log(' min_mem_report_interval: {} sec'.format(min_mem_report_interval)) log(' debug_sleep: {}'.format(debug_sleep)) log(' print_statistics: {}'.format(print_statistics)) log(' print_proc_table: {}'.format(print_proc_table)) log(' extra_table_info: {}'.format(extra_table_info)) log(' print_victim_status: {}'.format(print_victim_status)) log(' print_victim_cmdline: {}'.format(print_victim_cmdline)) log(' max_victim_ancestry_depth: {}'.format(max_victim_ancestry_depth)) log(' debug_gui_notifications: {}'.format(debug_gui_notifications)) log(' separate_log: {}'.format(separate_log)) log(' debug_psi: {}'.format(debug_psi)) log('9. Misc') log(' max_soft_exit_time: {} sec'.format(max_soft_exit_time)) log(' post_kill_exe: {}'.format(post_kill_exe)) log(' forbid_negative_badness: {}'.format( forbid_negative_badness)) # log(': {}'.format()) log('#' * 79) if check_config_flag: log('config is OK') exit() def encoder(string): """ """ encoded = '' for i in string: encoded += str(ord(i)) + ':' return encoded[:-1] def get_swap_threshold_tuple(string): # re (Num %, True) or (Num KiB, False) """Returns KiB value if abs val was set in config, or tuple with %""" # return tuple with abs and bool: (abs %, True) or (abs MiB, False) if string.endswith('%'): valid = string_to_float_convert_test(string[:-1]) if valid is None: errprint('somewhere swap unit is not float_%') exit(1) value = float(string[:-1].strip()) if value < 0 or value > 100: errprint('invalid value, must be from the range[0; 100] %') exit(1) return value, True elif string.endswith('M'): valid = string_to_float_convert_test(string[:-1]) if valid is None: errprint('somewhere swap unit is not float_M') exit(1) value = float(string[:-1].strip()) * 1024 if value < 0: errprint('invalid unit in config (negative value)') exit(1) return value, False else: errprint( 'Invalid config file. There are invalid units somewhere\nExit') exit(1) def find_cgroup_indexes(): """ Find cgroup-line positions in /proc/*/cgroup file. """ cgroup_v1_index = cgroup_v2_index = None with open('/proc/self/cgroup') as f: for index, line in enumerate(f): if ':name=' in line: cgroup_v1_index = index if line.startswith('0::'): cgroup_v2_index = index return cgroup_v1_index, cgroup_v2_index def pid_to_rss(pid): """ """ try: rss = int(rline1( '/proc/{}/statm'.format(pid)).split(' ')[1]) * SC_PAGESIZE except IndexError: rss = None except FileNotFoundError: rss = None except ProcessLookupError: rss = None return rss def pid_to_vm_size(pid): """ """ try: vm_size = int(rline1( '/proc/{}/statm'.format(pid)).partition(' ')[0]) * SC_PAGESIZE except IndexError: vm_size = None except FileNotFoundError: vm_size = None except ProcessLookupError: vm_size = None return vm_size def signal_handler(signum, frame): """ """ for i in sig_list: signal(i, signal_handler_inner) log('Signal handler called with the {} signal '.format( sig_dict[signum])) update_stat_dict_and_print(None) log('Exit') exit() def signal_handler_inner(signum, frame): """ """ log('Signal handler called with the {} signal (ignored) '.format( sig_dict[signum])) def exe(cmd): """ """ log('Execute the command: {}'.format(cmd)) t0 = time() write_self_oom_score_adj(self_oom_score_adj_max) err = os.system(cmd) write_self_oom_score_adj(self_oom_score_adj_min) dt = time() - t0 log('Exit status: {}; exe duration: {} sec'.format(err, round(dt, 3))) return err def write(path, string): """ """ with open(path, 'w') as f: f.write(string) def write_self_oom_score_adj(new_value): """ """ if root: write('/proc/self/oom_score_adj', new_value) def valid_re(reg_exp): """Validate regular expression. """ try: search(reg_exp, '') except invalid_re: log('Invalid config: invalid regexp: {}'.format(reg_exp)) exit(1) def func_print_proc_table(): """ """ print_proc_table = True find_victim(print_proc_table) exit() def log(*msg): """ """ try: print(*msg) except OSError: sleep(0.01) if separate_log: try: logging.info(*msg) except OSError: sleep(0.01) def print_version(): """ """ try: v = rline1('/etc/nohang/version') except FileNotFoundError: v = None if v is None: print('nohang unknown version') else: print('nohang ' + v) exit() def pid_to_cgroup_v1(pid): """ """ cgroup_v1 = '' try: with open('/proc/' + pid + '/cgroup') as f: for index, line in enumerate(f): if index == cgroup_v1_index: cgroup_v1 = '/' + line.partition('/')[2][:-1] return cgroup_v1 except FileNotFoundError: return '' def pid_to_cgroup_v2(pid): """ """ cgroup_v2 = '' try: with open('/proc/' + pid + '/cgroup') as f: for index, line in enumerate(f): if index == cgroup_v2_index: cgroup_v2 = line[3:-1] return cgroup_v2 except FileNotFoundError: return '' def pid_to_starttime(pid): """ handle FNF error! """ try: starttime = rline1('/proc/' + pid + '/stat').rpartition(')')[ 2].split(' ')[20] except UnicodeDecodeError: with open('/proc/' + pid + '/stat', 'rb') as f: starttime = f.read().decode('utf-8', 'ignore').rpartition( ')')[2].split(' ')[20] return float(starttime) / SC_CLK_TCK def get_victim_id(pid): """victim_id is starttime + pid""" try: return rline1('/proc/' + pid + '/stat').rpartition( ')')[2].split(' ')[20] + '_pid' + pid except FileNotFoundError: return '' except ProcessLookupError: return '' def pid_to_state(pid): """ """ try: with open('/proc/' + pid + '/stat', 'rb') as f: return f.read(20).decode('utf-8', 'ignore').rpartition(')')[2][1] except FileNotFoundError: return '' except ProcessLookupError: return '' def pid_to_name(pid): """ """ try: with open('/proc/' + pid + '/comm', 'rb') as f: return f.read().decode('utf-8', 'ignore')[:-1] except FileNotFoundError: return '' except ProcessLookupError: return '' def pid_to_ppid(pid): """ """ try: with open('/proc/' + pid + '/status') as f: for n, line in enumerate(f): if n is ppid_index: return line.split('\t')[1].strip() except FileNotFoundError: return '' except ProcessLookupError: return '' except UnicodeDecodeError: with open('/proc/' + pid + '/status', 'rb') as f: f_list = f.read().decode('utf-8', 'ignore').split('\n') for i in range(len(f_list)): if i is ppid_index: return f_list[i].split('\t')[1] def pid_to_ancestry(pid, max_victim_ancestry_depth=1): """ """ if max_victim_ancestry_depth == 1: ppid = pid_to_ppid(pid) pname = pid_to_name(ppid) return '\n PPID: {} ({})'.format(ppid, pname) if max_victim_ancestry_depth == 0: return '' anc_list = [] for i in range(max_victim_ancestry_depth): ppid = pid_to_ppid(pid) pname = pid_to_name(ppid) anc_list.append((ppid, pname)) if ppid == '1': break pid = ppid a = '' for i in anc_list: a = a + ' <= PID {} ({})'.format(i[0], i[1]) return '\n Ancestry: ' + a[4:] def pid_to_cmdline(pid): """ Get process cmdline by pid. pid: str pid of required process returns string cmdline """ try: with open('/proc/' + pid + '/cmdline') as f: return f.read().replace('\x00', ' ').rstrip() except FileNotFoundError: return '' def pid_to_environ(pid): """ Get process environ by pid. pid: str pid of required process returns string environ """ try: with open('/proc/' + pid + '/environ') as f: return f.read().replace('\x00', ' ').rstrip() except FileNotFoundError: return '' def pid_to_realpath(pid): """ """ try: return os.path.realpath('/proc/' + pid + '/exe') except FileNotFoundError: return '' def pid_to_uid(pid): """return euid""" try: with open('/proc/' + pid + '/status') as f: for n, line in enumerate(f): if n is uid_index: return line.split('\t')[2] except UnicodeDecodeError: with open('/proc/' + pid + '/status', 'rb') as f: f_list = f.read().decode('utf-8', 'ignore').split('\n') return f_list[uid_index].split('\t')[2] except FileNotFoundError: return '' def pid_to_badness(pid): """Find and modify badness (if it needs).""" try: oom_score = int(rline1('/proc/' + pid + '/oom_score')) badness = oom_score if ignore_positive_oom_score_adj: oom_score_adj = int(rline1('/proc/' + pid + '/oom_score_adj')) if oom_score_adj > 0: badness = badness - oom_score_adj if regex_matching: name = pid_to_name(pid) for re_tup in badness_adj_re_name_list: if search(re_tup[1], name) is not None: badness += int(re_tup[0]) if re_match_cgroup_v1: cgroup_v1 = pid_to_cgroup_v1(pid) for re_tup in badness_adj_re_cgroup_v1_list: if search(re_tup[1], cgroup_v1) is not None: badness += int(re_tup[0]) if re_match_cgroup_v2: cgroup_v2 = pid_to_cgroup_v2(pid) for re_tup in badness_adj_re_cgroup_v2_list: if search(re_tup[1], cgroup_v2) is not None: badness += int(re_tup[0]) if re_match_realpath: realpath = pid_to_realpath(pid) for re_tup in badness_adj_re_realpath_list: if search(re_tup[1], realpath) is not None: badness += int(re_tup[0]) if re_match_cmdline: cmdline = pid_to_cmdline(pid) for re_tup in badness_adj_re_cmdline_list: if search(re_tup[1], cmdline) is not None: badness += int(re_tup[0]) if re_match_environ: environ = pid_to_environ(pid) for re_tup in badness_adj_re_environ_list: if search(re_tup[1], environ) is not None: badness += int(re_tup[0]) if re_match_uid: uid = pid_to_uid(pid) for re_tup in badness_adj_re_uid_list: if search(re_tup[1], uid) is not None: badness += int(re_tup[0]) if forbid_negative_badness: if badness < 0: badness = 0 return badness, oom_score except FileNotFoundError: return None, None except ProcessLookupError: return None, None def pid_to_status(pid): """ """ try: with open('/proc/' + pid + '/status') as f: for n, line in enumerate(f): if n == 0: name = line.split('\t')[1][:-1] if n is state_index: state = line.split('\t')[1][0] continue if n is ppid_index: ppid = line.split('\t')[1][:-1] continue if n is uid_index: uid = line.split('\t')[2] continue if n is vm_size_index: vm_size = kib_to_mib(int(line.split('\t')[1][:-4])) continue if n is vm_rss_index: vm_rss = kib_to_mib(int(line.split('\t')[1][:-4])) continue if n is vm_swap_index: vm_swap = kib_to_mib(int(line.split('\t')[1][:-4])) break return name, state, ppid, uid, vm_size, vm_rss, vm_swap except UnicodeDecodeError: return pid_to_status_unicode(pid) except FileNotFoundError: return None except ProcessLookupError: return None except ValueError: return None def pid_to_status_unicode(pid): """ """ try: with open('/proc/' + pid + '/status', 'rb') as f: f_list = f.read().decode('utf-8', 'ignore').split('\n') for i in range(len(f_list)): if i == 0: name = f_list[i].split('\t')[1] if i is state_index: state = f_list[i].split('\t')[1][0] if i is ppid_index: ppid = f_list[i].split('\t')[1] if i is uid_index: uid = f_list[i].split('\t')[2] if i is vm_size_index: vm_size = kib_to_mib( int(f_list[i].split('\t')[1][:-3])) if i is vm_rss_index: vm_rss = kib_to_mib(int(f_list[i].split('\t')[1][:-3])) if i is vm_swap_index: vm_swap = kib_to_mib(int(f_list[i].split('\t')[1][:-3])) return name, state, ppid, uid, vm_size, vm_rss, vm_swap except FileNotFoundError: return None except ProcessLookupError: return None except ValueError: return None def uptime(): """ """ return float(rline1('/proc/uptime').split(' ')[0]) def errprint(*text): """ """ print(*text, file=stderr, flush=True) def mlockall(): """Lock all memory to prevent swapping nohang process.""" MCL_CURRENT = 1 MCL_FUTURE = 2 MCL_ONFAULT = 4 libc = CDLL('libc.so.6', use_errno=True) result = libc.mlockall( MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT ) if result != 0: result = libc.mlockall( MCL_CURRENT | MCL_FUTURE ) if result != 0: log('WARNING: cannot lock all memory') else: pass # log('All memory locked with MCL_CURRENT | MCL_FUTURE') else: pass # log('All memory locked with MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT') def update_stat_dict_and_print(key): """ """ if key is not None: if key not in stat_dict: stat_dict.update({key: 1}) else: new_value = stat_dict[key] + 1 stat_dict.update({key: new_value}) if print_statistics: stats_msg = 'Total stat (what happened in the last {}):'.format( format_time(time() - start_time)) for i in stat_dict: stats_msg += '\n {}: {}'.format(i, stat_dict[i]) log(stats_msg) def find_psi_metrics_value(psi_path, psi_metrics): """ """ if psi_support: if psi_metrics == 'some_avg10': return float(rline1(psi_path).split(' ')[1].split('=')[1]) if psi_metrics == 'some_avg60': return float(rline1(psi_path).split(' ')[2].split('=')[1]) if psi_metrics == 'some_avg300': return float(rline1(psi_path).split(' ')[3].split('=')[1]) if psi_metrics == 'full_avg10': with open(psi_path) as f: psi_list = f.readlines() return float(psi_list[1].split(' ')[1].split('=')[1]) if psi_metrics == 'full_avg60': with open(psi_path) as f: psi_list = f.readlines() return float(psi_list[1].split(' ')[2].split('=')[1]) if psi_metrics == 'full_avg300': with open(psi_path) as f: psi_list = f.readlines() return float(psi_list[1].split(' ')[3].split('=')[1]) def check_mem_and_swap(): """find mem_available, swap_total, swap_free""" with open('/proc/meminfo') as f: for n, line in enumerate(f): if n == 2: mem_available = int(line.split(':')[1][:-4]) continue if n is swap_total_index: swap_total = int(line.split(':')[1][:-4]) continue if n is swap_free_index: swap_free = int(line.split(':')[1][:-4]) break return mem_available, swap_total, swap_free def check_zram(): """find MemUsedZram""" disksize_sum = 0 mem_used_total_sum = 0 for dev in os.listdir('/sys/block'): if dev.startswith('zram'): stat = zram_stat(dev) disksize_sum += int(stat[0]) mem_used_total_sum += int(stat[1]) # Means that when setting zram disksize = 1 GiB available memory # decrease by 0.0042 GiB. # Found experimentally, requires clarification with different kernaels and # architectures. # On small disk drives (up to gigabyte) it can be more, up to 0.0045. # The creator of the zram module claims that ZRAM_DISKSIZE_FACTOR should # be 0.001: # ("zram uses about 0.1% of the size of the disk" # - https://www.kernel.org/doc/Documentation/blockdev/zram.txt), # but this statement contradicts the experimental data. # ZRAM_DISKSIZE_FACTOR = deltaMemAvailavle / disksize # Found experimentally. ZRAM_DISKSIZE_FACTOR = 0.0042 return (mem_used_total_sum + disksize_sum * ZRAM_DISKSIZE_FACTOR) / 1024.0 ''' def format_time(t): t = int(t) if t < 60: return '{} sec'.format(t) if t >= 60 and t < 3600: m = t // 60 s = t % 60 return '{} min {} sec'.format(m, s) h = t // 3600 s0 = t - h * 3600 m = s0 // 60 s = s0 % 60 return '{} h {} min {} sec'.format(h, m, s) ''' def format_time(t): t = int(t) if t < 60: return '{} sec'.format(t) if t > 3600: h = t // 3600 s0 = t - h * 3600 m = s0 // 60 s = s0 % 60 return '{} h {} min {} sec'.format(h, m, s) m = t // 60 s = t % 60 return '{} min {} sec'.format(m, s) def string_to_float_convert_test(string): """Try to interprete string values as floats.""" try: return float(string) except ValueError: return None def string_to_int_convert_test(string): """Try to interpret string values as integers.""" try: return int(string) except ValueError: return None def conf_parse_string(param): """ Get string parameters from the config dict. param: config_dict key returns config_dict[param].strip() """ if param in config_dict: return config_dict[param].strip() else: errprint('All the necessary parameters must be in the config') errprint('There is no "{}" parameter in the config'.format(param)) exit(1) def conf_parse_bool(param): """ Get bool parameters from the config_dict. param: config_dicst key returns bool """ if param in config_dict: param_str = config_dict[param] if param_str == 'True': return True elif param_str == 'False': return False else: errprint('Invalid value of the "{}" parameter.'.format(param)) errprint('Valid values are True and False.') errprint('Exit') exit(1) else: errprint('All the necessary parameters must be in the config') errprint('There is no "{}" parameter in the config'.format(param)) exit(1) def rline1(path): """read 1st line from path.""" try: with open(path) as f: for line in f: return line[:-1] except UnicodeDecodeError: with open(path, 'rb') as f: return f.read(999).decode( 'utf-8', 'ignore').split('\n')[0] # use partition()! def kib_to_mib(num): """Convert KiB values to MiB values.""" return round(num / 1024.0) def percent(num): """Interprete num as percentage.""" return round(num * 100, 1) def just_percent_mem(num): """convert num to percent and justify""" return str(round(num * 100, 1)).rjust(4, ' ') def just_percent_swap(num): """ """ return str(round(num * 100, 1)).rjust(5, ' ') def human(num, lenth): """Convert KiB values to MiB values with right alignment""" return str(round(num / 1024)).rjust(lenth, ' ') def zram_stat(zram_id): """ Get zram state. zram_id: str zram block-device id returns bytes disksize, str mem_used_total """ try: disksize = rline1('/sys/block/' + zram_id + '/disksize') except FileNotFoundError: return '0', '0' if disksize == ['0\n']: return '0', '0' try: mm_stat = rline1('/sys/block/' + zram_id + '/mm_stat').split(' ') mm_stat_list = [] for i in mm_stat: if i != '': mm_stat_list.append(i) mem_used_total = mm_stat_list[2] except FileNotFoundError: mem_used_total = rline1('/sys/block/' + zram_id + '/mem_used_total') return disksize, mem_used_total # BYTES, str def send_notify_warn(): """ Look for process with maximum 'badness' and warn user with notification. (implement Low memory warnings) """ log('Warning threshold exceeded') if check_warning_exe: exe(warning_exe) else: title = 'Low memory' body = 'MemAvail: {}%\nSwapFree: {}%'.format( round(mem_available / mem_total * 100), round(swap_free / (swap_total + 0.1) * 100) ) send_notification(title, body) def send_notify(threshold, name, pid): """ Notificate about OOM Preventing. threshold: key for notify_sig_dict name: str process name pid: str process pid """ title = 'Freeze prevention' body = '{} [{}] {}'.format( notify_sig_dict[threshold], pid, name.replace( # symbol '&' can break notifications in some themes, # therefore it is replaced by '*' '&', '*' ) ) send_notification(title, body) def send_notify_etc(pid, name, command): """ Notificate about OOM Preventing. command: str command that will be executed name: str process name pid: str process pid """ title = 'Freeze prevention' body = 'Victim is [{}] {}\nExecute the co' \ 'mmand:\n{}'.format( pid, name.replace('&', '*'), command.replace('&', '*')) send_notification(title, body) def send_notification(title, body): """ """ cmd = '{} "--euid={}" "--debug={}" "--title={}" "--body={}" &'.format( notify_helper_path, self_uid, debug_gui_notifications, title, encoder(body)) exe(cmd) def get_pid_list(): """ Find pid list expect kthreads and zombies """ pid_list = [] for pid in os.listdir('/proc'): if os.path.exists('/proc/' + pid + '/exe'): pid_list.append(pid) return pid_list def get_non_decimal_pids(): """ """ non_decimal_list = [] for pid in pid_list: if pid[0].isdecimal() is False: non_decimal_list.append(pid) return non_decimal_list def find_victim(_print_proc_table): """ Find the process with highest badness and its badness adjustment Return pid and badness """ ft1 = time() pid_list = get_pid_list() pid_list.remove(self_pid) if '1' in pid_list: pid_list.remove('1') non_decimal_list = get_non_decimal_pids() for i in non_decimal_list: if i in pid_list: pid_list.remove(i) pid_badness_list = [] if _print_proc_table: if extra_table_info == 'None': extra_table_title = '' elif extra_table_info == 'cgroup_v1': extra_table_title = 'CGroup_v1' elif extra_table_info == 'cgroup_v2': extra_table_title = 'CGroup_v2' elif extra_table_info == 'cmdline': extra_table_title = 'cmdline' elif extra_table_info == 'environ': extra_table_title = 'environ' elif extra_table_info == 'realpath': extra_table_title = 'realpath' else: extra_table_title = '' hr = '#' * 107 log(hr) log('# PID PPID badness oom_score oom_score_adj e' 'UID S VmSize VmRSS VmSwap Name {}'.format( extra_table_title)) log('#------- ------- ------- --------- ------------- -------' '--- - ------ ----- ------ ---------------') for pid in pid_list: badness = pid_to_badness(pid)[0] if badness is None: continue if _print_proc_table: try: oom_score = rline1('/proc/' + pid + '/oom_score') oom_score_adj = rline1('/proc/' + pid + '/oom_score_adj') except FileNotFoundError: continue if pid_to_status(pid) is None: continue else: (name, state, ppid, uid, vm_size, vm_rss, vm_swap) = pid_to_status(pid) if extra_table_info == 'None': extra_table_line = '' elif extra_table_info == 'cgroup_v1': extra_table_line = pid_to_cgroup_v1(pid) elif extra_table_info == 'cgroup_v2': extra_table_line = pid_to_cgroup_v2(pid) elif extra_table_info == 'cmdline': extra_table_line = pid_to_cmdline(pid) elif extra_table_info == 'environ': extra_table_line = pid_to_environ(pid) elif extra_table_info == 'realpath': extra_table_line = pid_to_realpath(pid) else: extra_table_line = '' log('#{} {} {} {} {} {} {} {} {} {} {} {}'.format( pid.rjust(7), ppid.rjust(7), str(badness).rjust(7), oom_score.rjust(9), oom_score_adj.rjust(13), uid.rjust(10), state, str(vm_size).rjust(6), str(vm_rss).rjust(5), str(vm_swap).rjust(6), name.ljust(15), extra_table_line ) ) pid_badness_list.append((pid, badness)) real_proc_num = len(pid_badness_list) # Make list of (pid, badness) tuples, sorted by 'badness' values # print(pid_badness_list) pid_tuple_list = sorted( pid_badness_list, key=itemgetter(1), reverse=True )[0] pid = pid_tuple_list[0] victim_id = get_victim_id(pid) # Get maximum 'badness' value victim_badness = pid_tuple_list[1] victim_name = pid_to_name(pid) if _print_proc_table: log(hr) log('Found {} processes with existing /proc/[pid]/exe realpath'.format( real_proc_num)) log( 'Process with highest badness (found in {} ms):\n PID: {}, Na' 'me: {}, badness: {}'.format( round((time() - ft1) * 1000), pid, victim_name, victim_badness ) ) return pid, victim_badness, victim_name, victim_id def find_victim_info(pid, victim_badness, name): """ """ status0 = time() try: with open('/proc/' + pid + '/status') as f: for n, line in enumerate(f): if n is state_index: state = line.split('\t')[1].rstrip() continue """ if n is ppid_index: # ppid = line.split('\t')[1] continue """ if n is uid_index: uid = line.split('\t')[2] continue if n is vm_size_index: vm_size = kib_to_mib(int(line.split('\t')[1][:-4])) continue if n is vm_rss_index: vm_rss = kib_to_mib(int(line.split('\t')[1][:-4])) continue if detailed_rss: if n is anon_index: anon_rss = kib_to_mib( int(line.split('\t')[1][:-4])) continue if n is file_index: file_rss = kib_to_mib( int(line.split('\t')[1][:-4])) continue if n is shmem_index: shmem_rss = kib_to_mib( int(line.split('\t')[1][:-4])) continue if n is vm_swap_index: vm_swap = kib_to_mib(int(line.split('\t')[1][:-4])) break if print_victim_cmdline: cmdline = pid_to_cmdline(pid) oom_score = rline1('/proc/' + pid + '/oom_score') oom_score_adj = rline1('/proc/' + pid + '/oom_score_adj') except FileNotFoundError: log('The victim died in the search process: FileNotFoundError') update_stat_dict_and_print( 'The victim died in the search process: FileNotFoundError') return None except ProcessLookupError: log('The victim died in the search process: ProcessLookupError') update_stat_dict_and_print( 'The victim died in the search process: ProcessLookupError') return None except UnicodeDecodeError: with open('/proc/' + pid + '/status', 'rb') as f: f_list = f.read().decode('utf-8', 'ignore').split('\n') for i in range(len(f_list)): if i is state_index: state = f_list[i].split('\t')[1].rstrip() """ if i is ppid_index: pass # ppid = f_list[i].split('\t')[1] """ if i is uid_index: uid = f_list[i].split('\t')[2] if i is vm_size_index: vm_size = kib_to_mib( int(f_list[i].split('\t')[1][:-3])) if i is vm_rss_index: vm_rss = kib_to_mib(int(f_list[i].split('\t')[1][:-3])) if detailed_rss: if i is anon_index: anon_rss = kib_to_mib( int(f_list[i].split('\t')[1][:-3])) if i is file_index: file_rss = kib_to_mib( int(f_list[i].split('\t')[1][:-3])) if i is shmem_index: shmem_rss = kib_to_mib( int(f_list[i].split('\t')[1][:-3])) if i is vm_swap_index: vm_swap = kib_to_mib( int(f_list[i].split('\t')[1][:-3])) if print_victim_cmdline: cmdline = pid_to_cmdline(pid) oom_score = rline1('/proc/' + pid + '/oom_score') oom_score_adj = rline1('/proc/' + pid + '/oom_score_adj') except IndexError: log('The victim died in the search process: IndexError') update_stat_dict_and_print( 'The victim died in the search process: IndexError') return None except ValueError: log('The victim died in the search process: ValueError') update_stat_dict_and_print( 'The victim died in the search process: ValueError') return None except FileNotFoundError: log('The victim died in the search process: FileNotFoundError') update_stat_dict_and_print( 'The victim died in the search process: FileNotFoundError') return None except ProcessLookupError: log('The victim died in the search process: ProcessLookupError') update_stat_dict_and_print( 'The victim died in the search process: ProcessLookupError') return None len_vm = len(str(vm_size)) try: realpath = os.path.realpath('/proc/' + pid + '/exe') victim_lifetime = format_time(uptime() - pid_to_starttime(pid)) victim_cgroup_v1 = pid_to_cgroup_v1(pid) victim_cgroup_v2 = pid_to_cgroup_v2(pid) except FileNotFoundError: log('The victim died in the search process: FileNotFoundError') update_stat_dict_and_print( 'The victim died in the search process: FileNotFoundError') return None ancestry = pid_to_ancestry(pid, max_victim_ancestry_depth) if print_victim_cmdline is False: cmdline = '' c1 = '' else: c1 = '\n Cmdline: ' if detailed_rss: detailed_rss_info = ' (' \ 'Anon: {} MiB, ' \ 'File: {} MiB, ' \ 'Shmem: {} MiB)'.format( anon_rss, file_rss, shmem_rss) else: detailed_rss_info = '' victim_info = 'Victim status (found in {} ms):' \ '\n Name: {}' \ '\n State: {}' \ '\n PID: {}' \ '{}' \ '\n EUID: {}' \ '\n badness: {}, ' \ 'oom_score: {}, ' \ 'oom_score_adj: {}' \ '\n VmSize: {} MiB' \ '\n VmRSS: {} MiB {}' \ '\n VmSwap: {} MiB' \ '\n CGroup_v1: {}' \ '\n CGroup_v2: {}' \ '\n Realpath: {}' \ '{}{}' \ '\n Lifetime: {}'.format( round((time() - status0) * 1000), name, state, pid, ancestry, uid, victim_badness, oom_score, oom_score_adj, vm_size, str(vm_rss).rjust(len_vm), detailed_rss_info, str(vm_swap).rjust(len_vm), victim_cgroup_v1, victim_cgroup_v2, realpath, c1, cmdline, victim_lifetime) return victim_info def check_mem_swap_ex(): """ Check: is mem and swap threshold exceeded? Return: None, (SIGTERM, meminfo), (SIGKILL, meminfo) """ mem_available, swap_total, swap_free = check_mem_and_swap() # if hard_threshold_min_swap is set in percent if swap_kill_is_percent: hard_threshold_min_swap_kb = swap_total * hard_threshold_min_swap_percent / 100.0 else: hard_threshold_min_swap_kb = swap_kb_dict['hard_threshold_min_swap_kb'] if swap_term_is_percent: soft_threshold_min_swap_kb = swap_total * soft_threshold_min_swap_percent / 100.0 else: soft_threshold_min_swap_kb = swap_kb_dict['soft_threshold_min_swap_kb'] if swap_warn_is_percent: warning_threshold_min_swap_kb = swap_total * warning_threshold_min_swap_percent / 100.0 else: warning_threshold_min_swap_kb = swap_kb_dict['warning_threshold_min_swap_kb'] if swap_total > hard_threshold_min_swap_kb: swap_sigkill_pc = percent(hard_threshold_min_swap_kb / (swap_total + 0.1)) else: swap_sigkill_pc = '-' if swap_total > soft_threshold_min_swap_kb: swap_sigterm_pc = percent(soft_threshold_min_swap_kb / (swap_total + 0.1)) else: swap_sigterm_pc = '-' if (mem_available <= hard_threshold_min_mem_kb and swap_free <= hard_threshold_min_swap_kb): mem_info = 'Memory status that requ' \ 'ires corrective actions (hard threshold exceeded):' \ '\n MemAvailable [{} MiB, {} %] <= mem_min_sig' \ 'kill [{} MiB, {} %]\n SwapFree [{} MiB, {} %] <= swa' \ 'p_min_sigkill [{} MiB, {} %]'.format( kib_to_mib(mem_available), percent(mem_available / mem_total), kib_to_mib(hard_threshold_min_mem_kb), percent(hard_threshold_min_mem_kb / mem_total), kib_to_mib(swap_free), percent(swap_free / (swap_total + 0.1)), kib_to_mib(hard_threshold_min_swap_kb), swap_sigkill_pc) return (SIGKILL, mem_info, mem_available, hard_threshold_min_swap_kb, soft_threshold_min_swap_kb, swap_free, swap_total) if (mem_available <= soft_threshold_min_mem_kb and swap_free <= soft_threshold_min_swap_kb): mem_info = 'Memory status that requi' \ 'res corrective actions (soft threshold exceeded):' \ '\n MemAvailable [{} MiB, {} %] <= mem_min_sig' \ 'term [{} MiB, {} %]\n SwapFree [{} MiB, {} %] <= swa' \ 'p_min_sigterm [{} MiB, {} %]'.format( kib_to_mib(mem_available), percent(mem_available / mem_total), kib_to_mib(soft_threshold_min_mem_kb), round(soft_threshold_min_mem_percent, 1), kib_to_mib(swap_free), percent(swap_free / (swap_total + 0.1)), kib_to_mib(soft_threshold_min_swap_kb), swap_sigterm_pc) return (SIGTERM, mem_info, mem_available, hard_threshold_min_swap_kb, soft_threshold_min_swap_kb, swap_free, swap_total) if low_memory_warnings_enabled: if (mem_available <= warning_threshold_min_mem_kb and swap_free <= warning_threshold_min_swap_kb + 0.1): return ('WARN', None, mem_available, hard_threshold_min_swap_kb, soft_threshold_min_swap_kb, swap_free, swap_total) return (None, None, mem_available, hard_threshold_min_swap_kb, soft_threshold_min_swap_kb, swap_free, swap_total) def check_zram_ex(): """ """ mem_used_zram = check_zram() if mem_used_zram >= hard_threshold_max_zram_kb: mem_info = 'Memory status that requir' \ 'es corrective actions (hard threshold exceeded):' \ '\n MemUsedZram [{} MiB, {} %] >= zram_max_sig' \ 'kill [{} MiB, {} %]'.format( kib_to_mib(mem_used_zram), percent(mem_used_zram / mem_total), kib_to_mib(hard_threshold_max_zram_kb), percent(hard_threshold_max_zram_kb / mem_total)) return SIGKILL, mem_info, mem_used_zram if mem_used_zram >= soft_threshold_max_zram_kb: mem_info = 'Memory status that requires corrective actions (soft th' \ 'reshold exceeded):\n MemUsedZram [{} MiB, {} %] >= zram_max_s' \ 'igterm [{} M, {} %]'.format( kib_to_mib(mem_used_zram), percent(mem_used_zram / mem_total), kib_to_mib(soft_threshold_max_zram_kb), percent(soft_threshold_max_zram_kb / mem_total)) return SIGTERM, mem_info, mem_used_zram if low_memory_warnings_enabled: if mem_used_zram >= warning_threshold_max_zram_kb: return 'WARN', None, mem_used_zram return None, None, mem_used_zram def check_psi_ex(psi_t0, psi_kill_exceeded_timer, psi_term_exceeded_timer, x0): """ """ delta0 = time() - x0 x0 = time() psi_avg_value = find_psi_metrics_value(psi_path, psi_metrics) # print(psi_avg_value) psi_post_action_delay_timer = time() - last_action_dict['t'] # psi_t0 if psi_post_action_delay_timer >= psi_post_action_delay: psi_post_action_delay_exceeded = True else: psi_post_action_delay_exceeded = False if psi_avg_value >= hard_threshold_max_psi: sigkill_psi_exceeded = True psi_kill_exceeded_timer += delta0 else: sigkill_psi_exceeded = False psi_kill_exceeded_timer = 0 if debug_psi: log('psi_post_action_delay_timer: {}'.format( round(psi_post_action_delay_timer, 3))) log('psi_post_action_delay_exceeded: {}\nsigkill_psi_exceeded' ': {}\npsi_kill_exceeded_timer: {}'.format( psi_post_action_delay_exceeded, sigkill_psi_exceeded, round(psi_kill_exceeded_timer, 1) ) ) if (psi_kill_exceeded_timer >= psi_excess_duration and psi_post_action_delay_exceeded): mem_info = 'PSI avg ({}) > hard_threshold_max_psi ({})\n' \ 'PSI avg exceeded psi_excess_duration (value' \ ' = {} sec) for {} seconds'.format( psi_avg_value, hard_threshold_max_psi, psi_excess_duration, round(psi_kill_exceeded_timer, 1) ) return (SIGKILL, mem_info, psi_t0, psi_kill_exceeded_timer, psi_term_exceeded_timer, x0) if psi_avg_value >= soft_threshold_max_psi: sigterm_psi_exceeded = True psi_term_exceeded_timer += delta0 else: sigterm_psi_exceeded = False psi_term_exceeded_timer = 0 if debug_psi: log('sigterm_psi_exceeded: {}\n' 'psi_term_exceeded_timer: {}\n'.format( sigterm_psi_exceeded, round(psi_term_exceeded_timer, 1) ) ) if (psi_term_exceeded_timer >= psi_excess_duration and psi_post_action_delay_exceeded): mem_info = 'PSI avg ({}) > soft_threshold_max_psi ({})\n' \ 'PSI avg exceeded psi_excess_duration (value' \ ' = {} sec) for {} seconds'.format( psi_avg_value, soft_threshold_max_psi, psi_excess_duration, round(psi_term_exceeded_timer, 1) ) return (SIGTERM, mem_info, psi_t0, psi_kill_exceeded_timer, psi_term_exceeded_timer, x0) if low_memory_warnings_enabled: if psi_avg_value >= warning_threshold_max_psi: return ('WARN', None, psi_t0, psi_kill_exceeded_timer, psi_term_exceeded_timer, x0) return (None, None, psi_t0, psi_kill_exceeded_timer, psi_term_exceeded_timer, x0) def is_victim_alive(victim_id): """ We do not have a reliable sign of the end of the release of memory: https://github.com/rfjakob/earlyoom/issues/128#issuecomment-507023717 Варианты возврата: 0 X, nonexist, другой процесс (полн конец имплементации, можно не делать POST SIGKILL DELAY) 1 rp true 2 R освобождает память. Ждем смерти. 3 Z возможно уже освободил память. Конец отслеживания """ # Проверка целостности жертвы starttime, pid = victim_id.split('_pid') new_victim_id = get_victim_id(pid) if victim_id != new_victim_id: return 0 # Жива ли жертва? exe_exists = os.path.exists('/proc/{}/exe'.format(pid)) if exe_exists: return 1 # далее жертва смертельно ранена. Дифференцируемся по State. # R -> 2 # отслеживать жертву дальше # X, FNFE, PLE -> 0 state = pid_to_state(pid) if state == 'R': return 2 if state == 'Z': return 3 if state == 'X' or state == '': return 0 return 0 def implement_corrective_action( threshold, mem_info_list, psi_t0, psi_kill_exceeded_timer, psi_term_exceeded_timer, x0, psi_threshold, zram_threshold, zram_info, psi_info): log('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>') debug_corrective_action = True time0 = time() # 1. Очистка словаря от мертвых. Итерация по словарю, отслеживание умирающих. # 2. Итерация по оставшемуся словарю. Поиск дельт. Если хоть у одного # дельта НЕ истекла - ЖДЕМ, выход из фции. # print(v_dict) nu = [] for victim_id in v_dict: iva = is_victim_alive(victim_id) #print(iva, victim_id) if iva == 0 or iva == 3: nu.append(victim_id) """ continue if iva == 1: continue if iva == 2: pass # быстро отследить умирающего """ for i in nu: if debug_corrective_action: log('Remove {} from v_dict'.format(i)) v_dict.pop(i) x = False cache_list = [] #cache_list.append(('foo', 0.01)) #cache_list.append(('boo', 1111.01)) # 2 # print(v_dict) for victim_id in v_dict: tx = v_dict[victim_id]['time'] ddt = time() - tx if ddt < victim_cache_time: if debug_corrective_action: log( 'victim_cache_time is not exceeded for {} ({} < {})'.format( victim_id, round(ddt, 3), victim_cache_time ) ) x = True cache_list.append((victim_id, ddt)) break if x: # print(cache_list) e = sorted(cache_list, key=itemgetter(1), reverse=False) cached_victim_id = e[0][0] for i in mem_info_list: log(i) if x: victim_id = cached_victim_id pid = victim_id.partition('_pid')[2] victim_badness = pid_to_badness(pid)[0] name = v_dict[victim_id]['name'] log('New victim is cached victim {} ({})'.format(pid, name)) else: pid, victim_badness, name, victim_id = find_victim(print_proc_table) log('Recheck memory levels...') (masf_threshold, masf_info, mem_available, hard_threshold_min_swap_kb, soft_threshold_min_swap_kb, swap_free, swap_total) = check_mem_swap_ex() if CHECK_ZRAM: zram_threshold, zram_info, mem_used_zram = check_zram_ex() if CHECK_PSI: (psi_threshold, psi_info, psi_t0, psi_kill_exceeded_timer, psi_term_exceeded_timer, x0) = check_psi_ex( psi_t0, psi_kill_exceeded_timer, psi_term_exceeded_timer, x0) if (masf_threshold is SIGKILL or zram_threshold is SIGKILL or psi_threshold is SIGKILL): new_threshold = SIGKILL mem_info_list = [] if masf_threshold is SIGKILL or masf_threshold is SIGTERM: mem_info_list.append(masf_info) if zram_threshold is SIGKILL or zram_threshold is SIGTERM: mem_info_list.append(zram_info) if psi_threshold is SIGKILL or psi_threshold is SIGTERM: mem_info_list.append(psi_info) elif (masf_threshold is SIGTERM or zram_threshold is SIGTERM or psi_threshold is SIGTERM): new_threshold = SIGTERM mem_info_list = [] if masf_threshold is SIGKILL or masf_threshold is SIGTERM: mem_info_list.append(masf_info) if zram_threshold is SIGKILL or zram_threshold is SIGTERM: mem_info_list.append(zram_info) if psi_threshold is SIGKILL or psi_threshold is SIGTERM: mem_info_list.append(psi_info) else: log('Thresholds is not exceeded now') return psi_t0 for i in mem_info_list: log(i) if new_threshold is None or new_threshold == 'WARN': log('Thresholds is not exceeded now') return psi_t0 threshold = new_threshold vwd = None # Victim Will Die if victim_badness >= min_badness: if threshold is SIGTERM: if victim_id in v_dict: dt = time() - v_dict[victim_id]['time'] if dt > max_soft_exit_time: log('max_soft_exit_time is exceeded: the ' 'victim will get SIGKILL') threshold = SIGKILL else: log('max_soft_exit_time is not exceeded (' '{} < {}) for the victim'.format(round( dt, 1), max_soft_exit_time)) if debug_sleep: log('Sleep {} sec (over_sleep)'.format(over_sleep)) sleep(over_sleep) log('<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<') return psi_t0 # log('Try to implement a corrective action...') if print_victim_status: # victim badness ищи снова, не полагайся на старое victim_info = find_victim_info(pid, victim_badness, name) log(victim_info) soft_match = False if soft_actions and threshold is SIGTERM: name = pid_to_name(pid) cgroup_v1 = pid_to_cgroup_v1(pid) service = '' cgroup_v1_tail = cgroup_v1.rpartition('/')[2] if cgroup_v1_tail.endswith('.service'): service = cgroup_v1_tail for i in soft_actions_list: unit = i[0] if unit == 'name': u = name else: u = cgroup_v1 regexp = i[1] command = i[2] if search(regexp, u) is not None: log("Regexp '{}' matches with {} '{}'".format( regexp, unit, u)) soft_match = True break if soft_match: cmd = command.replace('$PID', pid).replace('$NAME', pid_to_name( pid)).replace('$SERVICE', service) exit_status = exe(cmd) """ if exit_status == 0: success = True else: success = False """ response_time = time() - time0 preventing_oom_message = 'Implement a corrective act' \ 'ion:\n Run the command: {}' \ '\n Exit status: {}; total response ' \ 'time: {} ms'.format( cmd, exit_status, round(response_time * 1000)) else: try: os.kill(int(pid), threshold) response_time = time() - time0 send_result = 'total response time: {} ms'.format( round(response_time * 1000)) preventing_oom_message = 'Implement a corrective action:' \ '\n Send {} to the victim; {}'.format( sig_dict[threshold], send_result) # success = True if threshold is SIGKILL: vwd = True except FileNotFoundError: vwd = True # success = False # response_time = time() - time0 # send_result = 'no such process; response time: {} ms'.format(round(response_time * 1000)) key = 'The victim died in the search process: ' \ 'FileNotFoundError' except ProcessLookupError: vwd = True # success = False # response_time = time() - time0 # send_result = 'no such process; response time: {} ms'.format(round(response_time * 1000)) key = 'The victim died in the search process: ' \ 'ProcessLookupError' try: log(preventing_oom_message) except UnboundLocalError: pass # preventing_oom_message = key if not vwd: if victim_id not in v_dict: v_dict[victim_id] = dict() v_dict[victim_id]['time'] = time() v_dict[victim_id]['name'] = name else: pass last_action_dict['t'] = kill_timestamp = time() # print(v_dict) # response_time = time() - time0 # log('success: ' + str(success)) # log('victim will die: ' + str(vwd)) # log('response_time: ' + str(response_time) + ' sec') # НАЧАЛО ОТСЛЕЖИВАНИЯ СОСТОЯНИЯ ЖЕРТВЫ. Можно вынести в отд фц. Приним # айди, логирует, возвращает что-то. # Далее поработать со словарями. Жертва тут умерла - сброс таймера. Все # старые жертвы умерли до 3х секунд с следующих циклах - сброс таймера. # После этого все должно быть супер охуенно. while True: sleep(0.005) d = time() - kill_timestamp #print('Прошло времени:', d) iva = is_victim_alive(victim_id) if iva == 0: log('The victim died in {} sec'.format(round(d, 3))) if victim_id in v_dict: v_dict.pop(victim_id) break elif iva == 1: #print('Жива и занимает память') if not vwd and d > sensitivity_test_time: log("The victim doesn't respond on corrective action in {} sec".format( round(d, 3))) break elif iva == 2: pass #print('Смертельно ранена и освобождает память. Дождаться окончания освобождения памяти.') else: # 3 #print('Z и быстро освобождает память, если еще не. Поспать немножно и выйти из цикла.') log('The victim became a zombie in {} sec'.format(round(d, 3))) if victim_id in v_dict: v_dict.pop(victim_id) sleep(post_zombie_delay) break mem_available, swap_total, swap_free = check_mem_and_swap() ma_mib = int(mem_available) / 1024.0 sf_mib = int(swap_free) / 1024.0 log('Memory status after implementing a corrective act' 'ion:\n MemAvailable' ': {} MiB, SwapFree: {} MiB'.format( round(ma_mib, 1), round(sf_mib, 1))) if soft_match is False: key = 'Send {} to {}'.format(sig_dict[threshold], name) update_stat_dict_and_print(key) else: key = "Run the command '{}'".format(command) update_stat_dict_and_print(key) if threshold is SIGKILL and post_kill_exe != '': cmd = post_kill_exe.replace('$PID', pid).replace( '$NAME', pid_to_name(pid)) log('Execute post_kill_exe') exe(cmd) if post_action_gui_notifications: if soft_match: send_notify_etc(pid, name, cmd) else: send_notify(threshold, name, pid) else: response_time = time() - time0 victim_badness_is_too_small = 'victim badness ({}) < min_b' \ 'adness ({}); nothing to do; response time: {} ms'.format( victim_badness, min_badness, round(response_time * 1000)) log(victim_badness_is_too_small) # update stat_dict key = 'victim badness < min_badness' update_stat_dict_and_print(key) if vwd is None: if debug_sleep: log('Sleep {} sec (over_sleep)'.format(over_sleep)) sleep(over_sleep) log('<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<') return psi_t0 def sleep_after_check_mem(): """Specify sleep times depends on rates and avialable memory.""" if stable_sleep: if debug_sleep: log('Sleep {} sec'.format(min_sleep)) stdout.flush() sleep(min_sleep) return None if hard_threshold_min_mem_kb < soft_threshold_min_mem_kb: mem_point = mem_available - soft_threshold_min_mem_kb else: mem_point = mem_available - hard_threshold_min_mem_kb if hard_threshold_min_swap_kb < soft_threshold_min_swap_kb: swap_point = swap_free - soft_threshold_min_swap_kb else: swap_point = swap_free - hard_threshold_min_swap_kb if swap_point < 0: swap_point = 0 if mem_point < 0: mem_point = 0 t_mem = mem_point / fill_rate_mem t_swap = swap_point / fill_rate_swap if CHECK_ZRAM: t_zram = (mem_total * 0.8 - mem_used_zram) / fill_rate_zram if t_zram < 0: t_zram = 0 t_mem_zram = t_mem + t_zram z = ', t_zram={}'.format(round(t_zram, 2)) else: z = '' t_mem_swap = t_mem + t_swap if CHECK_ZRAM: if t_mem_swap <= t_mem_zram: t = t_mem_swap else: t = t_mem_zram else: t = t_mem_swap if t > max_sleep: t = max_sleep elif t < min_sleep: t = min_sleep else: pass if debug_sleep: log('Sleep {} sec (t_mem={}, t_swap={}{})'.format(round(t, 2), round( t_mem, 2), round(t_swap, 2), z)) try: stdout.flush() except OSError: pass sleep(t) def calculate_percent(arg_key): """ parse conf dict Calculate mem_min_KEY_percent. Try use this one) arg_key: str key for config_dict returns int mem_min_percent or NoneType if got some error """ if arg_key in config_dict: mem_min = config_dict[arg_key] if mem_min.endswith('%'): # truncate percents, so we have a number mem_min_percent = mem_min[:-1].strip() # then 'float test' mem_min_percent = string_to_float_convert_test(mem_min_percent) if mem_min_percent is None: errprint('Invalid {} value, not float\nExit'.format(arg_key)) exit(1) # Final validations... if mem_min_percent < 0 or mem_min_percent > 100: errprint( '{}, as percents value, out of ran' 'ge [0; 100]\nExit'.format(arg_key)) exit(1) # soft_threshold_min_mem_percent is clean and valid float percentage. Can # translate into Kb mem_min_kb = mem_min_percent / 100 * mem_total mem_min_mb = round(mem_min_kb / 1024) elif mem_min.endswith('M'): mem_min_mb = string_to_float_convert_test(mem_min[:-1].strip()) if mem_min_mb is None: errprint('Invalid {} value, not float\nExit'.format(arg_key)) exit(1) mem_min_kb = mem_min_mb * 1024 if mem_min_kb > mem_total: errprint( '{} value can not be greater then MemT' 'otal ({} MiB)\nExit'.format( arg_key, round( mem_total / 1024))) exit(1) mem_min_percent = mem_min_kb / mem_total * 100 else: log('Invalid {} units in config.\n Exit'.format(arg_key)) exit(1) mem_min_percent = None else: log('{} not in config\nExit'.format(arg_key)) exit(1) mem_min_percent = None return mem_min_kb, mem_min_mb, mem_min_percent ########################################################################## # {victim_id : {'time': timestamp, 'name': name} v_dict = dict() start_time = time() help_mess = """usage: nohang [-h] [-v] [-p] [-c CONFIG] [-cc CONFIG] optional arguments: -h, --help show this help message and exit -v, --version print version -p, --print-proc-table print table of processes with their badness values -c CONFIG, --config CONFIG path to the config file, default values: ./nohang.conf, /etc/nohang/nohang.conf -cc CONFIG, --check-config CONFIG check and print config""" SC_CLK_TCK = os.sysconf(os.sysconf_names['SC_CLK_TCK']) SC_PAGESIZE = os.sysconf(os.sysconf_names['SC_PAGESIZE']) conf_err_mess = 'Invalid config. Exit.' sig_list = [SIGTERM, SIGINT, SIGQUIT, SIGHUP] sig_dict = { SIGKILL: 'SIGKILL', SIGINT: 'SIGINT', SIGQUIT: 'SIGQUIT', SIGHUP: 'SIGHUP', SIGTERM: 'SIGTERM' } self_pid = str(os.getpid()) self_uid = os.geteuid() if self_uid == 0: root = True else: root = False if os.path.exists('./nohang_notify_helper'): notify_helper_path = './nohang_notify_helper' else: notify_helper_path = 'nohang_notify_helper' last_action_dict = dict() last_action_dict['t'] = time() # will store corrective actions stat stat_dict = dict() separate_log = False # will be overwritten after parse config cgroup_v1_index, cgroup_v2_index = find_cgroup_indexes() self_oom_score_adj_min = '-600' self_oom_score_adj_max = '-6' write_self_oom_score_adj(self_oom_score_adj_min) pid_list = get_pid_list() print_proc_table_flag = False check_config_flag = False if os.path.exists('./nohang.conf'): config = os.getcwd() + '/nohang.conf' else: config = '/etc/nohang/nohang.conf' if len(argv) == 1: pass elif len(argv) == 2: if argv[1] == '--help' or argv[1] == '-h': print(help_mess) exit() elif argv[1] == '--check-config' or argv[1] == '-cc': check_config_flag = True elif argv[1] == '--version' or argv[1] == '-v': print_version() elif argv[1] == '--print-proc-table' or argv[1] == '-p': print_proc_table_flag = True if os.path.exists('./nohang.conf'): config = os.getcwd() + '/nohang.conf' else: config = '/etc/nohang/nohang.conf' else: errprint('Unknown option: {}'.format(argv[1])) exit(1) elif len(argv) == 3: if argv[1] == '--config' or argv[1] == '-c': config = argv[2] elif argv[1] == '--check-config' or argv[1] == '-cc': config = argv[2] check_config_flag = True else: errprint('Unknown option: {}'.format(argv[1])) exit(1) else: errprint('Invalid CLI input: too many options') exit(1) # find mem_total # find positions of SwapFree and SwapTotal in /proc/meminfo with open('/proc/meminfo') as f: mem_list = f.readlines() mem_list_names = [] for s in mem_list: mem_list_names.append(s.split(':')[0]) if mem_list_names[2] != 'MemAvailable': errprint('WARNING: Your Linux kernel is too old, Linux 3.14+ requied') exit(1) swap_total_index = mem_list_names.index('SwapTotal') swap_free_index = swap_total_index + 1 mem_total = int(mem_list[0].split(':')[1][:-4]) # Get names from /proc/*/status to be able to get VmRSS and VmSwap values with open('/proc/self/status') as file: status_list = file.readlines() status_names = [] for s in status_list: status_names.append(s.split(':')[0]) ppid_index = status_names.index('PPid') vm_size_index = status_names.index('VmSize') vm_rss_index = status_names.index('VmRSS') vm_swap_index = status_names.index('VmSwap') uid_index = status_names.index('Uid') state_index = status_names.index('State') try: anon_index = status_names.index('RssAnon') file_index = status_names.index('RssFile') shmem_index = status_names.index('RssShmem') detailed_rss = True # print(detailed_rss, 'detailed_rss') except ValueError: detailed_rss = False # print('It is not Linux 4.5+') log('config: ' + config) ########################################################################## # parsing the config with obtaining the parameters dictionary # conf_parameters_dict # conf_restart_dict # dictionary with config options config_dict = dict() badness_adj_re_name_list = [] badness_adj_re_cmdline_list = [] badness_adj_re_environ_list = [] badness_adj_re_uid_list = [] badness_adj_re_cgroup_v1_list = [] badness_adj_re_cgroup_v2_list = [] badness_adj_re_realpath_list = [] soft_actions_list = [] # separator for optional parameters (that starts with @) opt_separator = '///' # stupid conf parsing, need refactoring try: with open(config) as f: for line in f: a = line.startswith('#') b = line.startswith('\n') c = line.startswith('\t') d = line.startswith(' ') etc = line.startswith('@SOFT_ACTION_RE_NAME') etc2 = line.startswith('@SOFT_ACTION_RE_CGROUP_V1') if not a and not b and not c and not d and not etc and not etc2: a = line.partition('=') key = a[0].strip() value = a[2].strip() if key not in config_dict: config_dict[key] = value else: log('ERROR: config key duplication: {}'.format(key)) exit(1) if etc: a = line.partition('@SOFT_ACTION_RE_NAME')[ 2].partition(opt_separator) a1 = 'name' a2 = a[0].strip() valid_re(a2) a3 = a[2].strip() zzz = (a1, a2, a3) soft_actions_list.append(zzz) if etc2: a = line.partition('@SOFT_ACTION_RE_CGROUP_V1')[ 2].partition(opt_separator) a1 = 'cgroup_v1' a2 = a[0].strip() valid_re(a2) a3 = a[2].strip() zzz = (a1, a2, a3) soft_actions_list.append(zzz) if line.startswith('@BADNESS_ADJ_RE_NAME'): a = line.partition('@BADNESS_ADJ_RE_NAME')[2].strip( ' \n').partition(opt_separator) badness_adj = a[0].strip(' ') reg_exp = a[2].strip(' ') valid_re(reg_exp) badness_adj_re_name_list.append((badness_adj, reg_exp)) if line.startswith('@BADNESS_ADJ_RE_CMDLINE'): a = line.partition('@BADNESS_ADJ_RE_CMDLINE')[2].strip( ' \n').partition(opt_separator) badness_adj = a[0].strip(' ') reg_exp = a[2].strip(' ') valid_re(reg_exp) badness_adj_re_cmdline_list.append((badness_adj, reg_exp)) if line.startswith('@BADNESS_ADJ_RE_UID'): a = line.partition('@BADNESS_ADJ_RE_UID')[2].strip( ' \n').partition(opt_separator) badness_adj = a[0].strip(' ') reg_exp = a[2].strip(' ') valid_re(reg_exp) badness_adj_re_uid_list.append((badness_adj, reg_exp)) if line.startswith('@BADNESS_ADJ_RE_CGROUP_V1'): a = line.partition('@BADNESS_ADJ_RE_CGROUP_V1')[2].strip( ' \n').partition(opt_separator) badness_adj = a[0].strip(' ') reg_exp = a[2].strip(' ') valid_re(reg_exp) badness_adj_re_cgroup_v1_list.append((badness_adj, reg_exp)) if line.startswith('@BADNESS_ADJ_RE_CGROUP_V2'): a = line.partition('@BADNESS_ADJ_RE_CGROUP_V2')[2].strip( ' \n').partition(opt_separator) badness_adj = a[0].strip(' ') reg_exp = a[2].strip(' ') valid_re(reg_exp) badness_adj_re_cgroup_v2_list.append((badness_adj, reg_exp)) if line.startswith('@BADNESS_ADJ_RE_REALPATH'): a = line.partition('@BADNESS_ADJ_RE_REALPATH')[2].strip( ' \n').partition(opt_separator) badness_adj = a[0].strip(' ') reg_exp = a[2].strip(' ') valid_re(reg_exp) badness_adj_re_realpath_list.append((badness_adj, reg_exp)) if line.startswith('@BADNESS_ADJ_RE_ENVIRON'): a = line.partition('@BADNESS_ADJ_RE_ENVIRON')[2].strip( ' \n').partition(opt_separator) badness_adj = a[0].strip(' ') reg_exp = a[2].strip(' ') valid_re(reg_exp) badness_adj_re_environ_list.append((badness_adj, reg_exp)) except PermissionError: errprint('PermissionError', conf_err_mess) exit(1) except UnicodeDecodeError: errprint('UnicodeDecodeError', conf_err_mess) exit(1) except IsADirectoryError: errprint('IsADirectoryError', conf_err_mess) exit(1) except IndexError: errprint('IndexError', conf_err_mess) exit(1) except FileNotFoundError: errprint('FileNotFoundError', conf_err_mess) exit(1) if badness_adj_re_name_list == []: regex_matching = False else: regex_matching = True if badness_adj_re_cmdline_list == []: re_match_cmdline = False else: re_match_cmdline = True if badness_adj_re_uid_list == []: re_match_uid = False else: re_match_uid = True if badness_adj_re_environ_list == []: re_match_environ = False else: re_match_environ = True if badness_adj_re_realpath_list == []: re_match_realpath = False else: re_match_realpath = True if badness_adj_re_cgroup_v1_list == []: re_match_cgroup_v1 = False else: re_match_cgroup_v1 = True if badness_adj_re_cgroup_v2_list == []: re_match_cgroup_v2 = False else: re_match_cgroup_v2 = True if soft_actions_list == []: soft_actions = False else: soft_actions = True ########################################################################## # post_zombie_delay = 0.1 # victim_cache_time = 50 # extracting parameters from the dictionary # check for all necessary parameters # validation of all parameters debug_psi = conf_parse_bool('debug_psi') print_statistics = conf_parse_bool('print_statistics') print_proc_table = conf_parse_bool('print_proc_table') forbid_negative_badness = conf_parse_bool('forbid_negative_badness') print_victim_status = conf_parse_bool('print_victim_status') print_victim_cmdline = conf_parse_bool('print_victim_cmdline') print_config_at_startup = conf_parse_bool('print_config_at_startup') print_mem_check_results = conf_parse_bool('print_mem_check_results') debug_sleep = conf_parse_bool('debug_sleep') low_memory_warnings_enabled = conf_parse_bool('low_memory_warnings_enabled') post_action_gui_notifications = conf_parse_bool('post_action_gui_notifications') psi_checking_enabled = conf_parse_bool('psi_checking_enabled') ignore_psi = not psi_checking_enabled zram_checking_enabled = conf_parse_bool('zram_checking_enabled') ignore_zram = not zram_checking_enabled debug_gui_notifications = conf_parse_bool('debug_gui_notifications') ignore_positive_oom_score_adj = conf_parse_bool('ignore_positive_oom_score_adj') (soft_threshold_min_mem_kb, soft_threshold_min_mem_mb, soft_threshold_min_mem_percent ) = calculate_percent('soft_threshold_min_mem') (hard_threshold_min_mem_kb, hard_threshold_min_mem_mb, hard_threshold_min_mem_percent ) = calculate_percent('hard_threshold_min_mem') (soft_threshold_max_zram_kb, soft_threshold_max_zram_mb, soft_threshold_max_zram_percent ) = calculate_percent('soft_threshold_max_zram') (hard_threshold_max_zram_kb, hard_threshold_max_zram_mb, hard_threshold_max_zram_percent ) = calculate_percent('hard_threshold_max_zram') (warning_threshold_min_mem_kb, warning_threshold_min_mem_mb, warning_threshold_min_mem_percent ) = calculate_percent('warning_threshold_min_mem') (warning_threshold_max_zram_kb, warning_threshold_max_zram_mb, warning_threshold_max_zram_percent ) = calculate_percent('warning_threshold_max_zram') if 'post_zombie_delay' in config_dict: post_zombie_delay = string_to_float_convert_test( config_dict['post_zombie_delay']) if post_zombie_delay is None: errprint('Invalid post_zombie_delay, not float\nExit') exit(1) if post_zombie_delay < 0: errprint('post_zombie_delay MUST be >= 0\nExit') exit(1) else: errprint('post_zombie_delay not in config\nExit') exit(1) if 'victim_cache_time' in config_dict: victim_cache_time = string_to_float_convert_test( config_dict['victim_cache_time']) if victim_cache_time is None: errprint('Invalid victim_cache_time, not float\nExit') exit(1) if victim_cache_time < 0: errprint('victim_cache_time MUST be >= 0\nExit') exit(1) else: errprint('victim_cache_time not in config\nExit') exit(1) if 'fill_rate_mem' in config_dict: fill_rate_mem = string_to_float_convert_test(config_dict['fill_rate_mem']) if fill_rate_mem is None: errprint('Invalid fill_rate_mem value, not float\nExit') exit(1) if fill_rate_mem <= 0: errprint('fill_rate_mem MUST be > 0\nExit') exit(1) else: errprint('fill_rate_mem not in config\nExit') exit(1) if 'fill_rate_swap' in config_dict: fill_rate_swap = string_to_float_convert_test(config_dict['fill_rate_swap']) if fill_rate_swap is None: errprint('Invalid fill_rate_swap value, not float\nExit') exit(1) if fill_rate_swap <= 0: errprint('fill_rate_swap MUST be > 0\nExit') exit(1) else: errprint('fill_rate_swap not in config\nExit') exit(1) if 'fill_rate_zram' in config_dict: fill_rate_zram = string_to_float_convert_test(config_dict['fill_rate_zram']) if fill_rate_zram is None: errprint('Invalid fill_rate_zram value, not float\nExit') exit(1) if fill_rate_zram <= 0: errprint('fill_rate_zram MUST be > 0\nExit') exit(1) else: errprint('fill_rate_zram not in config\nExit') exit(1) if 'soft_threshold_min_swap' in config_dict: soft_threshold_min_swap = config_dict['soft_threshold_min_swap'] else: errprint('soft_threshold_min_swap not in config\nExit') exit(1) if 'hard_threshold_min_swap' in config_dict: hard_threshold_min_swap = config_dict['hard_threshold_min_swap'] else: errprint('hard_threshold_min_swap not in config\nExit') exit(1) if 'post_soft_action_delay' in config_dict: post_soft_action_delay = string_to_float_convert_test( config_dict['post_soft_action_delay']) if post_soft_action_delay is None: errprint('Invalid post_soft_action_delay value, not float\nExit') exit(1) if post_soft_action_delay < 0: errprint('post_soft_action_delay must be positiv\nExit') exit(1) else: errprint('post_soft_action_delay not in config\nExit') exit(1) if 'psi_post_action_delay' in config_dict: psi_post_action_delay = string_to_float_convert_test( config_dict['psi_post_action_delay']) if psi_post_action_delay is None: errprint('Invalid psi_post_action_delay value, not float\nExit') exit(1) if psi_post_action_delay < 0: errprint('psi_post_action_delay must be positive\nExit') exit(1) else: errprint('psi_post_action_delay not in config\nExit') exit(1) if 'hard_threshold_max_psi' in config_dict: hard_threshold_max_psi = string_to_float_convert_test( config_dict['hard_threshold_max_psi']) if hard_threshold_max_psi is None: errprint('Invalid hard_threshold_max_psi value, not float\nExit') exit(1) if hard_threshold_max_psi < 0 or hard_threshold_max_psi > 100: errprint('hard_threshold_max_psi must be in the range [0; 100]\nExit') exit(1) else: errprint('hard_threshold_max_psi not in config\nExit') exit(1) if 'soft_threshold_max_psi' in config_dict: soft_threshold_max_psi = string_to_float_convert_test( config_dict['soft_threshold_max_psi']) if soft_threshold_max_psi is None: errprint('Invalid soft_threshold_max_psi value, not float\nExit') exit(1) if soft_threshold_max_psi < 0 or soft_threshold_max_psi > 100: errprint('soft_threshold_max_psi must be in the range [0; 100]\nExit') exit(1) else: errprint('soft_threshold_max_psi not in config\nExit') exit(1) if 'warning_threshold_max_psi' in config_dict: warning_threshold_max_psi = string_to_float_convert_test( config_dict['warning_threshold_max_psi']) if warning_threshold_max_psi is None: errprint('Invalid warning_threshold_max_psi value, not float\nExit') exit(1) if warning_threshold_max_psi < 0 or warning_threshold_max_psi > 100: errprint('warning_threshold_max_psi must be in the range [0; 100]\nExit') exit(1) else: errprint('warning_threshold_max_psi not in config\nExit') exit(1) if 'min_badness' in config_dict: min_badness = string_to_int_convert_test( config_dict['min_badness']) if min_badness is None: errprint('Invalid min_badness value, not integer\nExit') exit(1) if min_badness < 0 or min_badness > 1000: errprint('Invalud min_badness value\nExit') exit(1) else: errprint('min_badness not in config\nExit') exit(1) if 'min_post_warning_delay' in config_dict: min_post_warning_delay = string_to_float_convert_test( config_dict['min_post_warning_delay']) if min_post_warning_delay is None: errprint('Invalid min_post_warning_delay value, not float\nExit') exit(1) if min_post_warning_delay < 1 or min_post_warning_delay > 300: errprint('min_post_warning_delay value out of range [1; 300]\nExit') exit(1) else: errprint('min_post_warning_delay not in config\nExit') exit(1) if 'warning_threshold_min_swap' in config_dict: warning_threshold_min_swap = config_dict['warning_threshold_min_swap'] else: errprint('warning_threshold_min_swap not in config\nExit') exit(1) if 'max_victim_ancestry_depth' in config_dict: max_victim_ancestry_depth = string_to_int_convert_test( config_dict['max_victim_ancestry_depth']) if min_badness is None: errprint('Invalid max_victim_ancestry_depth value, not integer\nExit') exit(1) if max_victim_ancestry_depth < 1: errprint('Invalud max_victim_ancestry_depth value\nExit') exit(1) else: errprint('max_victim_ancestry_depth is not in config\nExit') exit(1) if 'max_soft_exit_time' in config_dict: max_soft_exit_time = string_to_float_convert_test( config_dict['max_soft_exit_time']) if max_soft_exit_time is None: errprint('Invalid max_soft_exit_time val' 'ue, not float\nExit') exit(1) if max_soft_exit_time < 0: errprint('max_soft_exit_time must be non-n' 'egative number\nExit') exit(1) else: errprint('max_soft_exit_time is not in config\nExit') exit(1) if 'post_kill_exe' in config_dict: post_kill_exe = config_dict['post_kill_exe'] else: errprint('post_kill_exe is not in config\nExit') exit(1) if 'psi_path' in config_dict: psi_path = config_dict['psi_path'] else: errprint('psi_path is not in config\nExit') exit(1) if 'psi_metrics' in config_dict: psi_metrics = config_dict['psi_metrics'] else: errprint('psi_metrics is not in config\nExit') exit(1) if 'warning_exe' in config_dict: warning_exe = config_dict['warning_exe'] if warning_exe != '': check_warning_exe = True else: check_warning_exe = False else: errprint('warning_exe is not in config\nExit') exit(1) if 'extra_table_info' in config_dict: extra_table_info = config_dict['extra_table_info'] if (extra_table_info != 'None' and extra_table_info != 'cgroup_v1' and extra_table_info != 'cgroup_v2' and extra_table_info != 'cmdline' and extra_table_info != 'environ' and extra_table_info != 'realpath'): errprint('Invalid config: invalid extra_table_info value\nExit') exit(1) else: errprint('Invalid config: extra_table_info is not in config\nExit') exit(1) separate_log = conf_parse_bool('separate_log') if separate_log: import logging log_dir = '/var/log/nohang' try: os.mkdir(log_dir) except PermissionError: print('ERROR: can not create log dir') except FileExistsError: pass logfile = log_dir + '/nohang.log' try: with open(logfile, 'a') as f: pass except FileNotFoundError: print('ERROR: log FileNotFoundError') except PermissionError: print('ERROR: log PermissionError') try: logging.basicConfig( filename=logfile, level=logging.INFO, format="%(asctime)s: %(message)s") except PermissionError: errprint('ERROR: Permission denied: {}'.format(logfile)) except FileNotFoundError: errprint('ERROR: FileNotFoundError: {}'.format(logfile)) if 'min_mem_report_interval' in config_dict: min_mem_report_interval = string_to_float_convert_test( config_dict['min_mem_report_interval']) if min_mem_report_interval is None: errprint('Invalid min_mem_report_interval value, not float\nExit') exit(1) if min_mem_report_interval < 0: errprint('min_mem_report_interval must be non-negative number\nExit') exit(1) else: errprint('min_mem_report_interval is not in config\nExit') exit(1) if 'psi_excess_duration' in config_dict: psi_excess_duration = string_to_float_convert_test( config_dict['psi_excess_duration']) if psi_excess_duration is None: errprint('Invalid psi_excess_duration value, not float\nExit') exit(1) if psi_excess_duration < 0: errprint('psi_excess_duration must be non-negative number\nExit') exit(1) else: errprint('psi_excess_duration is not in config\nExit') exit(1) if 'max_sleep' in config_dict: max_sleep = string_to_float_convert_test( config_dict['max_sleep']) if max_sleep is None: errprint('Invalid max_sleep value, not float\nExit') exit(1) if max_sleep <= 0: errprint('max_sleep must be positive number\nExit') exit(1) else: errprint('max_sleep is not in config\nExit') exit(1) if 'min_sleep' in config_dict: min_sleep = string_to_float_convert_test( config_dict['min_sleep']) if min_sleep is None: errprint('Invalid min_sleep value, not float\nExit') exit(1) if min_sleep <= 0: errprint('min_sleep must be positive number\nExit') exit(1) else: errprint('min_sleep is not in config\nExit') exit(1) if 'over_sleep' in config_dict: over_sleep = string_to_float_convert_test( config_dict['over_sleep']) if over_sleep is None: errprint('Invalid over_sleep value, not float\nExit') exit(1) if over_sleep <= 0: errprint('over_sleep must be positive number\nExit') exit(1) else: errprint('over_sleep is not in config\nExit') exit(1) sensitivity_test_time = over_sleep / 2 if max_sleep < min_sleep: errprint('min_sleep value must not exceed max_sleep value.\nExit') exit(1) if min_sleep < over_sleep: errprint('over_sleep value must not exceed min_sleep value.\nExit') exit(1) if max_sleep == min_sleep: stable_sleep = True else: stable_sleep = False if print_proc_table_flag: if not root: log('WARNING: effective UID != 0; euid={}; processes with other e' 'uids will be invisible for nohang'.format(self_uid)) func_print_proc_table() ########################################################################## psi_support = os.path.exists(psi_path) ########################################################################## # Get KiB levels if it's possible. soft_threshold_min_swap_tuple = get_swap_threshold_tuple(soft_threshold_min_swap) hard_threshold_min_swap_tuple = get_swap_threshold_tuple(hard_threshold_min_swap) warning_threshold_min_swap_tuple = get_swap_threshold_tuple(warning_threshold_min_swap) swap_kb_dict = dict() swap_term_is_percent = soft_threshold_min_swap_tuple[1] if swap_term_is_percent: soft_threshold_min_swap_percent = soft_threshold_min_swap_tuple[0] else: soft_threshold_min_swap_kb = soft_threshold_min_swap_tuple[0] swap_kb_dict['soft_threshold_min_swap_kb'] = soft_threshold_min_swap_kb swap_kill_is_percent = hard_threshold_min_swap_tuple[1] if swap_kill_is_percent: hard_threshold_min_swap_percent = hard_threshold_min_swap_tuple[0] else: hard_threshold_min_swap_kb = hard_threshold_min_swap_tuple[0] swap_kb_dict['hard_threshold_min_swap_kb'] = hard_threshold_min_swap_kb swap_warn_is_percent = warning_threshold_min_swap_tuple[1] if swap_warn_is_percent: warning_threshold_min_swap_percent = warning_threshold_min_swap_tuple[0] else: warning_threshold_min_swap_kb = warning_threshold_min_swap_tuple[0] swap_kb_dict['warning_threshold_min_swap_kb'] = warning_threshold_min_swap_kb ########################################################################## if print_config_at_startup or check_config_flag: check_config() ########################################################################## # for calculating the column width when printing mem and zram mem_len = len(str(round(mem_total / 1024.0))) if post_action_gui_notifications: notify_sig_dict = {SIGKILL: 'Killing', SIGTERM: 'Terminating'} # convert rates from MiB/s to KiB/s fill_rate_mem = fill_rate_mem * 1024 fill_rate_swap = fill_rate_swap * 1024 fill_rate_zram = fill_rate_zram * 1024 warn_time_now = 0 warn_time_delta = 1000 warn_timer = 0 ########################################################################## if not root: log('WARNING: effective UID != 0; euid={}; processes with other e' 'uids will be invisible for nohang'.format(self_uid)) # Try to lock all memory mlockall() ########################################################################## # print_self_rss() psi_avg_string = '' # will be overwritten if PSI monitoring enabled mem_used_zram = 0 if print_mem_check_results: # to find delta mem wt2 = 0 new_mem = 0 # init mem report interval report0 = 0 # handle signals for i in sig_list: signal(i, signal_handler) x0 = time() delta0 = 0 threshold = None mem_info = None CHECK_PSI = False if psi_support and not ignore_psi: CHECK_PSI = True psi_kill_exceeded_timer = 0 psi_term_exceeded_timer = 0 psi_t0 = time() psi_threshold = zram_threshold = zram_info = psi_info = None CHECK_ZRAM = not ignore_zram log('Monitoring has started!') stdout.flush() ########################################################################## while True: (masf_threshold, masf_info, mem_available, hard_threshold_min_swap_kb, soft_threshold_min_swap_kb, swap_free, swap_total) = check_mem_swap_ex() if CHECK_ZRAM: zram_threshold, zram_info, mem_used_zram = check_zram_ex() if CHECK_PSI: (psi_threshold, psi_info, psi_t0, psi_kill_exceeded_timer, psi_term_exceeded_timer, x0) = check_psi_ex( psi_t0, psi_kill_exceeded_timer, psi_term_exceeded_timer, x0) if print_mem_check_results: if CHECK_PSI: psi_avg_value = find_psi_metrics_value(psi_path, psi_metrics) if time() - psi_t0 >= psi_post_action_delay: psi_post_action_delay_exceeded = True else: psi_post_action_delay_exceeded = False if print_mem_check_results: psi_avg_string = 'PSI avg: {} | '.format( str(psi_avg_value).rjust(6)) wt1 = time() delta = (mem_available + swap_free) - new_mem t_cycle = wt1 - wt2 report_delta = wt1 - report0 if report_delta >= min_mem_report_interval: mem_report = True new_mem = mem_available + swap_free report0 = wt1 else: mem_report = False wt2 = time() if mem_report: speed = delta / 1024.0 / report_delta speed_info = ' | dMem: {} M/s'.format( str(round(speed)).rjust(5) ) # Calculate 'swap-column' width swap_len = len(str(round(swap_total / 1024.0))) # Output available mem sizes if swap_total == 0 and mem_used_zram == 0: log('{}MemAvail: {} M, {} %{}'.format( psi_avg_string, human(mem_available, mem_len), just_percent_mem(mem_available / mem_total), speed_info ) ) elif swap_total > 0 and mem_used_zram == 0: log('{}MemAvail: {} M, {} % | SwapFree: {} M, {} %{}'.format( psi_avg_string, human(mem_available, mem_len), just_percent_mem(mem_available / mem_total), human(swap_free, swap_len), just_percent_swap(swap_free / (swap_total + 0.1)), speed_info ) ) else: log('{}MemAvail: {} M, {} % | SwapFree: {} M, {} % | Mem' 'UsedZram: {} M, {} %{}'.format( psi_avg_string, human(mem_available, mem_len), just_percent_mem(mem_available / mem_total), human(swap_free, swap_len), just_percent_swap(swap_free / (swap_total + 0.1)), human(mem_used_zram, mem_len), just_percent_mem(mem_used_zram / mem_total), speed_info ) ) if (masf_threshold is SIGKILL or zram_threshold is SIGKILL or psi_threshold is SIGKILL): threshold = SIGKILL mem_info_list = [] if masf_info is not None: mem_info_list.append(masf_info) if zram_info is not None: mem_info_list.append(zram_info) if psi_info is not None: mem_info_list.append(psi_info) psi_t0 = implement_corrective_action( threshold, mem_info_list, psi_t0, psi_kill_exceeded_timer, psi_term_exceeded_timer, x0, psi_threshold, zram_threshold, zram_info, psi_info) continue if (masf_threshold is SIGTERM or zram_threshold is SIGTERM or psi_threshold is SIGTERM): threshold = SIGTERM mem_info_list = [] if masf_info is not None: mem_info_list.append(masf_info) if zram_info is not None: mem_info_list.append(zram_info) if psi_info is not None: mem_info_list.append(psi_info) psi_t0 = implement_corrective_action( threshold, mem_info_list, psi_t0, psi_kill_exceeded_timer, psi_term_exceeded_timer, x0, psi_threshold, zram_threshold, zram_info, psi_info) continue if low_memory_warnings_enabled: if (masf_threshold == 'WARN' or zram_threshold == 'WARN' or psi_threshold == 'WARN'): warn_time_delta = time() - warn_time_now warn_time_now = time() warn_timer += warn_time_delta if warn_timer > min_post_warning_delay: send_notify_warn() warn_timer = 0 sleep_after_check_mem()