From 286ed840e54176602638e836c079572fc9c481ba Mon Sep 17 00:00:00 2001 From: Alexey Avramov Date: Fri, 20 Sep 2019 00:45:40 +0900 Subject: [PATCH] Update readme and remove trash --- README.md | 2 +- old/nohang | 3360 -------------------------------------- old/nohang.conf | 359 ---- old/nohang_notify_helper | 233 --- 4 files changed, 1 insertion(+), 3953 deletions(-) delete mode 100755 old/nohang delete mode 100644 old/nohang.conf delete mode 100755 old/nohang_notify_helper diff --git a/README.md b/README.md index 30415d1..36da9c0 100644 --- a/README.md +++ b/README.md @@ -66,7 +66,7 @@ Of course, you can also [download more RAM](https://downloadmoreram.com/), tune For basic usage: - `Linux` 3.14+ (since `MemAvailable` appeared in `/proc/meminfo`) -- `Python` 3.3+ (not tested with previous) +- `Python` 3.3+ To show GUI notifications: - [notification server](https://wiki.archlinux.org/index.php/Desktop_notifications#Notification_servers) (most of desktop environments use their own implementations) diff --git a/old/nohang b/old/nohang deleted file mode 100755 index da010ea..0000000 --- a/old/nohang +++ /dev/null @@ -1,3360 +0,0 @@ -#!/usr/bin/env python3 -"""A daemon that prevents OOM in Linux systems.""" - -import os -from ctypes import CDLL -from time import sleep, time -from operator import itemgetter -from sys import stdout, stderr, argv, exit -from re import search -from sre_constants import error as invalid_re -from signal import signal, SIGKILL, SIGTERM, SIGINT, SIGQUIT, SIGHUP -from threading import Thread - - -########################################################################## - -# define functions - - -def check_config(): - """ - """ - - log('#' * 79) - - log('0. Common zram settings') - - log(' zram_checking_enabled: {}'.format(zram_checking_enabled)) - - log('1. Thresholds below which a signal should be sent to the victim') - - log(' soft_threshold_min_mem: {} MiB, {} %'.format( - round(soft_threshold_min_mem_mb), round(soft_threshold_min_mem_percent, 1))) - log(' hard_threshold_min_mem: {} MiB, {} %'.format( - round(hard_threshold_min_mem_mb), round(hard_threshold_min_mem_percent, 1))) - log(' soft_threshold_min_swap: {}'.format(soft_threshold_min_swap)) - log(' hard_threshold_min_swap: {}'.format(hard_threshold_min_swap)) - log(' soft_threshold_max_zram: {} MiB, {} %'.format( - round(soft_threshold_max_zram_mb), round(soft_threshold_max_zram_percent, 1))) - log(' hard_threshold_max_zram: {} MiB, {} %'.format( - round(hard_threshold_max_zram_mb), round(hard_threshold_max_zram_percent, 1))) - - log('2. Response on PSI memory metrics') - - log(' psi_checking_enabled: {}'.format(psi_checking_enabled)) - log(' psi_path: {}'.format(psi_path)) - log(' psi_metrics: {}'.format(psi_metrics)) - log(' soft_threshold_max_psi: {}'.format(soft_threshold_max_psi)) - log(' hard_threshold_max_psi: {}'.format(hard_threshold_max_psi)) - log(' psi_excess_duration: {} sec'.format(psi_excess_duration)) - log(' psi_post_action_delay: {} sec'.format(psi_post_action_delay)) - - log('3. The frequency of checking the amount of available memory') - - log(' fill_rate_mem: {}'.format(fill_rate_mem)) - log(' fill_rate_swap: {}'.format(fill_rate_swap)) - log(' fill_rate_zram: {}'.format(fill_rate_zram)) - log(' max_sleep: {} sec'.format(max_sleep)) - log(' min_sleep: {} sec'.format(min_sleep)) - log(' over_sleep: {} sec'.format(over_sleep)) - - log('4. The prevention of killing innocent victims') - - log(' min_badness: {}'.format(min_badness)) - log(' post_soft_action_delay: {} sec'.format(post_soft_action_delay)) - log(' post_zombie_delay: {} sec'.format(post_zombie_delay)) - log(' victim_cache_time: {} sec'.format(victim_cache_time)) - log(' ignore_positive_oom_score_adj: {}'.format( - ignore_positive_oom_score_adj)) - - log('5. Impact on the badness of processes') - - log('5.1. Matching process names with RE patterns') - if len(badness_adj_re_name_list) > 0: - log(' regexp: badness_adj:') - for i in badness_adj_re_name_list: - log(' {} {}'.format(i[1], i[0])) - else: - log(' (not set)') - - log('5.2. Matching CGroup_v1-line with RE patterns') - if len(badness_adj_re_cgroup_v1_list) > 0: - log(' regexp: badness_adj:') - for i in badness_adj_re_cgroup_v1_list: - log(' {} {}'.format(i[1], i[0])) - else: - log(' (not set)') - - log('5.3. Matching CGroup_v2-line with RE patterns') - if len(badness_adj_re_cgroup_v2_list) > 0: - log(' regexp: badness_adj:') - for i in badness_adj_re_cgroup_v1_list: - log(' {} {}'.format(i[1], i[0])) - else: - log(' (not set)') - - log('5.4. Matching eUIDs with RE patterns') - if len(badness_adj_re_cgroup_v2_list) > 0: - log(' regexp: badness_adj:') - for i in badness_adj_re_uid_list: - log(' {} {}'.format(i[1], i[0])) - else: - log(' (not set)') - - log('5.5. Matching realpath with RE patterns') - if len(badness_adj_re_cgroup_v2_list) > 0: - log(' regexp: badness_adj:') - for i in badness_adj_re_realpath_list: - log(' {} {}'.format(i[1], i[0])) - else: - log(' (not set)') - - log('5.6. Matching cmdlines with RE patterns') - if len(badness_adj_re_cgroup_v2_list) > 0: - log(' regexp: badness_adj:') - for i in badness_adj_re_cmdline_list: - log(' {} {}'.format(i[1], i[0])) - else: - log(' (not set)') - - log('5.7. Matching environ with RE patterns') - if len(badness_adj_re_cgroup_v2_list) > 0: - log(' regexp: badness_adj:') - for i in badness_adj_re_environ_list: - log(' {} {}'.format(i[1], i[0])) - else: - log(' (not set)') - - log('6. Customize corrective actions') - - if len(soft_actions_list) > 0: - log(' Match by: regexp: command: ') - for i in soft_actions_list: - log(' {} {} {}'.format(i[0], i[1], i[2])) - else: - log(' (not set)') - - log('7. GUI notifications') - - log(' post_action_gui_notifications: {}'.format( - post_action_gui_notifications)) - log(' low_memory_warnings_enabled: {}'.format( - low_memory_warnings_enabled)) - log(' warning_exe: {}'.format(warning_exe)) - log(' warning_threshold_min_mem: {} MiB, {} %'.format(round( - warning_threshold_min_mem_mb), round(warning_threshold_min_mem_percent, 1))) - log(' warning_threshold_min_swap: {}'.format(warning_threshold_min_swap)) - log(' warning_threshold_max_zram: {} MiB, {} %'.format(round( - warning_threshold_max_zram_mb), round(warning_threshold_max_zram_percent, 1))) - log(' warning_threshold_max_psi: {}'.format(warning_threshold_max_psi)) - log(' min_post_warning_delay: {} sec'.format(min_post_warning_delay)) - - log('8. Verbosity') - - log(' print_config_at_startup: {}'.format(print_config_at_startup)) - log(' print_mem_check_results: {}'.format(print_mem_check_results)) - log(' min_mem_report_interval: {} sec'.format(min_mem_report_interval)) - log(' debug_sleep: {}'.format(debug_sleep)) - log(' print_statistics: {}'.format(print_statistics)) - log(' print_proc_table: {}'.format(print_proc_table)) - log(' extra_table_info: {}'.format(extra_table_info)) - log(' print_victim_status: {}'.format(print_victim_status)) - log(' print_victim_cmdline: {}'.format(print_victim_cmdline)) - log(' max_victim_ancestry_depth: {}'.format(max_victim_ancestry_depth)) - log(' debug_gui_notifications: {}'.format(debug_gui_notifications)) - log(' separate_log: {}'.format(separate_log)) - log(' debug_psi: {}'.format(debug_psi)) - - log('9. Misc') - - log(' max_soft_exit_time: {} sec'.format(max_soft_exit_time)) - log(' post_kill_exe: {}'.format(post_kill_exe)) - log(' forbid_negative_badness: {}'.format( - forbid_negative_badness)) - - # log(': {}'.format()) - log('#' * 79) - - if check_config_flag: - log('config is OK') - exit() - - -def encoder(string): - """ - """ - encoded = '' - for i in string: - encoded += str(ord(i)) + ':' - return encoded[:-1] - - -def get_swap_threshold_tuple(string): - # re (Num %, True) or (Num KiB, False) - """Returns KiB value if abs val was set in config, or tuple with %""" - # return tuple with abs and bool: (abs %, True) or (abs MiB, False) - - if string.endswith('%'): - valid = string_to_float_convert_test(string[:-1]) - if valid is None: - errprint('somewhere swap unit is not float_%') - exit(1) - - value = float(string[:-1].strip()) - if value < 0 or value > 100: - errprint('invalid value, must be from the range[0; 100] %') - exit(1) - - return value, True - - elif string.endswith('M'): - valid = string_to_float_convert_test(string[:-1]) - if valid is None: - errprint('somewhere swap unit is not float_M') - exit(1) - - value = float(string[:-1].strip()) * 1024 - if value < 0: - errprint('invalid unit in config (negative value)') - exit(1) - - return value, False - - else: - errprint( - 'Invalid config file. There are invalid units somewhere\nExit') - exit(1) - - -def find_cgroup_indexes(): - """ Find cgroup-line positions in /proc/*/cgroup file. - """ - - cgroup_v1_index = cgroup_v2_index = None - - with open('/proc/self/cgroup') as f: - for index, line in enumerate(f): - if ':name=' in line: - cgroup_v1_index = index - if line.startswith('0::'): - cgroup_v2_index = index - - return cgroup_v1_index, cgroup_v2_index - - -def pid_to_rss(pid): - """ - """ - try: - rss = int(rline1( - '/proc/{}/statm'.format(pid)).split(' ')[1]) * SC_PAGESIZE - except IndexError: - rss = None - except FileNotFoundError: - rss = None - except ProcessLookupError: - rss = None - return rss - - -def pid_to_vm_size(pid): - """ - """ - try: - vm_size = int(rline1( - '/proc/{}/statm'.format(pid)).partition(' ')[0]) * SC_PAGESIZE - except IndexError: - vm_size = None - except FileNotFoundError: - vm_size = None - except ProcessLookupError: - vm_size = None - return vm_size - - -def signal_handler(signum, frame): - """ - """ - for i in sig_list: - signal(i, signal_handler_inner) - log('Signal handler called with the {} signal '.format( - sig_dict[signum])) - update_stat_dict_and_print(None) - log('Exit') - exit() - - -def signal_handler_inner(signum, frame): - """ - """ - log('Signal handler called with the {} signal (ignored) '.format( - sig_dict[signum])) - - -def exe(cmd): - """ - """ - - log('Execute the command: {}'.format(cmd)) - t0 = time() - write_self_oom_score_adj(self_oom_score_adj_max) - err = os.system(cmd) - write_self_oom_score_adj(self_oom_score_adj_min) - dt = time() - t0 - log('Exit status: {}; exe duration: {} sec'.format(err, round(dt, 3))) - return err - - -def go(func, *a): - """ run func in new thread - """ - t1 = time() - try: - Thread(target=func, args=a).start() - except RuntimeError: - print('RuntimeError: cannot spawn a new thread') - return 1 - t2 = time() - log('New thread spawned in {} ms'.format( - round((t2 - t1) * 1000, 1) - )) - return 0 - - -def write(path, string): - """ - """ - with open(path, 'w') as f: - f.write(string) - - -def write_self_oom_score_adj(new_value): - """ - """ - if root: - write('/proc/self/oom_score_adj', new_value) - - -def valid_re(reg_exp): - """Validate regular expression. - """ - try: - search(reg_exp, '') - except invalid_re: - log('Invalid config: invalid regexp: {}'.format(reg_exp)) - exit(1) - - -def func_print_proc_table(): - """ - """ - print_proc_table = True - find_victim(print_proc_table) - exit() - - -def log(*msg): - """ - """ - try: - print(*msg) - except OSError: - sleep(0.01) - if separate_log: - try: - logging.info(*msg) - except OSError: - sleep(0.01) - - -def print_version(): - """ - """ - try: - v = rline1('/etc/nohang/version') - except FileNotFoundError: - v = None - if v is None: - print('nohang unknown version') - else: - print('nohang ' + v) - exit() - - -def pid_to_cgroup_v1(pid): - """ - """ - cgroup_v1 = '' - try: - with open('/proc/' + pid + '/cgroup') as f: - for index, line in enumerate(f): - if index == cgroup_v1_index: - cgroup_v1 = '/' + line.partition('/')[2][:-1] - return cgroup_v1 - except FileNotFoundError: - return '' - - -def pid_to_cgroup_v2(pid): - """ - """ - cgroup_v2 = '' - try: - with open('/proc/' + pid + '/cgroup') as f: - for index, line in enumerate(f): - if index == cgroup_v2_index: - cgroup_v2 = line[3:-1] - return cgroup_v2 - except FileNotFoundError: - return '' - - -def pid_to_starttime(pid): - """ handle FNF error! - """ - try: - starttime = rline1('/proc/' + pid + '/stat').rpartition(')')[ - 2].split(' ')[20] - - except UnicodeDecodeError: - with open('/proc/' + pid + '/stat', 'rb') as f: - starttime = f.read().decode('utf-8', 'ignore').rpartition( - ')')[2].split(' ')[20] - - return float(starttime) / SC_CLK_TCK - - -def get_victim_id(pid): - """victim_id is starttime + pid""" - try: - return rline1('/proc/' + pid + '/stat').rpartition( - ')')[2].split(' ')[20] + '_pid' + pid - except FileNotFoundError: - return '' - except ProcessLookupError: - return '' - - -def pid_to_state(pid): - """ - """ - try: - with open('/proc/' + pid + '/stat', 'rb') as f: - return f.read(40).decode('utf-8', 'ignore').rpartition(')')[2][1] - except FileNotFoundError: - return '' - except ProcessLookupError: - return '' - except IndexError: - with open('/proc/' + pid + '/stat', 'rb') as f: - return f.read().decode('utf-8', 'ignore').rpartition(')')[2][1] - - -def pid_to_name(pid): - """ - """ - try: - with open('/proc/' + pid + '/comm', 'rb') as f: - return f.read().decode('utf-8', 'ignore')[:-1] - except FileNotFoundError: - return '' - except ProcessLookupError: - return '' - - -def pid_to_ppid(pid): - """ - """ - try: - with open('/proc/' + pid + '/status') as f: - for n, line in enumerate(f): - if n is ppid_index: - return line.split('\t')[1].strip() - except FileNotFoundError: - return '' - except ProcessLookupError: - return '' - except UnicodeDecodeError: - with open('/proc/' + pid + '/status', 'rb') as f: - f_list = f.read().decode('utf-8', 'ignore').split('\n') - for i in range(len(f_list)): - if i is ppid_index: - return f_list[i].split('\t')[1] - - -def pid_to_ancestry(pid, max_victim_ancestry_depth=1): - """ - """ - if max_victim_ancestry_depth == 1: - ppid = pid_to_ppid(pid) - pname = pid_to_name(ppid) - return '\n PPID: {} ({})'.format(ppid, pname) - if max_victim_ancestry_depth == 0: - return '' - anc_list = [] - for i in range(max_victim_ancestry_depth): - ppid = pid_to_ppid(pid) - pname = pid_to_name(ppid) - anc_list.append((ppid, pname)) - if ppid == '1': - break - pid = ppid - a = '' - for i in anc_list: - a = a + ' <= PID {} ({})'.format(i[0], i[1]) - return '\n Ancestry: ' + a[4:] - - -def pid_to_cmdline(pid): - """ - Get process cmdline by pid. - - pid: str pid of required process - returns string cmdline - """ - try: - with open('/proc/' + pid + '/cmdline') as f: - return f.read().replace('\x00', ' ').rstrip() - except FileNotFoundError: - return '' - - -def pid_to_environ(pid): - """ - Get process environ by pid. - - pid: str pid of required process - returns string environ - """ - try: - with open('/proc/' + pid + '/environ') as f: - return f.read().replace('\x00', ' ').rstrip() - except FileNotFoundError: - return '' - - -def pid_to_realpath(pid): - """ - """ - try: - return os.path.realpath('/proc/' + pid + '/exe') - except FileNotFoundError: - return '' - - -def pid_to_uid(pid): - """return euid""" - try: - with open('/proc/' + pid + '/status') as f: - for n, line in enumerate(f): - if n is uid_index: - return line.split('\t')[2] - except UnicodeDecodeError: - with open('/proc/' + pid + '/status', 'rb') as f: - f_list = f.read().decode('utf-8', 'ignore').split('\n') - return f_list[uid_index].split('\t')[2] - except FileNotFoundError: - return '' - - -def pid_to_badness(pid): - """Find and modify badness (if it needs).""" - - try: - - oom_score = int(rline1('/proc/' + pid + '/oom_score')) - badness = oom_score - - if ignore_positive_oom_score_adj: - oom_score_adj = int(rline1('/proc/' + pid + '/oom_score_adj')) - if oom_score_adj > 0: - badness = badness - oom_score_adj - - if regex_matching: - name = pid_to_name(pid) - for re_tup in badness_adj_re_name_list: - if search(re_tup[1], name) is not None: - badness += int(re_tup[0]) - - if re_match_cgroup_v1: - cgroup_v1 = pid_to_cgroup_v1(pid) - for re_tup in badness_adj_re_cgroup_v1_list: - if search(re_tup[1], cgroup_v1) is not None: - badness += int(re_tup[0]) - - if re_match_cgroup_v2: - cgroup_v2 = pid_to_cgroup_v2(pid) - for re_tup in badness_adj_re_cgroup_v2_list: - if search(re_tup[1], cgroup_v2) is not None: - badness += int(re_tup[0]) - - if re_match_realpath: - realpath = pid_to_realpath(pid) - for re_tup in badness_adj_re_realpath_list: - if search(re_tup[1], realpath) is not None: - badness += int(re_tup[0]) - - if re_match_cmdline: - cmdline = pid_to_cmdline(pid) - for re_tup in badness_adj_re_cmdline_list: - if search(re_tup[1], cmdline) is not None: - badness += int(re_tup[0]) - - if re_match_environ: - environ = pid_to_environ(pid) - for re_tup in badness_adj_re_environ_list: - if search(re_tup[1], environ) is not None: - badness += int(re_tup[0]) - - if re_match_uid: - uid = pid_to_uid(pid) - for re_tup in badness_adj_re_uid_list: - if search(re_tup[1], uid) is not None: - badness += int(re_tup[0]) - - if forbid_negative_badness: - if badness < 0: - badness = 0 - - return badness, oom_score - - except FileNotFoundError: - return None, None - except ProcessLookupError: - return None, None - - -def pid_to_status(pid): - """ - """ - - try: - - with open('/proc/' + pid + '/status') as f: - - for n, line in enumerate(f): - - if n == 0: - name = line.split('\t')[1][:-1] - - if n is state_index: - state = line.split('\t')[1][0] - continue - - if n is ppid_index: - ppid = line.split('\t')[1][:-1] - continue - - if n is uid_index: - uid = line.split('\t')[2] - continue - - if n is vm_size_index: - vm_size = kib_to_mib(int(line.split('\t')[1][:-4])) - continue - - if n is vm_rss_index: - vm_rss = kib_to_mib(int(line.split('\t')[1][:-4])) - continue - - if n is vm_swap_index: - vm_swap = kib_to_mib(int(line.split('\t')[1][:-4])) - break - - return name, state, ppid, uid, vm_size, vm_rss, vm_swap - - except UnicodeDecodeError: - return pid_to_status_unicode(pid) - - except FileNotFoundError: - return None - - except ProcessLookupError: - return None - - except ValueError: - return None - - -def pid_to_status_unicode(pid): - """ - """ - try: - - with open('/proc/' + pid + '/status', 'rb') as f: - f_list = f.read().decode('utf-8', 'ignore').split('\n') - - for i in range(len(f_list)): - - if i == 0: - name = f_list[i].split('\t')[1] - - if i is state_index: - state = f_list[i].split('\t')[1][0] - - if i is ppid_index: - ppid = f_list[i].split('\t')[1] - - if i is uid_index: - uid = f_list[i].split('\t')[2] - - if i is vm_size_index: - vm_size = kib_to_mib( - int(f_list[i].split('\t')[1][:-3])) - - if i is vm_rss_index: - vm_rss = kib_to_mib(int(f_list[i].split('\t')[1][:-3])) - - if i is vm_swap_index: - vm_swap = kib_to_mib(int(f_list[i].split('\t')[1][:-3])) - - return name, state, ppid, uid, vm_size, vm_rss, vm_swap - - except FileNotFoundError: - return None - - except ProcessLookupError: - return None - - except ValueError: - return None - - -def uptime(): - """ - """ - return float(rline1('/proc/uptime').split(' ')[0]) - - -def errprint(*text): - """ - """ - print(*text, file=stderr, flush=True) - - -def mlockall(): - """Lock all memory to prevent swapping nohang process.""" - - MCL_CURRENT = 1 - MCL_FUTURE = 2 - MCL_ONFAULT = 4 - - libc = CDLL('libc.so.6', use_errno=True) - - result = libc.mlockall( - MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT - ) - if result != 0: - result = libc.mlockall( - MCL_CURRENT | MCL_FUTURE - ) - if result != 0: - log('WARNING: cannot lock all memory') - else: - pass - # log('All memory locked with MCL_CURRENT | MCL_FUTURE') - else: - pass - # log('All memory locked with MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT') - - -def update_stat_dict_and_print(key): - """ - """ - - if key is not None: - - if key not in stat_dict: - - stat_dict.update({key: 1}) - - else: - - new_value = stat_dict[key] + 1 - stat_dict.update({key: new_value}) - - if print_statistics: - - stats_msg = 'Total stat (what happened in the last {}):'.format( - format_time(time() - start_time)) - - for i in stat_dict: - stats_msg += '\n {}: {}'.format(i, stat_dict[i]) - - log(stats_msg) - - -def find_psi_metrics_value(psi_path, psi_metrics): - """ - """ - - if psi_support: - - if psi_metrics == 'some_avg10': - return float(rline1(psi_path).split(' ')[1].split('=')[1]) - if psi_metrics == 'some_avg60': - return float(rline1(psi_path).split(' ')[2].split('=')[1]) - if psi_metrics == 'some_avg300': - return float(rline1(psi_path).split(' ')[3].split('=')[1]) - - if psi_metrics == 'full_avg10': - with open(psi_path) as f: - psi_list = f.readlines() - return float(psi_list[1].split(' ')[1].split('=')[1]) - if psi_metrics == 'full_avg60': - with open(psi_path) as f: - psi_list = f.readlines() - return float(psi_list[1].split(' ')[2].split('=')[1]) - if psi_metrics == 'full_avg300': - with open(psi_path) as f: - psi_list = f.readlines() - return float(psi_list[1].split(' ')[3].split('=')[1]) - - -def check_mem_and_swap(): - """find mem_available, swap_total, swap_free""" - with open('/proc/meminfo') as f: - for n, line in enumerate(f): - if n == 2: - mem_available = int(line.split(':')[1][:-4]) - continue - if n is swap_total_index: - swap_total = int(line.split(':')[1][:-4]) - continue - if n is swap_free_index: - swap_free = int(line.split(':')[1][:-4]) - break - return mem_available, swap_total, swap_free - - -def check_zram(): - """find MemUsedZram""" - disksize_sum = 0 - mem_used_total_sum = 0 - - for dev in os.listdir('/sys/block'): - if dev.startswith('zram'): - stat = zram_stat(dev) - disksize_sum += int(stat[0]) - mem_used_total_sum += int(stat[1]) - - # Means that when setting zram disksize = 1 GiB available memory - # decrease by 0.0042 GiB. - # Found experimentally, requires clarification with different kernaels and - # architectures. - # On small disk drives (up to gigabyte) it can be more, up to 0.0045. - # The creator of the zram module claims that ZRAM_DISKSIZE_FACTOR should - # be 0.001: - # ("zram uses about 0.1% of the size of the disk" - # - https://www.kernel.org/doc/Documentation/blockdev/zram.txt), - # but this statement contradicts the experimental data. - # ZRAM_DISKSIZE_FACTOR = deltaMemAvailavle / disksize - # Found experimentally. - ZRAM_DISKSIZE_FACTOR = 0.0042 - - return (mem_used_total_sum + disksize_sum * ZRAM_DISKSIZE_FACTOR) / 1024.0 - - -''' -def format_time(t): - t = int(t) - if t < 60: - return '{} sec'.format(t) - if t >= 60 and t < 3600: - m = t // 60 - s = t % 60 - return '{} min {} sec'.format(m, s) - h = t // 3600 - s0 = t - h * 3600 - m = s0 // 60 - s = s0 % 60 - return '{} h {} min {} sec'.format(h, m, s) -''' - - -def format_time(t): - t = int(t) - - if t < 60: - return '{} sec'.format(t) - - if t > 3600: - h = t // 3600 - s0 = t - h * 3600 - m = s0 // 60 - s = s0 % 60 - return '{} h {} min {} sec'.format(h, m, s) - - m = t // 60 - s = t % 60 - return '{} min {} sec'.format(m, s) - - -def string_to_float_convert_test(string): - """Try to interprete string values as floats.""" - try: - return float(string) - except ValueError: - return None - - -def string_to_int_convert_test(string): - """Try to interpret string values as integers.""" - try: - return int(string) - except ValueError: - return None - - -def conf_parse_string(param): - """ - Get string parameters from the config dict. - - param: config_dict key - returns config_dict[param].strip() - """ - if param in config_dict: - return config_dict[param].strip() - else: - errprint('All the necessary parameters must be in the config') - errprint('There is no "{}" parameter in the config'.format(param)) - exit(1) - - -def conf_parse_bool(param): - """ - Get bool parameters from the config_dict. - - param: config_dicst key - returns bool - """ - if param in config_dict: - param_str = config_dict[param] - if param_str == 'True': - return True - elif param_str == 'False': - return False - else: - errprint('Invalid value of the "{}" parameter.'.format(param)) - errprint('Valid values are True and False.') - errprint('Exit') - exit(1) - else: - errprint('All the necessary parameters must be in the config') - errprint('There is no "{}" parameter in the config'.format(param)) - exit(1) - - -def rline1(path): - """read 1st line from path.""" - try: - with open(path) as f: - for line in f: - return line[:-1] - except UnicodeDecodeError: - with open(path, 'rb') as f: - return f.read(999).decode( - 'utf-8', 'ignore').split('\n')[0] # use partition()! - - -def kib_to_mib(num): - """Convert KiB values to MiB values.""" - return round(num / 1024.0) - - -def percent(num): - """Interprete num as percentage.""" - return round(num * 100, 1) - - -def just_percent_mem(num): - """convert num to percent and justify""" - return str(round(num * 100, 1)).rjust(4, ' ') - - -def just_percent_swap(num): - """ - """ - return str(round(num * 100, 1)).rjust(5, ' ') - - -def human(num, lenth): - """Convert KiB values to MiB values with right alignment""" - return str(round(num / 1024)).rjust(lenth, ' ') - - -def zram_stat(zram_id): - """ - Get zram state. - - zram_id: str zram block-device id - returns bytes disksize, str mem_used_total - """ - try: - disksize = rline1('/sys/block/' + zram_id + '/disksize') - except FileNotFoundError: - return '0', '0' - if disksize == ['0\n']: - return '0', '0' - try: - mm_stat = rline1('/sys/block/' + zram_id + '/mm_stat').split(' ') - mm_stat_list = [] - for i in mm_stat: - if i != '': - mm_stat_list.append(i) - mem_used_total = mm_stat_list[2] - except FileNotFoundError: - mem_used_total = rline1('/sys/block/' + zram_id + '/mem_used_total') - return disksize, mem_used_total # BYTES, str - - -def send_notify_warn(): - """ - Look for process with maximum 'badness' and warn user with notification. - (implement Low memory warnings) - """ - log('Warning threshold exceeded') - - if check_warning_exe: - exe(warning_exe) - - else: - - title = 'Low memory' - - body = 'MemAvail: {}%\nSwapFree: {}%'.format( - round(mem_available / mem_total * 100), - round(swap_free / (swap_total + 0.1) * 100) - ) - - send_notification(title, body) - - -def send_notify(threshold, name, pid): - """ - Notificate about OOM Preventing. - - threshold: key for notify_sig_dict - name: str process name - pid: str process pid - """ - - title = 'Freeze prevention' - body = '{} [{}] {}'.format( - notify_sig_dict[threshold], - pid, - name.replace( - # symbol '&' can break notifications in some themes, - # therefore it is replaced by '*' - '&', '*' - ) - ) - - send_notification(title, body) - - -def send_notify_etc(pid, name, command): - """ - Notificate about OOM Preventing. - - command: str command that will be executed - name: str process name - pid: str process pid - """ - title = 'Freeze prevention' - body = 'Victim is [{}] {}\nExecute the co' \ - 'mmand:\n{}'.format( - pid, name.replace('&', '*'), command.replace('&', '*')) - - send_notification(title, body) - - -def send_notification(title, body): - """ - """ - cmd = '{} "--euid={}" "--debug={}" "--title={}" "--body={}" &'.format( - notify_helper_path, - self_uid, - debug_gui_notifications, - title, - encoder(body)) - - go(exe, cmd) - - -def get_pid_list(): - """ - Find pid list expect kthreads and zombies - """ - pid_list = [] - for pid in os.listdir('/proc'): - if os.path.exists('/proc/' + pid + '/exe'): - pid_list.append(pid) - return pid_list - - -def get_non_decimal_pids(): - """ - """ - non_decimal_list = [] - for pid in pid_list: - if pid[0].isdecimal() is False: - non_decimal_list.append(pid) - return non_decimal_list - - -def find_victim(_print_proc_table): - """ - Find the process with highest badness and its badness adjustment - Return pid and badness - """ - - ft1 = time() - - pid_list = get_pid_list() - - pid_list.remove(self_pid) - - if '1' in pid_list: - pid_list.remove('1') - - non_decimal_list = get_non_decimal_pids() - - for i in non_decimal_list: - if i in pid_list: - pid_list.remove(i) - - pid_badness_list = [] - - if _print_proc_table: - - if extra_table_info == 'None': - extra_table_title = '' - - elif extra_table_info == 'cgroup_v1': - extra_table_title = 'CGroup_v1' - - elif extra_table_info == 'cgroup_v2': - extra_table_title = 'CGroup_v2' - - elif extra_table_info == 'cmdline': - extra_table_title = 'cmdline' - - elif extra_table_info == 'environ': - extra_table_title = 'environ' - - elif extra_table_info == 'realpath': - extra_table_title = 'realpath' - - else: - extra_table_title = '' - - hr = '#' * 107 - - log(hr) - log('# PID PPID badness oom_score oom_score_adj e' - 'UID S VmSize VmRSS VmSwap Name {}'.format( - extra_table_title)) - log('#------- ------- ------- --------- ------------- -------' - '--- - ------ ----- ------ ---------------') - - for pid in pid_list: - - badness = pid_to_badness(pid)[0] - - if badness is None: - continue - - if _print_proc_table: - - try: - oom_score = rline1('/proc/' + pid + '/oom_score') - oom_score_adj = rline1('/proc/' + pid + '/oom_score_adj') - except FileNotFoundError: - continue - - if pid_to_status(pid) is None: - continue - else: - (name, state, ppid, uid, vm_size, vm_rss, - vm_swap) = pid_to_status(pid) - - if extra_table_info == 'None': - extra_table_line = '' - - elif extra_table_info == 'cgroup_v1': - extra_table_line = pid_to_cgroup_v1(pid) - - elif extra_table_info == 'cgroup_v2': - extra_table_line = pid_to_cgroup_v2(pid) - - elif extra_table_info == 'cmdline': - extra_table_line = pid_to_cmdline(pid) - - elif extra_table_info == 'environ': - extra_table_line = pid_to_environ(pid) - - elif extra_table_info == 'realpath': - extra_table_line = pid_to_realpath(pid) - - else: - extra_table_line = '' - - log('#{} {} {} {} {} {} {} {} {} {} {} {}'.format( - pid.rjust(7), - ppid.rjust(7), - str(badness).rjust(7), - oom_score.rjust(9), - oom_score_adj.rjust(13), - uid.rjust(10), - state, - str(vm_size).rjust(6), - str(vm_rss).rjust(5), - str(vm_swap).rjust(6), - name.ljust(15), - extra_table_line - ) - ) - - pid_badness_list.append((pid, badness)) - - real_proc_num = len(pid_badness_list) - - # Make list of (pid, badness) tuples, sorted by 'badness' values - # print(pid_badness_list) - pid_tuple_list = sorted( - pid_badness_list, - key=itemgetter(1), - reverse=True - )[0] - - pid = pid_tuple_list[0] - victim_id = get_victim_id(pid) - - # Get maximum 'badness' value - victim_badness = pid_tuple_list[1] - victim_name = pid_to_name(pid) - - if _print_proc_table: - log(hr) - - log('Found {} processes with existing /proc/[pid]/exe realpath'.format( - real_proc_num)) - - log( - 'Process with highest badness (found in {} ms):\n PID: {}, Na' - 'me: {}, badness: {}'.format( - round((time() - ft1) * 1000), - pid, - victim_name, - victim_badness - ) - ) - - return pid, victim_badness, victim_name, victim_id - - -def find_victim_info(pid, victim_badness, name): - """ - """ - status0 = time() - - try: - - with open('/proc/' + pid + '/status') as f: - - for n, line in enumerate(f): - - if n is state_index: - state = line.split('\t')[1].rstrip() - continue - - """ - if n is ppid_index: - # ppid = line.split('\t')[1] - continue - """ - - if n is uid_index: - uid = line.split('\t')[2] - continue - - if n is vm_size_index: - vm_size = kib_to_mib(int(line.split('\t')[1][:-4])) - continue - - if n is vm_rss_index: - vm_rss = kib_to_mib(int(line.split('\t')[1][:-4])) - continue - - if detailed_rss: - - if n is anon_index: - anon_rss = kib_to_mib( - int(line.split('\t')[1][:-4])) - continue - - if n is file_index: - file_rss = kib_to_mib( - int(line.split('\t')[1][:-4])) - continue - - if n is shmem_index: - shmem_rss = kib_to_mib( - int(line.split('\t')[1][:-4])) - continue - - if n is vm_swap_index: - vm_swap = kib_to_mib(int(line.split('\t')[1][:-4])) - break - - if print_victim_cmdline: - cmdline = pid_to_cmdline(pid) - oom_score = rline1('/proc/' + pid + '/oom_score') - oom_score_adj = rline1('/proc/' + pid + '/oom_score_adj') - - except FileNotFoundError: - log('The victim died in the search process: FileNotFoundError') - update_stat_dict_and_print( - 'The victim died in the search process: FileNotFoundError') - return None - except ProcessLookupError: - log('The victim died in the search process: ProcessLookupError') - update_stat_dict_and_print( - 'The victim died in the search process: ProcessLookupError') - return None - except UnicodeDecodeError: - - with open('/proc/' + pid + '/status', 'rb') as f: - f_list = f.read().decode('utf-8', 'ignore').split('\n') - - for i in range(len(f_list)): - - if i is state_index: - state = f_list[i].split('\t')[1].rstrip() - - """ - if i is ppid_index: - pass - # ppid = f_list[i].split('\t')[1] - """ - - if i is uid_index: - uid = f_list[i].split('\t')[2] - - if i is vm_size_index: - vm_size = kib_to_mib( - int(f_list[i].split('\t')[1][:-3])) - - if i is vm_rss_index: - vm_rss = kib_to_mib(int(f_list[i].split('\t')[1][:-3])) - - if detailed_rss: - - if i is anon_index: - anon_rss = kib_to_mib( - int(f_list[i].split('\t')[1][:-3])) - - if i is file_index: - file_rss = kib_to_mib( - int(f_list[i].split('\t')[1][:-3])) - - if i is shmem_index: - shmem_rss = kib_to_mib( - int(f_list[i].split('\t')[1][:-3])) - - if i is vm_swap_index: - vm_swap = kib_to_mib( - int(f_list[i].split('\t')[1][:-3])) - - if print_victim_cmdline: - cmdline = pid_to_cmdline(pid) - oom_score = rline1('/proc/' + pid + '/oom_score') - oom_score_adj = rline1('/proc/' + pid + '/oom_score_adj') - - except IndexError: - log('The victim died in the search process: IndexError') - update_stat_dict_and_print( - 'The victim died in the search process: IndexError') - return None - except ValueError: - log('The victim died in the search process: ValueError') - update_stat_dict_and_print( - 'The victim died in the search process: ValueError') - return None - except FileNotFoundError: - log('The victim died in the search process: FileNotFoundError') - update_stat_dict_and_print( - 'The victim died in the search process: FileNotFoundError') - return None - except ProcessLookupError: - log('The victim died in the search process: ProcessLookupError') - update_stat_dict_and_print( - 'The victim died in the search process: ProcessLookupError') - return None - - len_vm = len(str(vm_size)) - - try: - realpath = os.path.realpath('/proc/' + pid + '/exe') - victim_lifetime = format_time(uptime() - pid_to_starttime(pid)) - victim_cgroup_v1 = pid_to_cgroup_v1(pid) - victim_cgroup_v2 = pid_to_cgroup_v2(pid) - - except FileNotFoundError: - log('The victim died in the search process: FileNotFoundError') - update_stat_dict_and_print( - 'The victim died in the search process: FileNotFoundError') - return None - - ancestry = pid_to_ancestry(pid, max_victim_ancestry_depth) - - if print_victim_cmdline is False: - cmdline = '' - c1 = '' - else: - c1 = '\n Cmdline: ' - - if detailed_rss: - detailed_rss_info = ' (' \ - 'Anon: {} MiB, ' \ - 'File: {} MiB, ' \ - 'Shmem: {} MiB)'.format( - anon_rss, - file_rss, - shmem_rss) - else: - detailed_rss_info = '' - - victim_info = 'Victim status (found in {} ms):' \ - '\n Name: {}' \ - '\n State: {}' \ - '\n PID: {}' \ - '{}' \ - '\n EUID: {}' \ - '\n badness: {}, ' \ - 'oom_score: {}, ' \ - 'oom_score_adj: {}' \ - '\n VmSize: {} MiB' \ - '\n VmRSS: {} MiB {}' \ - '\n VmSwap: {} MiB' \ - '\n CGroup_v1: {}' \ - '\n CGroup_v2: {}' \ - '\n Realpath: {}' \ - '{}{}' \ - '\n Lifetime: {}'.format( - round((time() - status0) * 1000), - name, - state, - pid, - ancestry, - uid, - victim_badness, - oom_score, - oom_score_adj, - vm_size, - str(vm_rss).rjust(len_vm), - detailed_rss_info, - str(vm_swap).rjust(len_vm), - victim_cgroup_v1, - victim_cgroup_v2, - realpath, - c1, cmdline, - victim_lifetime) - - return victim_info - - -def check_mem_swap_ex(): - """ - Check: is mem and swap threshold exceeded? - Return: None, (SIGTERM, meminfo), (SIGKILL, meminfo) - """ - - mem_available, swap_total, swap_free = check_mem_and_swap() - - # if hard_threshold_min_swap is set in percent - if swap_kill_is_percent: - hard_threshold_min_swap_kb = swap_total * \ - hard_threshold_min_swap_percent / 100.0 - else: - hard_threshold_min_swap_kb = swap_kb_dict['hard_threshold_min_swap_kb'] - - if swap_term_is_percent: - soft_threshold_min_swap_kb = swap_total * \ - soft_threshold_min_swap_percent / 100.0 - else: - soft_threshold_min_swap_kb = swap_kb_dict['soft_threshold_min_swap_kb'] - - if swap_warn_is_percent: - warning_threshold_min_swap_kb = swap_total * \ - warning_threshold_min_swap_percent / 100.0 - else: - warning_threshold_min_swap_kb = swap_kb_dict['warning_threshold_min_swap_kb'] - - if swap_total > hard_threshold_min_swap_kb: - swap_sigkill_pc = percent( - hard_threshold_min_swap_kb / (swap_total + 0.1)) - else: - swap_sigkill_pc = '-' - - if swap_total > soft_threshold_min_swap_kb: - swap_sigterm_pc = percent( - soft_threshold_min_swap_kb / (swap_total + 0.1)) - else: - swap_sigterm_pc = '-' - - if (mem_available <= hard_threshold_min_mem_kb and - swap_free <= hard_threshold_min_swap_kb): - - mem_info = 'Memory status that requ' \ - 'ires corrective actions (hard threshold exceeded):' \ - '\n MemAvailable [{} MiB, {} %] <= mem_min_sig' \ - 'kill [{} MiB, {} %]\n SwapFree [{} MiB, {} %] <= swa' \ - 'p_min_sigkill [{} MiB, {} %]'.format( - kib_to_mib(mem_available), - percent(mem_available / mem_total), - kib_to_mib(hard_threshold_min_mem_kb), - percent(hard_threshold_min_mem_kb / mem_total), - kib_to_mib(swap_free), - percent(swap_free / (swap_total + 0.1)), - kib_to_mib(hard_threshold_min_swap_kb), - swap_sigkill_pc) - - return (SIGKILL, mem_info, mem_available, hard_threshold_min_swap_kb, - soft_threshold_min_swap_kb, swap_free, swap_total) - - if (mem_available <= soft_threshold_min_mem_kb and - swap_free <= soft_threshold_min_swap_kb): - - mem_info = 'Memory status that requi' \ - 'res corrective actions (soft threshold exceeded):' \ - '\n MemAvailable [{} MiB, {} %] <= mem_min_sig' \ - 'term [{} MiB, {} %]\n SwapFree [{} MiB, {} %] <= swa' \ - 'p_min_sigterm [{} MiB, {} %]'.format( - kib_to_mib(mem_available), - percent(mem_available / mem_total), - kib_to_mib(soft_threshold_min_mem_kb), - round(soft_threshold_min_mem_percent, 1), - kib_to_mib(swap_free), - percent(swap_free / (swap_total + 0.1)), - kib_to_mib(soft_threshold_min_swap_kb), - swap_sigterm_pc) - - return (SIGTERM, mem_info, mem_available, hard_threshold_min_swap_kb, - soft_threshold_min_swap_kb, swap_free, swap_total) - - if low_memory_warnings_enabled: - - if (mem_available <= warning_threshold_min_mem_kb and swap_free <= - warning_threshold_min_swap_kb + 0.1): - return ('WARN', None, mem_available, hard_threshold_min_swap_kb, - soft_threshold_min_swap_kb, swap_free, swap_total) - - return (None, None, mem_available, hard_threshold_min_swap_kb, - soft_threshold_min_swap_kb, swap_free, swap_total) - - -def check_zram_ex(): - """ - """ - mem_used_zram = check_zram() - - if mem_used_zram >= hard_threshold_max_zram_kb: - - mem_info = 'Memory status that requir' \ - 'es corrective actions (hard threshold exceeded):' \ - '\n MemUsedZram [{} MiB, {} %] >= zram_max_sig' \ - 'kill [{} MiB, {} %]'.format( - kib_to_mib(mem_used_zram), - percent(mem_used_zram / mem_total), - kib_to_mib(hard_threshold_max_zram_kb), - percent(hard_threshold_max_zram_kb / mem_total)) - - return SIGKILL, mem_info, mem_used_zram - - if mem_used_zram >= soft_threshold_max_zram_kb: - - mem_info = 'Memory status that requires corrective actions (soft th' \ - 'reshold exceeded):\n MemUsedZram [{} MiB, {} %] >= zram_max_s' \ - 'igterm [{} M, {} %]'.format( - kib_to_mib(mem_used_zram), - percent(mem_used_zram / mem_total), - kib_to_mib(soft_threshold_max_zram_kb), - percent(soft_threshold_max_zram_kb / mem_total)) - - return SIGTERM, mem_info, mem_used_zram - - if low_memory_warnings_enabled: - if mem_used_zram >= warning_threshold_max_zram_kb: - return 'WARN', None, mem_used_zram - - return None, None, mem_used_zram - - -def check_psi_ex(psi_t0, psi_kill_exceeded_timer, psi_term_exceeded_timer, x0): - """ - """ - - delta0 = time() - x0 - x0 = time() - - psi_avg_value = find_psi_metrics_value(psi_path, psi_metrics) - # print(psi_avg_value) - - psi_post_action_delay_timer = time() - last_action_dict['t'] # psi_t0 - - if psi_post_action_delay_timer >= psi_post_action_delay: - psi_post_action_delay_exceeded = True - else: - psi_post_action_delay_exceeded = False - - if psi_avg_value >= hard_threshold_max_psi: - sigkill_psi_exceeded = True - psi_kill_exceeded_timer += delta0 - else: - sigkill_psi_exceeded = False - psi_kill_exceeded_timer = 0 - - if debug_psi: - - log('psi_post_action_delay_timer: {}'.format( - round(psi_post_action_delay_timer, 3))) - - log('psi_post_action_delay_exceeded: {}\nsigkill_psi_exceeded' - ': {}\npsi_kill_exceeded_timer: {}'.format( - psi_post_action_delay_exceeded, - sigkill_psi_exceeded, - round(psi_kill_exceeded_timer, 1) - ) - ) - - if (psi_kill_exceeded_timer >= psi_excess_duration and - psi_post_action_delay_exceeded): - - mem_info = 'PSI avg ({}) > hard_threshold_max_psi ({})\n' \ - 'PSI avg exceeded psi_excess_duration (value' \ - ' = {} sec) for {} seconds'.format( - psi_avg_value, - hard_threshold_max_psi, - psi_excess_duration, - round(psi_kill_exceeded_timer, 1) - ) - - return (SIGKILL, mem_info, psi_t0, psi_kill_exceeded_timer, - psi_term_exceeded_timer, x0) - - if psi_avg_value >= soft_threshold_max_psi: - sigterm_psi_exceeded = True - psi_term_exceeded_timer += delta0 - else: - sigterm_psi_exceeded = False - psi_term_exceeded_timer = 0 - - if debug_psi: - - log('sigterm_psi_exceeded: {}\n' - 'psi_term_exceeded_timer: {}\n'.format( - sigterm_psi_exceeded, - round(psi_term_exceeded_timer, 1) - ) - ) - - if (psi_term_exceeded_timer >= psi_excess_duration and - psi_post_action_delay_exceeded): - - mem_info = 'PSI avg ({}) > soft_threshold_max_psi ({})\n' \ - 'PSI avg exceeded psi_excess_duration (value' \ - ' = {} sec) for {} seconds'.format( - psi_avg_value, - soft_threshold_max_psi, - psi_excess_duration, - round(psi_term_exceeded_timer, 1) - ) - - return (SIGTERM, mem_info, psi_t0, psi_kill_exceeded_timer, - psi_term_exceeded_timer, x0) - - if low_memory_warnings_enabled: - - if psi_avg_value >= warning_threshold_max_psi: - return ('WARN', None, psi_t0, psi_kill_exceeded_timer, - psi_term_exceeded_timer, x0) - - return (None, None, psi_t0, psi_kill_exceeded_timer, - psi_term_exceeded_timer, x0) - - -def is_victim_alive(victim_id): - """ - We do not have a reliable sign of the end of the release of memory: - https://github.com/rfjakob/earlyoom/issues/128#issuecomment-507023717 - - Варианты возврата: - 0 X, nonexist, другой процесс (полн конец имплементации, можно не делать POST SIGKILL DELAY) - 1 rp true - 2 R освобождает память. Ждем смерти. - 3 Z возможно уже освободил память. Конец отслеживания - """ - - # Проверка целостности жертвы - starttime, pid = victim_id.split('_pid') - new_victim_id = get_victim_id(pid) - if victim_id != new_victim_id: - return 0 - - # Жива ли жертва? - exe_exists = os.path.exists('/proc/{}/exe'.format(pid)) - if exe_exists: - return 1 - - # далее жертва смертельно ранена. Дифференцируемся по State. - # R -> 2 # отслеживать жертву дальше - # X, FNFE, PLE -> 0 - - state = pid_to_state(pid) - - if state == 'R': - return 2 - - if state == 'Z': - return 3 - - if state == 'X' or state == '': - return 0 - - return 0 - - -def implement_corrective_action( - threshold, - mem_info_list, - psi_t0, - psi_kill_exceeded_timer, - psi_term_exceeded_timer, - x0, - psi_threshold, - zram_threshold, - zram_info, - psi_info): - - log('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>') - - debug_corrective_action = True - - time0 = time() - - # 1. Очистка словаря от мертвых. Итерация по словарю, отслеживание умирающих. - # 2. Итерация по оставшемуся словарю. Поиск дельт. Если хоть у одного - # дельта НЕ истекла - ЖДЕМ, выход из фции. - - # print(v_dict) - nu = [] - - for victim_id in v_dict: - iva = is_victim_alive(victim_id) - #print(iva, victim_id) - if iva == 0 or iva == 3: - nu.append(victim_id) - """ - continue - if iva == 1: - continue - if iva == 2: - pass # быстро отследить умирающего - """ - - for i in nu: - if debug_corrective_action: - log('Remove {} from v_dict'.format(i)) - v_dict.pop(i) - - x = False - cache_list = [] - #cache_list.append(('foo', 0.01)) - #cache_list.append(('boo', 1111.01)) - # 2 - # print(v_dict) - - for victim_id in v_dict: - tx = v_dict[victim_id]['time'] - ddt = time() - tx - if ddt < victim_cache_time: - - if debug_corrective_action: - log( - 'victim_cache_time is not exceeded for {} ({} < {})'.format( - victim_id, round(ddt, 3), victim_cache_time - ) - ) - x = True - cache_list.append((victim_id, ddt)) - break - - if x: - # print(cache_list) - e = sorted(cache_list, key=itemgetter(1), reverse=False) - cached_victim_id = e[0][0] - - for i in mem_info_list: - log(i) - - if x: - victim_id = cached_victim_id - pid = victim_id.partition('_pid')[2] - victim_badness = pid_to_badness(pid)[0] - name = v_dict[victim_id]['name'] - log('New victim is cached victim {} ({})'.format(pid, name)) - else: - pid, victim_badness, name, victim_id = find_victim(print_proc_table) - - log('Recheck memory levels...') - - (masf_threshold, masf_info, mem_available, hard_threshold_min_swap_kb, - soft_threshold_min_swap_kb, swap_free, swap_total) = check_mem_swap_ex() - - if CHECK_ZRAM: - zram_threshold, zram_info, mem_used_zram = check_zram_ex() - - if CHECK_PSI: - (psi_threshold, psi_info, psi_t0, psi_kill_exceeded_timer, - psi_term_exceeded_timer, x0) = check_psi_ex( - psi_t0, psi_kill_exceeded_timer, psi_term_exceeded_timer, x0) - - if (masf_threshold is SIGKILL or zram_threshold is SIGKILL or - psi_threshold is SIGKILL): - - new_threshold = SIGKILL - mem_info_list = [] - - if masf_threshold is SIGKILL or masf_threshold is SIGTERM: - mem_info_list.append(masf_info) - - if zram_threshold is SIGKILL or zram_threshold is SIGTERM: - mem_info_list.append(zram_info) - - if psi_threshold is SIGKILL or psi_threshold is SIGTERM: - mem_info_list.append(psi_info) - - elif (masf_threshold is SIGTERM or zram_threshold is SIGTERM or - psi_threshold is SIGTERM): - - new_threshold = SIGTERM - mem_info_list = [] - - if masf_threshold is SIGKILL or masf_threshold is SIGTERM: - mem_info_list.append(masf_info) - - if zram_threshold is SIGKILL or zram_threshold is SIGTERM: - mem_info_list.append(zram_info) - - if psi_threshold is SIGKILL or psi_threshold is SIGTERM: - mem_info_list.append(psi_info) - - else: - log('Thresholds is not exceeded now') - return psi_t0 - - for i in mem_info_list: - log(i) - - if new_threshold is None or new_threshold == 'WARN': - log('Thresholds is not exceeded now') - return psi_t0 - - threshold = new_threshold - - vwd = None # Victim Will Die - - if victim_badness >= min_badness: - - if threshold is SIGTERM: - if victim_id in v_dict: - dt = time() - v_dict[victim_id]['time'] - if dt > max_soft_exit_time: - log('max_soft_exit_time is exceeded: the ' - 'victim will get SIGKILL') - threshold = SIGKILL - else: - log('max_soft_exit_time is not exceeded (' - '{} < {}) for the victim'.format(round( - dt, 1), max_soft_exit_time)) - - if debug_sleep: - log('Sleep {} sec (over_sleep)'.format(over_sleep)) - sleep(over_sleep) - - log('<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<') - - return psi_t0 - - # log('Try to implement a corrective action...') - - if print_victim_status: - # victim badness ищи снова, не полагайся на старое - victim_info = find_victim_info(pid, victim_badness, name) - log(victim_info) - - soft_match = False - if soft_actions and threshold is SIGTERM: - name = pid_to_name(pid) - cgroup_v1 = pid_to_cgroup_v1(pid) - service = '' - cgroup_v1_tail = cgroup_v1.rpartition('/')[2] - if cgroup_v1_tail.endswith('.service'): - service = cgroup_v1_tail - for i in soft_actions_list: - unit = i[0] - if unit == 'name': - u = name - else: - u = cgroup_v1 - regexp = i[1] - command = i[2] - - if search(regexp, u) is not None: - log("Regexp '{}' matches with {} '{}'".format( - regexp, unit, u)) - soft_match = True - break - - if soft_match: - - cmd = command.replace('$PID', pid).replace('$NAME', pid_to_name( - pid)).replace('$SERVICE', service) - go(exe, cmd) - - """ - if exit_status == 0: - success = True - else: - success = False - """ - - response_time = time() - time0 - - exit_status = None - - preventing_oom_message = 'Implement a corrective act' \ - 'ion:\n Run the command: {}' \ - '\n Exit status: {}; total response ' \ - 'time: {} ms'.format( - cmd, - exit_status, - round(response_time * 1000)) - - else: - - try: - os.kill(int(pid), threshold) - - response_time = time() - time0 - - send_result = 'total response time: {} ms'.format( - round(response_time * 1000)) - - preventing_oom_message = 'Implement a corrective action:' \ - '\n Send {} to the victim; {}'.format( - sig_dict[threshold], send_result) - - # success = True - - if threshold is SIGKILL: - vwd = True - - except FileNotFoundError: - vwd = True - # success = False - # response_time = time() - time0 - # send_result = 'no such process; response time: {} ms'.format(round(response_time * 1000)) - key = 'The victim died in the search process: ' \ - 'FileNotFoundError' - except ProcessLookupError: - vwd = True - # success = False - # response_time = time() - time0 - # send_result = 'no such process; response time: {} ms'.format(round(response_time * 1000)) - key = 'The victim died in the search process: ' \ - 'ProcessLookupError' - - try: - log(preventing_oom_message) - except UnboundLocalError: - pass - # preventing_oom_message = key - - if not vwd: - if victim_id not in v_dict: - v_dict[victim_id] = dict() - v_dict[victim_id]['time'] = time() - v_dict[victim_id]['name'] = name - else: - pass - - last_action_dict['t'] = kill_timestamp = time() - - # print(v_dict) - - # response_time = time() - time0 - - # log('success: ' + str(success)) - # log('victim will die: ' + str(vwd)) - # log('response_time: ' + str(response_time) + ' sec') - - # НАЧАЛО ОТСЛЕЖИВАНИЯ СОСТОЯНИЯ ЖЕРТВЫ. Можно вынести в отд фц. Приним - # айди, логирует, возвращает что-то. - - # Далее поработать со словарями. Жертва тут умерла - сброс таймера. Все - # старые жертвы умерли до 3х секунд с следующих циклах - сброс таймера. - # После этого все должно быть супер охуенно. - - while True: - sleep(0.005) - d = time() - kill_timestamp - #print('Прошло времени:', d) - iva = is_victim_alive(victim_id) - - if iva == 0: - - log('The victim died in {} sec'.format(round(d, 3))) - - if victim_id in v_dict: - v_dict.pop(victim_id) - break - - elif iva == 1: - #print('Жива и занимает память') - if not vwd and d > sensitivity_test_time: - - log("The victim doesn't respond on corrective action in {} sec".format( - round(d, 3))) - - break - - elif iva == 2: - pass - #print('Смертельно ранена и освобождает память. Дождаться окончания освобождения памяти.') - - else: # 3 - #print('Z и быстро освобождает память, если еще не. Поспать немножно и выйти из цикла.') - - log('The victim became a zombie in {} sec'.format(round(d, 3))) - - if victim_id in v_dict: - v_dict.pop(victim_id) - sleep(post_zombie_delay) - break - - mem_available, swap_total, swap_free = check_mem_and_swap() - ma_mib = int(mem_available) / 1024.0 - sf_mib = int(swap_free) / 1024.0 - log('Memory status after implementing a corrective act' - 'ion:\n MemAvailable' - ': {} MiB, SwapFree: {} MiB'.format( - round(ma_mib, 1), round(sf_mib, 1))) - - if soft_match is False: - key = 'Send {} to {}'.format(sig_dict[threshold], name) - update_stat_dict_and_print(key) - else: - key = "Run the command '{}'".format(command) - update_stat_dict_and_print(key) - - if threshold is SIGKILL and post_kill_exe != '': - - cmd = post_kill_exe.replace('$PID', pid).replace( - '$NAME', pid_to_name(pid)) - - log('Execute post_kill_exe') - - go(exe, cmd) - - if post_action_gui_notifications: - if soft_match: - send_notify_etc(pid, name, cmd) - else: - send_notify(threshold, name, pid) - - else: - - response_time = time() - time0 - victim_badness_is_too_small = 'victim badness ({}) < min_b' \ - 'adness ({}); nothing to do; response time: {} ms'.format( - victim_badness, - min_badness, - round(response_time * 1000)) - - log(victim_badness_is_too_small) - - # update stat_dict - key = 'victim badness < min_badness' - update_stat_dict_and_print(key) - - if vwd is None: - - if debug_sleep: - log('Sleep {} sec (over_sleep)'.format(over_sleep)) - sleep(over_sleep) - - log('<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<') - - return psi_t0 - - -def sleep_after_check_mem(): - """Specify sleep times depends on rates and avialable memory.""" - - if stable_sleep: - - if debug_sleep: - log('Sleep {} sec'.format(min_sleep)) - stdout.flush() - sleep(min_sleep) - return None - - if hard_threshold_min_mem_kb < soft_threshold_min_mem_kb: - mem_point = mem_available - soft_threshold_min_mem_kb - else: - mem_point = mem_available - hard_threshold_min_mem_kb - - if hard_threshold_min_swap_kb < soft_threshold_min_swap_kb: - swap_point = swap_free - soft_threshold_min_swap_kb - else: - swap_point = swap_free - hard_threshold_min_swap_kb - - if swap_point < 0: - swap_point = 0 - - if mem_point < 0: - mem_point = 0 - - t_mem = mem_point / fill_rate_mem - t_swap = swap_point / fill_rate_swap - - if CHECK_ZRAM: - t_zram = (mem_total * 0.8 - mem_used_zram) / fill_rate_zram - if t_zram < 0: - t_zram = 0 - t_mem_zram = t_mem + t_zram - z = ', t_zram={}'.format(round(t_zram, 2)) - else: - z = '' - - t_mem_swap = t_mem + t_swap - - if CHECK_ZRAM: - - if t_mem_swap <= t_mem_zram: - t = t_mem_swap - else: - t = t_mem_zram - else: - t = t_mem_swap - - if t > max_sleep: - t = max_sleep - elif t < min_sleep: - t = min_sleep - else: - pass - - if debug_sleep: - log('Sleep {} sec (t_mem={}, t_swap={}{})'.format(round(t, 2), round( - t_mem, 2), round(t_swap, 2), z)) - - try: - stdout.flush() - except OSError: - pass - - sleep(t) - - -def calculate_percent(arg_key): - """ - parse conf dict - Calculate mem_min_KEY_percent. - - Try use this one) - arg_key: str key for config_dict - returns int mem_min_percent or NoneType if got some error - """ - - if arg_key in config_dict: - mem_min = config_dict[arg_key] - - if mem_min.endswith('%'): - # truncate percents, so we have a number - mem_min_percent = mem_min[:-1].strip() - # then 'float test' - mem_min_percent = string_to_float_convert_test(mem_min_percent) - if mem_min_percent is None: - errprint('Invalid {} value, not float\nExit'.format(arg_key)) - exit(1) - # Final validations... - if mem_min_percent < 0 or mem_min_percent > 100: - errprint( - '{}, as percents value, out of ran' - 'ge [0; 100]\nExit'.format(arg_key)) - exit(1) - - # soft_threshold_min_mem_percent is clean and valid float percentage. Can - # translate into Kb - mem_min_kb = mem_min_percent / 100 * mem_total - mem_min_mb = round(mem_min_kb / 1024) - - elif mem_min.endswith('M'): - mem_min_mb = string_to_float_convert_test(mem_min[:-1].strip()) - if mem_min_mb is None: - errprint('Invalid {} value, not float\nExit'.format(arg_key)) - exit(1) - mem_min_kb = mem_min_mb * 1024 - if mem_min_kb > mem_total: - errprint( - '{} value can not be greater then MemT' - 'otal ({} MiB)\nExit'.format( - arg_key, round( - mem_total / 1024))) - exit(1) - mem_min_percent = mem_min_kb / mem_total * 100 - - else: - log('Invalid {} units in config.\n Exit'.format(arg_key)) - exit(1) - mem_min_percent = None - - else: - log('{} not in config\nExit'.format(arg_key)) - exit(1) - mem_min_percent = None - - return mem_min_kb, mem_min_mb, mem_min_percent - - -########################################################################## - - -# {victim_id : {'time': timestamp, 'name': name} -v_dict = dict() - - -start_time = time() - - -help_mess = """usage: nohang [-h] [-v] [-p] [-c CONFIG] [-cc CONFIG] - -optional arguments: - -h, --help show this help message and exit - -v, --version print version - -p, --print-proc-table - print table of processes with their badness values - -c CONFIG, --config CONFIG - path to the config file, default values: - ./nohang.conf, /etc/nohang/nohang.conf - -cc CONFIG, --check-config CONFIG - check and print config""" - - -SC_CLK_TCK = os.sysconf(os.sysconf_names['SC_CLK_TCK']) - -SC_PAGESIZE = os.sysconf(os.sysconf_names['SC_PAGESIZE']) - -conf_err_mess = 'Invalid config. Exit.' - -sig_list = [SIGTERM, SIGINT, SIGQUIT, SIGHUP] - -sig_dict = { - SIGKILL: 'SIGKILL', - SIGINT: 'SIGINT', - SIGQUIT: 'SIGQUIT', - SIGHUP: 'SIGHUP', - SIGTERM: 'SIGTERM' -} - -self_pid = str(os.getpid()) - -self_uid = os.geteuid() - -if self_uid == 0: - root = True -else: - root = False - - -if os.path.exists('./nohang_notify_helper'): - notify_helper_path = './nohang_notify_helper' -else: - notify_helper_path = 'nohang_notify_helper' - - -last_action_dict = dict() - -last_action_dict['t'] = time() - - -# will store corrective actions stat -stat_dict = dict() - - -separate_log = False # will be overwritten after parse config - - -cgroup_v1_index, cgroup_v2_index = find_cgroup_indexes() - - -self_oom_score_adj_min = '-600' -self_oom_score_adj_max = '-6' - - -write_self_oom_score_adj(self_oom_score_adj_min) - - -pid_list = get_pid_list() - - -print_proc_table_flag = False - -check_config_flag = False - - -if os.path.exists('./nohang.conf'): - config = os.getcwd() + '/nohang.conf' -else: - config = '/etc/nohang/nohang.conf' - - -if len(argv) == 1: - pass -elif len(argv) == 2: - if argv[1] == '--help' or argv[1] == '-h': - print(help_mess) - exit() - elif argv[1] == '--check-config' or argv[1] == '-cc': - check_config_flag = True - elif argv[1] == '--version' or argv[1] == '-v': - print_version() - elif argv[1] == '--print-proc-table' or argv[1] == '-p': - print_proc_table_flag = True - if os.path.exists('./nohang.conf'): - config = os.getcwd() + '/nohang.conf' - else: - config = '/etc/nohang/nohang.conf' - else: - errprint('Unknown option: {}'.format(argv[1])) - exit(1) -elif len(argv) == 3: - if argv[1] == '--config' or argv[1] == '-c': - config = argv[2] - elif argv[1] == '--check-config' or argv[1] == '-cc': - config = argv[2] - check_config_flag = True - else: - errprint('Unknown option: {}'.format(argv[1])) - exit(1) -else: - errprint('Invalid CLI input: too many options') - exit(1) - - -# find mem_total -# find positions of SwapFree and SwapTotal in /proc/meminfo - -with open('/proc/meminfo') as f: - mem_list = f.readlines() - -mem_list_names = [] -for s in mem_list: - mem_list_names.append(s.split(':')[0]) - -if mem_list_names[2] != 'MemAvailable': - errprint('WARNING: Your Linux kernel is too old, Linux 3.14+ requied') - exit(1) - -swap_total_index = mem_list_names.index('SwapTotal') -swap_free_index = swap_total_index + 1 - -mem_total = int(mem_list[0].split(':')[1][:-4]) - -# Get names from /proc/*/status to be able to get VmRSS and VmSwap values - -with open('/proc/self/status') as file: - status_list = file.readlines() - -status_names = [] -for s in status_list: - status_names.append(s.split(':')[0]) - -ppid_index = status_names.index('PPid') -vm_size_index = status_names.index('VmSize') -vm_rss_index = status_names.index('VmRSS') -vm_swap_index = status_names.index('VmSwap') -uid_index = status_names.index('Uid') -state_index = status_names.index('State') - - -try: - anon_index = status_names.index('RssAnon') - file_index = status_names.index('RssFile') - shmem_index = status_names.index('RssShmem') - detailed_rss = True - # print(detailed_rss, 'detailed_rss') -except ValueError: - detailed_rss = False - # print('It is not Linux 4.5+') - - -log('config: ' + config) - - -########################################################################## - -# parsing the config with obtaining the parameters dictionary - -# conf_parameters_dict -# conf_restart_dict - -# dictionary with config options -config_dict = dict() - -badness_adj_re_name_list = [] -badness_adj_re_cmdline_list = [] -badness_adj_re_environ_list = [] -badness_adj_re_uid_list = [] -badness_adj_re_cgroup_v1_list = [] -badness_adj_re_cgroup_v2_list = [] -badness_adj_re_realpath_list = [] - -soft_actions_list = [] - -# separator for optional parameters (that starts with @) -opt_separator = '///' - -# stupid conf parsing, need refactoring -try: - with open(config) as f: - - for line in f: - - a = line.startswith('#') - b = line.startswith('\n') - c = line.startswith('\t') - d = line.startswith(' ') - - etc = line.startswith('@SOFT_ACTION_RE_NAME') - etc2 = line.startswith('@SOFT_ACTION_RE_CGROUP_V1') - - if not a and not b and not c and not d and not etc and not etc2: - a = line.partition('=') - - key = a[0].strip() - value = a[2].strip() - - if key not in config_dict: - config_dict[key] = value - else: - log('ERROR: config key duplication: {}'.format(key)) - exit(1) - - if etc: - - a = line.partition('@SOFT_ACTION_RE_NAME')[ - 2].partition(opt_separator) - - a1 = 'name' - - a2 = a[0].strip() - valid_re(a2) - - a3 = a[2].strip() - - zzz = (a1, a2, a3) - - soft_actions_list.append(zzz) - - if etc2: - - a = line.partition('@SOFT_ACTION_RE_CGROUP_V1')[ - 2].partition(opt_separator) - - a1 = 'cgroup_v1' - - a2 = a[0].strip() - valid_re(a2) - - a3 = a[2].strip() - - zzz = (a1, a2, a3) - - soft_actions_list.append(zzz) - - if line.startswith('@BADNESS_ADJ_RE_NAME'): - a = line.partition('@BADNESS_ADJ_RE_NAME')[2].strip( - ' \n').partition(opt_separator) - badness_adj = a[0].strip(' ') - reg_exp = a[2].strip(' ') - valid_re(reg_exp) - badness_adj_re_name_list.append((badness_adj, reg_exp)) - - if line.startswith('@BADNESS_ADJ_RE_CMDLINE'): - a = line.partition('@BADNESS_ADJ_RE_CMDLINE')[2].strip( - ' \n').partition(opt_separator) - badness_adj = a[0].strip(' ') - reg_exp = a[2].strip(' ') - valid_re(reg_exp) - badness_adj_re_cmdline_list.append((badness_adj, reg_exp)) - - if line.startswith('@BADNESS_ADJ_RE_UID'): - a = line.partition('@BADNESS_ADJ_RE_UID')[2].strip( - ' \n').partition(opt_separator) - badness_adj = a[0].strip(' ') - reg_exp = a[2].strip(' ') - valid_re(reg_exp) - badness_adj_re_uid_list.append((badness_adj, reg_exp)) - - if line.startswith('@BADNESS_ADJ_RE_CGROUP_V1'): - a = line.partition('@BADNESS_ADJ_RE_CGROUP_V1')[2].strip( - ' \n').partition(opt_separator) - badness_adj = a[0].strip(' ') - reg_exp = a[2].strip(' ') - valid_re(reg_exp) - badness_adj_re_cgroup_v1_list.append((badness_adj, reg_exp)) - - if line.startswith('@BADNESS_ADJ_RE_CGROUP_V2'): - a = line.partition('@BADNESS_ADJ_RE_CGROUP_V2')[2].strip( - ' \n').partition(opt_separator) - badness_adj = a[0].strip(' ') - reg_exp = a[2].strip(' ') - valid_re(reg_exp) - badness_adj_re_cgroup_v2_list.append((badness_adj, reg_exp)) - - if line.startswith('@BADNESS_ADJ_RE_REALPATH'): - a = line.partition('@BADNESS_ADJ_RE_REALPATH')[2].strip( - ' \n').partition(opt_separator) - badness_adj = a[0].strip(' ') - reg_exp = a[2].strip(' ') - valid_re(reg_exp) - badness_adj_re_realpath_list.append((badness_adj, reg_exp)) - - if line.startswith('@BADNESS_ADJ_RE_ENVIRON'): - a = line.partition('@BADNESS_ADJ_RE_ENVIRON')[2].strip( - ' \n').partition(opt_separator) - badness_adj = a[0].strip(' ') - reg_exp = a[2].strip(' ') - valid_re(reg_exp) - badness_adj_re_environ_list.append((badness_adj, reg_exp)) - - -except PermissionError: - errprint('PermissionError', conf_err_mess) - exit(1) -except UnicodeDecodeError: - errprint('UnicodeDecodeError', conf_err_mess) - exit(1) -except IsADirectoryError: - errprint('IsADirectoryError', conf_err_mess) - exit(1) -except IndexError: - errprint('IndexError', conf_err_mess) - exit(1) -except FileNotFoundError: - errprint('FileNotFoundError', conf_err_mess) - exit(1) - - -if badness_adj_re_name_list == []: - regex_matching = False -else: - regex_matching = True - - -if badness_adj_re_cmdline_list == []: - re_match_cmdline = False -else: - re_match_cmdline = True - - -if badness_adj_re_uid_list == []: - re_match_uid = False -else: - re_match_uid = True - - -if badness_adj_re_environ_list == []: - re_match_environ = False -else: - re_match_environ = True - - -if badness_adj_re_realpath_list == []: - re_match_realpath = False -else: - re_match_realpath = True - - -if badness_adj_re_cgroup_v1_list == []: - re_match_cgroup_v1 = False -else: - re_match_cgroup_v1 = True - - -if badness_adj_re_cgroup_v2_list == []: - re_match_cgroup_v2 = False -else: - re_match_cgroup_v2 = True - - -if soft_actions_list == []: - soft_actions = False -else: - soft_actions = True - - -########################################################################## - - -# post_zombie_delay = 0.1 - -# victim_cache_time = 50 - - -# extracting parameters from the dictionary -# check for all necessary parameters -# validation of all parameters -debug_psi = conf_parse_bool('debug_psi') -print_statistics = conf_parse_bool('print_statistics') -print_proc_table = conf_parse_bool('print_proc_table') -forbid_negative_badness = conf_parse_bool('forbid_negative_badness') -print_victim_status = conf_parse_bool('print_victim_status') -print_victim_cmdline = conf_parse_bool('print_victim_cmdline') -print_config_at_startup = conf_parse_bool('print_config_at_startup') -print_mem_check_results = conf_parse_bool('print_mem_check_results') -debug_sleep = conf_parse_bool('debug_sleep') -low_memory_warnings_enabled = conf_parse_bool('low_memory_warnings_enabled') -post_action_gui_notifications = conf_parse_bool( - 'post_action_gui_notifications') - - -psi_checking_enabled = conf_parse_bool('psi_checking_enabled') -ignore_psi = not psi_checking_enabled - -zram_checking_enabled = conf_parse_bool('zram_checking_enabled') -ignore_zram = not zram_checking_enabled - - -debug_gui_notifications = conf_parse_bool('debug_gui_notifications') -ignore_positive_oom_score_adj = conf_parse_bool( - 'ignore_positive_oom_score_adj') - - -(soft_threshold_min_mem_kb, soft_threshold_min_mem_mb, - soft_threshold_min_mem_percent) = calculate_percent('soft_threshold_min_mem') - -(hard_threshold_min_mem_kb, hard_threshold_min_mem_mb, - hard_threshold_min_mem_percent) = calculate_percent('hard_threshold_min_mem') - -(soft_threshold_max_zram_kb, soft_threshold_max_zram_mb, - soft_threshold_max_zram_percent) = calculate_percent('soft_threshold_max_zram') - -(hard_threshold_max_zram_kb, hard_threshold_max_zram_mb, - hard_threshold_max_zram_percent) = calculate_percent('hard_threshold_max_zram') - -(warning_threshold_min_mem_kb, warning_threshold_min_mem_mb, - warning_threshold_min_mem_percent) = calculate_percent('warning_threshold_min_mem') - -(warning_threshold_max_zram_kb, warning_threshold_max_zram_mb, - warning_threshold_max_zram_percent) = calculate_percent('warning_threshold_max_zram') - - -if 'post_zombie_delay' in config_dict: - post_zombie_delay = string_to_float_convert_test( - config_dict['post_zombie_delay']) - if post_zombie_delay is None: - errprint('Invalid post_zombie_delay, not float\nExit') - exit(1) - if post_zombie_delay < 0: - errprint('post_zombie_delay MUST be >= 0\nExit') - exit(1) -else: - errprint('post_zombie_delay not in config\nExit') - exit(1) - - -if 'victim_cache_time' in config_dict: - victim_cache_time = string_to_float_convert_test( - config_dict['victim_cache_time']) - if victim_cache_time is None: - errprint('Invalid victim_cache_time, not float\nExit') - exit(1) - if victim_cache_time < 0: - errprint('victim_cache_time MUST be >= 0\nExit') - exit(1) -else: - errprint('victim_cache_time not in config\nExit') - exit(1) - - -if 'fill_rate_mem' in config_dict: - fill_rate_mem = string_to_float_convert_test(config_dict['fill_rate_mem']) - if fill_rate_mem is None: - errprint('Invalid fill_rate_mem value, not float\nExit') - exit(1) - if fill_rate_mem <= 0: - errprint('fill_rate_mem MUST be > 0\nExit') - exit(1) -else: - errprint('fill_rate_mem not in config\nExit') - exit(1) - - -if 'fill_rate_swap' in config_dict: - fill_rate_swap = string_to_float_convert_test( - config_dict['fill_rate_swap']) - if fill_rate_swap is None: - errprint('Invalid fill_rate_swap value, not float\nExit') - exit(1) - if fill_rate_swap <= 0: - errprint('fill_rate_swap MUST be > 0\nExit') - exit(1) -else: - errprint('fill_rate_swap not in config\nExit') - exit(1) - - -if 'fill_rate_zram' in config_dict: - fill_rate_zram = string_to_float_convert_test( - config_dict['fill_rate_zram']) - if fill_rate_zram is None: - errprint('Invalid fill_rate_zram value, not float\nExit') - exit(1) - if fill_rate_zram <= 0: - errprint('fill_rate_zram MUST be > 0\nExit') - exit(1) -else: - errprint('fill_rate_zram not in config\nExit') - exit(1) - - -if 'soft_threshold_min_swap' in config_dict: - soft_threshold_min_swap = config_dict['soft_threshold_min_swap'] -else: - errprint('soft_threshold_min_swap not in config\nExit') - exit(1) - - -if 'hard_threshold_min_swap' in config_dict: - hard_threshold_min_swap = config_dict['hard_threshold_min_swap'] -else: - errprint('hard_threshold_min_swap not in config\nExit') - exit(1) - - -if 'post_soft_action_delay' in config_dict: - post_soft_action_delay = string_to_float_convert_test( - config_dict['post_soft_action_delay']) - if post_soft_action_delay is None: - errprint('Invalid post_soft_action_delay value, not float\nExit') - exit(1) - if post_soft_action_delay < 0: - errprint('post_soft_action_delay must be positiv\nExit') - exit(1) -else: - errprint('post_soft_action_delay not in config\nExit') - exit(1) - - -if 'psi_post_action_delay' in config_dict: - psi_post_action_delay = string_to_float_convert_test( - config_dict['psi_post_action_delay']) - if psi_post_action_delay is None: - errprint('Invalid psi_post_action_delay value, not float\nExit') - exit(1) - if psi_post_action_delay < 0: - errprint('psi_post_action_delay must be positive\nExit') - exit(1) -else: - errprint('psi_post_action_delay not in config\nExit') - exit(1) - - -if 'hard_threshold_max_psi' in config_dict: - hard_threshold_max_psi = string_to_float_convert_test( - config_dict['hard_threshold_max_psi']) - if hard_threshold_max_psi is None: - errprint('Invalid hard_threshold_max_psi value, not float\nExit') - exit(1) - if hard_threshold_max_psi < 0 or hard_threshold_max_psi > 100: - errprint('hard_threshold_max_psi must be in the range [0; 100]\nExit') - exit(1) -else: - errprint('hard_threshold_max_psi not in config\nExit') - exit(1) - - -if 'soft_threshold_max_psi' in config_dict: - soft_threshold_max_psi = string_to_float_convert_test( - config_dict['soft_threshold_max_psi']) - if soft_threshold_max_psi is None: - errprint('Invalid soft_threshold_max_psi value, not float\nExit') - exit(1) - if soft_threshold_max_psi < 0 or soft_threshold_max_psi > 100: - errprint('soft_threshold_max_psi must be in the range [0; 100]\nExit') - exit(1) -else: - errprint('soft_threshold_max_psi not in config\nExit') - exit(1) - - -if 'warning_threshold_max_psi' in config_dict: - warning_threshold_max_psi = string_to_float_convert_test( - config_dict['warning_threshold_max_psi']) - if warning_threshold_max_psi is None: - errprint('Invalid warning_threshold_max_psi value, not float\nExit') - exit(1) - if warning_threshold_max_psi < 0 or warning_threshold_max_psi > 100: - errprint( - 'warning_threshold_max_psi must be in the range [0; 100]\nExit') - exit(1) -else: - errprint('warning_threshold_max_psi not in config\nExit') - exit(1) - - -if 'min_badness' in config_dict: - min_badness = string_to_int_convert_test( - config_dict['min_badness']) - if min_badness is None: - errprint('Invalid min_badness value, not integer\nExit') - exit(1) - if min_badness < 0 or min_badness > 1000: - errprint('Invalud min_badness value\nExit') - exit(1) -else: - errprint('min_badness not in config\nExit') - exit(1) - - -if 'min_post_warning_delay' in config_dict: - min_post_warning_delay = string_to_float_convert_test( - config_dict['min_post_warning_delay']) - if min_post_warning_delay is None: - errprint('Invalid min_post_warning_delay value, not float\nExit') - exit(1) - if min_post_warning_delay < 1 or min_post_warning_delay > 300: - errprint('min_post_warning_delay value out of range [1; 300]\nExit') - exit(1) -else: - errprint('min_post_warning_delay not in config\nExit') - exit(1) - - -if 'warning_threshold_min_swap' in config_dict: - warning_threshold_min_swap = config_dict['warning_threshold_min_swap'] -else: - errprint('warning_threshold_min_swap not in config\nExit') - exit(1) - - -if 'max_victim_ancestry_depth' in config_dict: - max_victim_ancestry_depth = string_to_int_convert_test( - config_dict['max_victim_ancestry_depth']) - if min_badness is None: - errprint('Invalid max_victim_ancestry_depth value, not integer\nExit') - exit(1) - if max_victim_ancestry_depth < 1: - errprint('Invalud max_victim_ancestry_depth value\nExit') - exit(1) -else: - errprint('max_victim_ancestry_depth is not in config\nExit') - exit(1) - - -if 'max_soft_exit_time' in config_dict: - max_soft_exit_time = string_to_float_convert_test( - config_dict['max_soft_exit_time']) - if max_soft_exit_time is None: - errprint('Invalid max_soft_exit_time val' - 'ue, not float\nExit') - exit(1) - if max_soft_exit_time < 0: - errprint('max_soft_exit_time must be non-n' - 'egative number\nExit') - exit(1) -else: - errprint('max_soft_exit_time is not in config\nExit') - exit(1) - - -if 'post_kill_exe' in config_dict: - post_kill_exe = config_dict['post_kill_exe'] -else: - errprint('post_kill_exe is not in config\nExit') - exit(1) - - -if 'psi_path' in config_dict: - psi_path = config_dict['psi_path'] -else: - errprint('psi_path is not in config\nExit') - exit(1) - - -if 'psi_metrics' in config_dict: - psi_metrics = config_dict['psi_metrics'] -else: - errprint('psi_metrics is not in config\nExit') - exit(1) - - -if 'warning_exe' in config_dict: - warning_exe = config_dict['warning_exe'] - if warning_exe != '': - check_warning_exe = True - else: - check_warning_exe = False -else: - errprint('warning_exe is not in config\nExit') - exit(1) - - -if 'extra_table_info' in config_dict: - extra_table_info = config_dict['extra_table_info'] - if (extra_table_info != 'None' and - extra_table_info != 'cgroup_v1' and - extra_table_info != 'cgroup_v2' and - extra_table_info != 'cmdline' and - extra_table_info != 'environ' and - extra_table_info != 'realpath'): - - errprint('Invalid config: invalid extra_table_info value\nExit') - exit(1) -else: - errprint('Invalid config: extra_table_info is not in config\nExit') - exit(1) - - -separate_log = conf_parse_bool('separate_log') - -if separate_log: - - import logging - - log_dir = '/var/log/nohang' - - try: - os.mkdir(log_dir) - except PermissionError: - print('ERROR: can not create log dir') - except FileExistsError: - pass - - logfile = log_dir + '/nohang.log' - - try: - with open(logfile, 'a') as f: - pass - except FileNotFoundError: - print('ERROR: log FileNotFoundError') - except PermissionError: - print('ERROR: log PermissionError') - - try: - logging.basicConfig( - filename=logfile, - level=logging.INFO, - format="%(asctime)s: %(message)s") - except PermissionError: - errprint('ERROR: Permission denied: {}'.format(logfile)) - except FileNotFoundError: - errprint('ERROR: FileNotFoundError: {}'.format(logfile)) - - -if 'min_mem_report_interval' in config_dict: - min_mem_report_interval = string_to_float_convert_test( - config_dict['min_mem_report_interval']) - if min_mem_report_interval is None: - errprint('Invalid min_mem_report_interval value, not float\nExit') - exit(1) - if min_mem_report_interval < 0: - errprint('min_mem_report_interval must be non-negative number\nExit') - exit(1) -else: - errprint('min_mem_report_interval is not in config\nExit') - exit(1) - - -if 'psi_excess_duration' in config_dict: - psi_excess_duration = string_to_float_convert_test( - config_dict['psi_excess_duration']) - if psi_excess_duration is None: - errprint('Invalid psi_excess_duration value, not float\nExit') - exit(1) - if psi_excess_duration < 0: - errprint('psi_excess_duration must be non-negative number\nExit') - exit(1) -else: - errprint('psi_excess_duration is not in config\nExit') - exit(1) - - -if 'max_sleep' in config_dict: - max_sleep = string_to_float_convert_test( - config_dict['max_sleep']) - if max_sleep is None: - errprint('Invalid max_sleep value, not float\nExit') - exit(1) - if max_sleep <= 0: - errprint('max_sleep must be positive number\nExit') - exit(1) -else: - errprint('max_sleep is not in config\nExit') - exit(1) - - -if 'min_sleep' in config_dict: - min_sleep = string_to_float_convert_test( - config_dict['min_sleep']) - if min_sleep is None: - errprint('Invalid min_sleep value, not float\nExit') - exit(1) - if min_sleep <= 0: - errprint('min_sleep must be positive number\nExit') - exit(1) -else: - errprint('min_sleep is not in config\nExit') - exit(1) - - -if 'over_sleep' in config_dict: - over_sleep = string_to_float_convert_test( - config_dict['over_sleep']) - if over_sleep is None: - errprint('Invalid over_sleep value, not float\nExit') - exit(1) - if over_sleep <= 0: - errprint('over_sleep must be positive number\nExit') - exit(1) -else: - errprint('over_sleep is not in config\nExit') - exit(1) - - -sensitivity_test_time = over_sleep / 2 - - -if max_sleep < min_sleep: - errprint('min_sleep value must not exceed max_sleep value.\nExit') - exit(1) - - -if min_sleep < over_sleep: - errprint('over_sleep value must not exceed min_sleep value.\nExit') - exit(1) - - -if max_sleep == min_sleep: - stable_sleep = True -else: - stable_sleep = False - - -if print_proc_table_flag: - - if not root: - log('WARNING: effective UID != 0; euid={}; processes with other e' - 'uids will be invisible for nohang'.format(self_uid)) - - func_print_proc_table() - - -########################################################################## - - -psi_support = os.path.exists(psi_path) - - -########################################################################## - -# Get KiB levels if it's possible. - -soft_threshold_min_swap_tuple = get_swap_threshold_tuple( - soft_threshold_min_swap) -hard_threshold_min_swap_tuple = get_swap_threshold_tuple( - hard_threshold_min_swap) -warning_threshold_min_swap_tuple = get_swap_threshold_tuple( - warning_threshold_min_swap) - - -swap_kb_dict = dict() - -swap_term_is_percent = soft_threshold_min_swap_tuple[1] -if swap_term_is_percent: - soft_threshold_min_swap_percent = soft_threshold_min_swap_tuple[0] -else: - soft_threshold_min_swap_kb = soft_threshold_min_swap_tuple[0] - swap_kb_dict['soft_threshold_min_swap_kb'] = soft_threshold_min_swap_kb - -swap_kill_is_percent = hard_threshold_min_swap_tuple[1] -if swap_kill_is_percent: - hard_threshold_min_swap_percent = hard_threshold_min_swap_tuple[0] -else: - hard_threshold_min_swap_kb = hard_threshold_min_swap_tuple[0] - swap_kb_dict['hard_threshold_min_swap_kb'] = hard_threshold_min_swap_kb - - -swap_warn_is_percent = warning_threshold_min_swap_tuple[1] -if swap_warn_is_percent: - warning_threshold_min_swap_percent = warning_threshold_min_swap_tuple[0] -else: - warning_threshold_min_swap_kb = warning_threshold_min_swap_tuple[0] - swap_kb_dict['warning_threshold_min_swap_kb'] = warning_threshold_min_swap_kb - - -########################################################################## - - -if print_config_at_startup or check_config_flag: - check_config() - - -########################################################################## - - -# for calculating the column width when printing mem and zram -mem_len = len(str(round(mem_total / 1024.0))) - -if post_action_gui_notifications: - notify_sig_dict = {SIGKILL: 'Killing', - SIGTERM: 'Terminating'} - - -# convert rates from MiB/s to KiB/s -fill_rate_mem = fill_rate_mem * 1024 -fill_rate_swap = fill_rate_swap * 1024 -fill_rate_zram = fill_rate_zram * 1024 - - -warn_time_now = 0 -warn_time_delta = 1000 -warn_timer = 0 - - -########################################################################## - - -if not root: - log('WARNING: effective UID != 0; euid={}; processes with other e' - 'uids will be invisible for nohang'.format(self_uid)) - - -# Try to lock all memory - -mlockall() - -########################################################################## - - -# print_self_rss() - -psi_avg_string = '' # will be overwritten if PSI monitoring enabled - -mem_used_zram = 0 - - -if print_mem_check_results: - - # to find delta mem - wt2 = 0 - new_mem = 0 - - # init mem report interval - report0 = 0 - - -# handle signals -for i in sig_list: - signal(i, signal_handler) - - -x0 = time() -delta0 = 0 - - -threshold = None -mem_info = None - - -CHECK_PSI = False -if psi_support and not ignore_psi: - CHECK_PSI = True - -psi_kill_exceeded_timer = 0 -psi_term_exceeded_timer = 0 -psi_t0 = time() -psi_threshold = zram_threshold = zram_info = psi_info = None - - -CHECK_ZRAM = not ignore_zram - -log('Monitoring has started!') - -stdout.flush() - - -########################################################################## - - -while True: - - (masf_threshold, masf_info, mem_available, hard_threshold_min_swap_kb, - soft_threshold_min_swap_kb, swap_free, swap_total) = check_mem_swap_ex() - - if CHECK_ZRAM: - zram_threshold, zram_info, mem_used_zram = check_zram_ex() - - if CHECK_PSI: - (psi_threshold, psi_info, psi_t0, psi_kill_exceeded_timer, - psi_term_exceeded_timer, x0) = check_psi_ex( - psi_t0, psi_kill_exceeded_timer, psi_term_exceeded_timer, x0) - - if print_mem_check_results: - - if CHECK_PSI: - psi_avg_value = find_psi_metrics_value(psi_path, psi_metrics) - if time() - psi_t0 >= psi_post_action_delay: - psi_post_action_delay_exceeded = True - else: - psi_post_action_delay_exceeded = False - - if print_mem_check_results: - psi_avg_string = 'PSI avg: {} | '.format( - str(psi_avg_value).rjust(6)) - - wt1 = time() - - delta = (mem_available + swap_free) - new_mem - - t_cycle = wt1 - wt2 - - report_delta = wt1 - report0 - - if report_delta >= min_mem_report_interval: - - mem_report = True - new_mem = mem_available + swap_free - - report0 = wt1 - - else: - mem_report = False - - wt2 = time() - - if mem_report: - - speed = delta / 1024.0 / report_delta - speed_info = ' | dMem: {} M/s'.format( - str(round(speed)).rjust(5) - ) - - # Calculate 'swap-column' width - swap_len = len(str(round(swap_total / 1024.0))) - - # Output available mem sizes - if swap_total == 0 and mem_used_zram == 0: - log('{}MemAvail: {} M, {} %{}'.format( - psi_avg_string, - human(mem_available, mem_len), - just_percent_mem(mem_available / mem_total), - speed_info - ) - ) - - elif swap_total > 0 and mem_used_zram == 0: - log('{}MemAvail: {} M, {} % | SwapFree: {} M, {} %{}'.format( - psi_avg_string, - human(mem_available, mem_len), - just_percent_mem(mem_available / mem_total), - human(swap_free, swap_len), - just_percent_swap(swap_free / (swap_total + 0.1)), - speed_info - ) - ) - - else: - log('{}MemAvail: {} M, {} % | SwapFree: {} M, {} % | Mem' - 'UsedZram: {} M, {} %{}'.format( - psi_avg_string, - human(mem_available, mem_len), - just_percent_mem(mem_available / mem_total), - human(swap_free, swap_len), - just_percent_swap(swap_free / (swap_total + 0.1)), - human(mem_used_zram, mem_len), - just_percent_mem(mem_used_zram / mem_total), - speed_info - ) - ) - - if (masf_threshold is SIGKILL or zram_threshold is SIGKILL or - psi_threshold is SIGKILL): - - threshold = SIGKILL - mem_info_list = [] - - if masf_info is not None: - mem_info_list.append(masf_info) - - if zram_info is not None: - mem_info_list.append(zram_info) - - if psi_info is not None: - mem_info_list.append(psi_info) - - psi_t0 = implement_corrective_action( - threshold, - mem_info_list, - psi_t0, - psi_kill_exceeded_timer, - psi_term_exceeded_timer, - x0, psi_threshold, zram_threshold, zram_info, psi_info) - continue - - if (masf_threshold is SIGTERM or zram_threshold is SIGTERM or - psi_threshold is SIGTERM): - - threshold = SIGTERM - mem_info_list = [] - - if masf_info is not None: - mem_info_list.append(masf_info) - - if zram_info is not None: - mem_info_list.append(zram_info) - - if psi_info is not None: - mem_info_list.append(psi_info) - - psi_t0 = implement_corrective_action( - threshold, - mem_info_list, - psi_t0, - psi_kill_exceeded_timer, - psi_term_exceeded_timer, - x0, psi_threshold, zram_threshold, zram_info, psi_info) - continue - - if low_memory_warnings_enabled: - - if (masf_threshold == 'WARN' or zram_threshold == 'WARN' or - psi_threshold == 'WARN'): - - warn_time_delta = time() - warn_time_now - warn_time_now = time() - warn_timer += warn_time_delta - if warn_timer > min_post_warning_delay: - - send_notify_warn() - - warn_timer = 0 - - sleep_after_check_mem() diff --git a/old/nohang.conf b/old/nohang.conf deleted file mode 100644 index 1b13348..0000000 --- a/old/nohang.conf +++ /dev/null @@ -1,359 +0,0 @@ - This is nohang config file. - Lines starting with #, tabs and spaces are comments. - Lines starting with @ contain optional parameters. - All values are case sensitive. - Be careful: nohang doesn't forbid you to shoot yourself in the foot. - - The configuration includes the following sections: - - 0. Common zram settings - 1. Memory levels to respond to as an OOM threat - 2. Response on PSI memory metrics - 3. The frequency of checking the level of available memory - (and CPU usage) - 4. The prevention of killing innocent victims - 5. Impact on the badness of processes via matching their names, cgroups and - cmdlines with specified regular expressions - 6. Customize corrective actions: the execution of a specific command - instead of sending the SIGTERM signal - 7. GUI notifications: - - low memory warnings - - OOM prevention results - 8. Output verbosity - 9. Misc - - Just read the description of the parameters and edit the values. - Please restart the program after editing the config. - - More docs will be written later. - -############################################################################### - - 0. Common zram settings - - See https://www.kernel.org/doc/Documentation/blockdev/zram.txt - You maybe need to set `zram_checking_enabled = True` if you has a big zram disksize. - -zram_checking_enabled = False - -############################################################################### - - 1. Thresholds below which a signal should be sent to the victim - - Sets the available memory levels at or below which SIGTERM or SIGKILL - signals are sent. The signal will be sent if MemAvailable and - SwapFree (in /proc/meminfo) at the same time will drop below the - corresponding values. Can be specified in % (percent) and M (MiB). - Valid values are floating-point numbers from the range [0; 100] %. - - MemAvailable levels. - -soft_threshold_min_mem = 8 % -hard_threshold_min_mem = 4 % - - SwapFree levels. - -soft_threshold_min_swap = 10 % -hard_threshold_min_swap = 5 % - - Specifying the total share of zram in memory, if exceeded the - corresponding signals are sent. As the share of zram in memory - increases, it may fall responsiveness of the system. 90 % is a - usual hang level, not recommended to set very high. - - Can be specified in % and M. Valid values are floating-point - numbers from the range [0; 90] %. - -soft_threshold_max_zram = 60 % -hard_threshold_max_zram = 65 % - - -############################################################################### - - 2. Response on PSI memory metrics (it needs Linux 4.20 and up) - - About PSI: - https://facebookmicrosites.github.io/psi/ - - Disabled by default (psi_checking_enabled = False). - -psi_checking_enabled = False - - Choose a path to PSI file. - By default it monitors system-wide file: /proc/pressure/memory - You also can set file to monitor one cgroup slice. - For example: - psi_path = /sys/fs/cgroup/unified/user.slice/memory.pressure - psi_path = /sys/fs/cgroup/unified/system.slice/memory.pressure - psi_path = /sys/fs/cgroup/unified/system.slice/foo.service/memory.pressure - - Execute the command - find /sys/fs/cgroup -name memory.pressure - to find available memory.pressue files (except /proc/pressure/memory). - (actual for cgroup2) - -psi_path = /proc/pressure/memory - - Valid psi_metrics are: - some_avg10 - some_avg60 - some_avg300 - full_avg10 - full_avg60 - full_avg300 - - some_avg10 is most sensitive. - -psi_metrics = some_avg10 - -soft_threshold_max_psi = 60 - -hard_threshold_max_psi = 90 - - >= 0, float -psi_excess_duration = 60 - -psi_post_action_delay = 60 - - -############################################################################### - - 3. The frequency of checking the amount of available memory - (and CPU usage) - - Coefficients that affect the intensity of monitoring. Reducing - the coefficients can reduce CPU usage and increase the periods - between memory checks. - - Why three coefficients instead of one? Because the swap fill rate - is usually lower than the RAM fill rate. - - It is possible to set a lower intensity of monitoring for swap - without compromising to prevent OOM and thus reduce the CPU load. - - Default values are well for desktop. On servers without rapid - fluctuations in memory levels the values can be reduced. - - Valid values are positive floating-point numbers. - -fill_rate_mem = 4000 -fill_rate_swap = 1500 -fill_rate_zram = 6000 - - See also https://github.com/rfjakob/earlyoom/issues/61 - -max_sleep = 3 -min_sleep = 0.1 - - Sleep time if soft threshold exceeded. - -over_sleep = 0.05 - -############################################################################### - - 4. The prevention of killing innocent victims - - Valid values are integers from the range [0; 1000]. - -min_badness = 10 - - Valid values are non-negative floating-point numbers. - Min delay if a victim doesn't respond to SIGTERM in 10 ms. - -post_soft_action_delay = 3 - -post_zombie_delay = 0.1 - -victim_cache_time = 10 - - Valid values are True and False. - -ignore_positive_oom_score_adj = False - -############################################################################### - - 5. Impact on the badness of processes via matching their names, - cmdlines or UIDs with regular expressions using re.search(). - - See https://en.wikipedia.org/wiki/Regular_expression and - https://en.wikipedia.org/wiki/Perl_Compatible_Regular_Expressions - - Enabling this options slows down the search for the victim - because the names, cmdlines or UIDs of all processes - (except init and kthreads) are compared with the - specified regex patterns (in fact slowing down is caused by - reading all /proc/*/cmdline and /proc/*/status files). - - Use script `oom-sort` from nohang package to view - names, cmdlines and UIDs of processes. - - 5.1. Matching process names with RE patterns - - Syntax: - - @BADNESS_ADJ_RE_NAME badness_adj /// RE_pattern - - New badness value will be += badness_adj - - It is possible to compare multiple patterns - with different badness_adj values. - - Example: - @BADNESS_ADJ_RE_NAME -500 /// ^sshd$ - - 5.2. Matching CGroup_v1-line with RE patterns - - @BADNESS_ADJ_RE_CGROUP_V1 -100 /// ^/system\.slice/ - - @BADNESS_ADJ_RE_CGROUP_V1 50 /// /foo\.service$ - - @BADNESS_ADJ_RE_CGROUP_V1 -50 /// ^/user\.slice/ - - 5.3. Matching CGroup_v2-line with RE patterns - - @BADNESS_ADJ_RE_CGROUP_V2 100 /// ^/workload - - 5.4. Matching eUIDs with RE patterns - - @BADNESS_ADJ_RE_UID -100 /// ^0$ - - 5.5. Matching realpath with RE patterns - - @BADNESS_ADJ_RE_REALPATH 20 /// ^/usr/bin/foo - - 5.6. Matching cmdlines with RE patterns - - A good option that allows fine adjustment. - - Prefer chromium tabs and electron-based apps - @BADNESS_ADJ_RE_CMDLINE 200 /// --type=renderer - - Prefer firefox tabs (Web Content and WebExtensions) - @BADNESS_ADJ_RE_CMDLINE 300 /// -appomni - - @BADNESS_ADJ_RE_CMDLINE -200 /// ^/usr/lib/virtualbox - - 5.7. Matching environ with RE patterns - - @BADNESS_ADJ_RE_ENVIRON 100 /// USER=user - - Note that you can control badness also via systemd units via - OOMScoreAdjust, see - www.freedesktop.org/software/systemd/man/systemd.exec.html#OOMScoreAdjust= - -############################################################################### - - 6. Customize corrective actions. - - TODO: docs - - Syntax: - KEY REGEXP SEPARATOR COMMAND - - @SOFT_ACTION_RE_NAME ^foo$ /// kill -SEGV $PID - @SOFT_ACTION_RE_NAME ^bash$ /// kill -9 $PID - - @SOFT_ACTION_RE_CGROUP_V1 ^/system\.slice/ /// systemctl restart $SERVICE - @SOFT_ACTION_RE_CGROUP_V1 /foo\.service$ /// systemctl restart $SERVICE - - $PID will be replaced by process PID. - $NAME will be replaced by process name. - $SERVICE will be replaced by .service if it exists (overwise it will be - relpaced by empty line) - -############################################################################### - - 7. GUI notifications & low memory warnings - -post_action_gui_notifications = False - - Enable GUI notifications about the low level of available memory. - Valid values are True and False. - -low_memory_warnings_enabled = False - - Execute the command instead of sending GUI notifications if the value is - not empty line. For example: - warning_exe = cat /proc/meminfo & - -warning_exe = - - Can be specified in % (percent) and M (MiB). - Valid values are floating-point numbers from the range [0; 100] %. - -warning_threshold_min_mem = 20 % - -warning_threshold_min_swap = 25 % - -warning_threshold_max_zram = 50 % - -warning_threshold_max_psi = 100 - - Valid values are floating-point numbers from the range [1; 300]. - -min_post_warning_delay = 20 - - Ampersands (&) will be replaced with asterisks (*) in process - names and in commands. - -############################################################################### - - 8. Verbosity - - Display the configuration when the program starts. - Valid values are True and False. - -print_config_at_startup = False - - Print memory check results. - Valid values are True and False. - -print_mem_check_results = False - -min_mem_report_interval = 60 - -print_proc_table = False - - Valid values: - None - cgroup_v1 - cgroup_v2 - realpath - cmdline - environ - -extra_table_info = None - -print_victim_status = True - -max_victim_ancestry_depth = 3 - -print_victim_cmdline = False - -print_statistics = True - - Print sleep periods between memory checks. - Valid values are True and False. - -debug_psi = False - -debug_gui_notifications = False - -debug_sleep = False - -separate_log = False - -############################################################################### - - 9. Misc - -max_soft_exit_time = 10 - -post_kill_exe = - -forbid_negative_badness = True - -############################################################################### - - Use cases, feature requests and any questions are welcome: - https://github.com/hakavlad/nohang/issues diff --git a/old/nohang_notify_helper b/old/nohang_notify_helper deleted file mode 100755 index b5beb43..0000000 --- a/old/nohang_notify_helper +++ /dev/null @@ -1,233 +0,0 @@ -#!/usr/bin/env python3 - -# print('Starting nohang_notify_helper') - - -def decoder(string): - """ - """ - decoded = '' - for i in string.split(':'): - decoded += chr(int(i)) - return decoded - - -def write(path, string): - """ - """ - with open(path, 'w') as f: - f.write(string) - - -def rline1(path): - """read 1st line from path.""" - try: - with open(path) as f: - for line in f: - return line - except OSError: - exit(1) - - -def rfile(path): - """read file.""" - with open(path) as f: - return f.read() - - -def re_pid_environ(pid): - """ - read environ of 1 process - returns tuple with USER, DBUS, DISPLAY like follow: - ('user', 'DISPLAY=:0', - 'DBUS_SESSION_BUS_ADDRESS=unix:path=/run/user/1000/bus') - returns None if these vars is not in /proc/[pid]/environ - """ - try: - env = str(rline1('/proc/' + pid + '/environ')) - if display_env in env and dbus_env in env and user_env in env: - env_list = env.split('\x00') - - # iterating over a list of process environment variables - for i in env_list: - if i.startswith(user_env): - user = i - if user == 'USER=root': - return None - continue - - if i.startswith(display_env): - display = i[:10] - continue - - if i.startswith(dbus_env): - dbus = i - continue - - if i.startswith('HOME='): - # exclude Display Manager's user - if i.startswith('HOME=/var'): - return None - - try: - env = user.partition('USER=')[2], display, dbus - except UnboundLocalError: - # print('notify helper: UnboundLocalError') - return None - - return env - - except FileNotFoundError: - # print('notify helper: FileNotFoundError') - return None - except ProcessLookupError: - # print('notify helper: ProcessLookupError') - return None - - -def root_notify_env(): - """return set(user, display, dbus)""" - unsorted_envs_list = [] - # iterates over processes, find processes with suitable env - for pid in listdir('/proc'): - - if path.exists('/proc/' + pid + '/exe') is True: - one_env = re_pid_environ(pid) - unsorted_envs_list.append(one_env) - - env = set(unsorted_envs_list) - env.discard(None) - - # deduplicate dbus - new_env = [] - end = [] - for i in env: - key = i[0] + i[1] - if key not in end: - end.append(key) - new_env.append(i) - else: - continue - - return new_env - - -try: - write('/proc/self/oom_score_adj', '0') -except Exception: - pass - - -try: - from os import listdir, path - from subprocess import Popen, TimeoutExpired - from sys import argv -except OSError: - exit(1) - -if len(argv) == 5: - _, uid, debug, title, body = argv -else: - print('{}: invalid input'.format(argv[0])) - exit(1) - -uid = uid.partition('--euid=')[2] - -debug = debug.partition('--debug=')[2] - -if debug == 'True': - debug = True -else: - debug = False - -title = title.partition('--title=')[2] - -body = decoder(body.partition('--body=')[2]) - -if len(argv) != 5: - print('nohang_notify_helper: invalid input') - exit(1) - - -with open('/proc/meminfo') as f: - for line in f: - if line.startswith('SwapTotal'): - swap_total = int(line.split(':')[1][:-4]) - if swap_total > 0: - wait_time = 15 - else: - wait_time = 3 - - -if debug: - print('nohang_notify_helper: wait_time:', wait_time, 'sec') - - -if uid != '0': - cmd = ['notify-send', '--icon=dialog-warning', title, body] - if debug: - print('nohang_notify_helper: run cmd:', cmd) - with Popen(cmd) as proc: - try: - proc.wait(timeout=wait_time) - except TimeoutExpired: - proc.kill() - if debug: - print('nohang_notify_helper: TimeoutExpired') - exit() - -display_env = 'DISPLAY=' -dbus_env = 'DBUS_SESSION_BUS_ADDRESS=' -user_env = 'USER=' - -list_with_envs = root_notify_env() -list_len = len(list_with_envs) - -# if somebody logged in with GUI -if list_len > 0: - - for i in list_with_envs: - if debug: - print('Send a GUI notification:\n ', - 'title: ', [title], - '\n body: ', [body], - '\n user/env:', i - ) - - # iterating over logged-in users - for i in list_with_envs: - username, display_env, dbus_env = i[0], i[1], i[2] - display_tuple = display_env.partition('=') - dbus_tuple = dbus_env.partition('=') - display_value = display_tuple[2] - dbus_value = dbus_tuple[2] - - try: - with Popen([ - 'sudo', '-u', username, - 'env', - 'DISPLAY=' + display_value, - 'DBUS_SESSION_BUS_ADDRESS=' + dbus_value, - 'notify-send', - '--icon=dialog-warning', - title, - body - ]) as proc: - try: - proc.wait(timeout=wait_time) - except TimeoutExpired: - proc.kill() - print('TimeoutExpired: notify user: ' + username) - except BlockingIOError: - print('nohang_notify_helper: BlockingIOError') - except OSError: - print('nohang_notify_helper: OSError') - except Exception: - print('nohang_notify_helper: CANNOT SPAWN NOTIFY-SEND PROCESS') -else: - if debug: - print( - 'Not send GUI notification: [', - title, - body, - ']. Nobody logged-in with GUI. Nothing to do.')