diff --git a/Makefile b/Makefile index a1c0b0f..2eea755 100644 --- a/Makefile +++ b/Makefile @@ -12,7 +12,6 @@ install: install -d $(DESTDIR)$(BINDIR) install -m0755 nohang $(DESTDIR)$(BINDIR)/nohang - install -m0755 nohang_notify_helper $(DESTDIR)$(BINDIR)/nohang_notify_helper install -m0755 oom-sort $(DESTDIR)$(BINDIR)/oom-sort install -m0755 psi-top $(DESTDIR)$(BINDIR)/psi-top install -m0755 psi-monitor $(DESTDIR)$(BINDIR)/psi-monitor @@ -43,7 +42,6 @@ install-desktop: install -d $(DESTDIR)$(BINDIR) install -m0755 nohang $(DESTDIR)$(BINDIR)/nohang - install -m0755 nohang_notify_helper $(DESTDIR)$(BINDIR)/nohang_notify_helper install -m0755 oom-sort $(DESTDIR)$(BINDIR)/oom-sort install -m0755 psi-top $(DESTDIR)$(BINDIR)/psi-top install -m0755 psi-monitor $(DESTDIR)$(BINDIR)/psi-monitor @@ -76,7 +74,6 @@ uninstall: -systemctl disable nohang.service || true -systemctl daemon-reload rm -fv $(DESTDIR)$(BINDIR)/nohang - rm -fv $(DESTDIR)$(BINDIR)/nohang_notify_helper rm -fv $(DESTDIR)$(BINDIR)/oom-sort rm -fv $(DESTDIR)$(BINDIR)/psi-top rm -fv $(DESTDIR)$(BINDIR)/psi-monitor @@ -95,7 +92,6 @@ systemd: pylint: -pylint3 -E nohang - -pylint3 -E nohang_notify_helper -pylint3 -E oom-sort -pylint3 -E psi-top -pylint3 -E psi-monitor diff --git a/nohang b/nohang index da010ea..4d0e433 100755 --- a/nohang +++ b/nohang @@ -17,6 +17,265 @@ from threading import Thread # define functions +def exe(cmd): + """ execute cmd + """ + log('Execute the command: {}'.format(cmd)) + t0 = time() + write_self_oom_score_adj(self_oom_score_adj_max) + err = os.system(cmd) + write_self_oom_score_adj(self_oom_score_adj_min) + dt = time() - t0 + log('Exit status: {}; exe duration: {} sec'.format(err, round(dt, 3))) + return err + + +def go(func, *a): + """ run func in new thread + """ + t1 = time() + th = Thread(target=func, args=a) + th_name = th.getName() + if debug_threading: + log('Starting {}'.format(th_name)) + try: + th.start() + t2 = time() + if debug_threading: + log('{} has started in {} 
ms'.format( + th_name, round((t2 - t1) * 1000, 1))) + except RuntimeError: + if debug_threading: + log('RuntimeError: cannot start {}'.format(th_name)) + + +def re_pid_environ(pid): + """ + read environ of 1 process + returns tuple with USER, DBUS, DISPLAY like follow: + ('user', 'DISPLAY=:0', + 'DBUS_SESSION_BUS_ADDRESS=unix:path=/run/user/1000/bus') + returns None if these vars is not in /proc/[pid]/environ + """ + + try: + with open('/proc/' + pid + '/environ') as f: + env = f.read() + except FileNotFoundError: + log('notify helper: FileNotFoundError') + return None + except ProcessLookupError: + log('notify helper: ProcessLookupError') + return None + + if display_env in env and dbus_env in env and user_env in env: + + env_list = env.split('\x00') + + # iterating over a list of process environment variables + for i in env_list: + + # exclude Display Manager's user + if i.startswith('HOME=/var'): + return None + + if i.startswith(user_env): + user = i + if user == 'USER=root': + return None + continue + + if i.startswith(display_env): + display = i[:10] + continue + + if i.startswith(dbus_env): + dbus = i + continue + + try: + return user.partition('USER=')[2], display, dbus + except UnboundLocalError: + log('notify helper: UnboundLocalError') + return None + + +def root_notify_env(): + """return set(user, display, dbus)""" + unsorted_envs_list = [] + # iterates over processes, find processes with suitable env + for pid in os.listdir('/proc'): + + if os.path.exists('/proc/' + pid + '/exe') is True: + one_env = re_pid_environ(pid) + unsorted_envs_list.append(one_env) + + env = set(unsorted_envs_list) + env.discard(None) + + # deduplicate dbus + new_env = [] + end = [] + for i in env: + key = i[0] + i[1] + if key not in end: + end.append(key) + new_env.append(i) + else: + continue + + return new_env + + +def pop(cmd, username): + """ + """ + if swap_total == 0: + wait_time = 2 + else: + wait_time = 20 + + t3 = time() + + with Popen(cmd) as proc: + try: + 
proc.wait(timeout=wait_time) + except TimeoutExpired: + proc.kill() + if debug_gui_notifications: + log('TimeoutExpired: notify user: {}'.format(username)) + + t4 = time() + + if debug_gui_notifications: + log('Popen time: {} sec; cmd: {}'.format(round(t4 - t3, 3), cmd)) + + +def send_notification(title, body): + """ + """ + if self_uid != 0: + cmd = ['notify-send', '--icon=dialog-warning', title, body] + username = '(UID={})'.format(self_uid) + pop(cmd, username) + return None + + t1 = time() + + if envd['t'] is None: + + list_with_envs = root_notify_env() + envd['list_with_envs'] = list_with_envs + envd['t'] = time() + + elif time() - envd['t'] > env_cache_time: + + list_with_envs = root_notify_env() + envd['list_with_envs'] = list_with_envs + envd['t'] = time() + + else: + + list_with_envs = envd['list_with_envs'] + + list_len = len(list_with_envs) + + t2 = time() + if debug_gui_notifications: + log('Find env time: {} ms'.format(round((t2 - t1) * 1000))) + + # if somebody logged in with GUI + if list_len > 0: + + for i in list_with_envs: + if debug_gui_notifications: + log('Send a GUI notification:\n ', + 'title: ', [title], + '\n body: ', [body], + '\n user/env:', i + ) + + # iterating over logged-in users + for i in list_with_envs: + username, display_env, dbus_env = i[0], i[1], i[2] + display_tuple = display_env.partition('=') + dbus_tuple = dbus_env.partition('=') + display_value = display_tuple[2] + dbus_value = dbus_tuple[2] + + cmd = [ + 'sudo', '-u', username, + 'env', + 'DISPLAY=' + display_value, + 'DBUS_SESSION_BUS_ADDRESS=' + dbus_value, + 'notify-send', + '--icon=dialog-warning', + title, + body + ] + + go(pop, cmd, username) + + +def send_notify_warn(): + """ Implement Low memory warnings + """ + log('Warning threshold exceeded') + + if check_warning_exe: + go(exe, warning_exe) + + else: + + title = 'Low memory' + + body = 'MemAvail: {}%\nSwapFree: {}%'.format( + round(mem_available / mem_total * 100), + round(swap_free / (swap_total + 0.1) * 100) 
+ ) + + go(send_notification, title, body) + + +def send_notify(threshold, name, pid): + """ + Notificate about OOM Preventing. + + threshold: key for notify_sig_dict + name: str process name + pid: str process pid + """ + + title = 'Freeze prevention' + body = '{} [{}] {}'.format( + notify_sig_dict[threshold], + pid, + name.replace( + # symbol '&' can break notifications in some themes, + # therefore it is replaced by '*' + '&', '*' + ) + ) + + go(send_notification, title, body) + + +def send_notify_etc(pid, name, command): + """ + Notificate about OOM Preventing. + + command: str command that will be executed + name: str process name + pid: str process pid + """ + title = 'Freeze prevention' + body = 'Victim is [{}] {}\nExecute the co' \ + 'mmand:\n{}'.format( + pid, name.replace('&', '*'), command.replace('&', '*')) + + go(send_notification, title, body) + + def check_config(): """ """ @@ -181,15 +440,6 @@ def check_config(): exit() -def encoder(string): - """ - """ - encoded = '' - for i in string: - encoded += str(ord(i)) + ':' - return encoded[:-1] - - def get_swap_threshold_tuple(string): # re (Num %, True) or (Num KiB, False) """Returns KiB value if abs val was set in config, or tuple with %""" @@ -292,36 +542,6 @@ def signal_handler_inner(signum, frame): sig_dict[signum])) -def exe(cmd): - """ - """ - - log('Execute the command: {}'.format(cmd)) - t0 = time() - write_self_oom_score_adj(self_oom_score_adj_max) - err = os.system(cmd) - write_self_oom_score_adj(self_oom_score_adj_min) - dt = time() - t0 - log('Exit status: {}; exe duration: {} sec'.format(err, round(dt, 3))) - return err - - -def go(func, *a): - """ run func in new thread - """ - t1 = time() - try: - Thread(target=func, args=a).start() - except RuntimeError: - print('RuntimeError: cannot spawn a new thread') - return 1 - t2 = time() - log('New thread spawned in {} ms'.format( - round((t2 - t1) * 1000, 1) - )) - return 0 - - def write(path, string): """ """ @@ -357,15 +577,9 @@ def 
func_print_proc_table(): def log(*msg): """ """ - try: - print(*msg) - except OSError: - sleep(0.01) + print(*msg) if separate_log: - try: - logging.info(*msg) - except OSError: - sleep(0.01) + logging.info(*msg) def print_version(): @@ -1009,80 +1223,6 @@ def zram_stat(zram_id): return disksize, mem_used_total # BYTES, str -def send_notify_warn(): - """ - Look for process with maximum 'badness' and warn user with notification. - (implement Low memory warnings) - """ - log('Warning threshold exceeded') - - if check_warning_exe: - exe(warning_exe) - - else: - - title = 'Low memory' - - body = 'MemAvail: {}%\nSwapFree: {}%'.format( - round(mem_available / mem_total * 100), - round(swap_free / (swap_total + 0.1) * 100) - ) - - send_notification(title, body) - - -def send_notify(threshold, name, pid): - """ - Notificate about OOM Preventing. - - threshold: key for notify_sig_dict - name: str process name - pid: str process pid - """ - - title = 'Freeze prevention' - body = '{} [{}] {}'.format( - notify_sig_dict[threshold], - pid, - name.replace( - # symbol '&' can break notifications in some themes, - # therefore it is replaced by '*' - '&', '*' - ) - ) - - send_notification(title, body) - - -def send_notify_etc(pid, name, command): - """ - Notificate about OOM Preventing. 
- - command: str command that will be executed - name: str process name - pid: str process pid - """ - title = 'Freeze prevention' - body = 'Victim is [{}] {}\nExecute the co' \ - 'mmand:\n{}'.format( - pid, name.replace('&', '*'), command.replace('&', '*')) - - send_notification(title, body) - - -def send_notification(title, body): - """ - """ - cmd = '{} "--euid={}" "--debug={}" "--title={}" "--body={}" &'.format( - notify_helper_path, - self_uid, - debug_gui_notifications, - title, - encoder(body)) - - go(exe, cmd) - - def get_pid_list(): """ Find pid list expect kthreads and zombies @@ -2163,11 +2303,7 @@ def sleep_after_check_mem(): log('Sleep {} sec (t_mem={}, t_swap={}{})'.format(round(t, 2), round( t_mem, 2), round(t_swap, 2), z)) - try: - stdout.flush() - except OSError: - pass - + stdout.flush() sleep(t) @@ -2631,10 +2767,19 @@ print_config_at_startup = conf_parse_bool('print_config_at_startup') print_mem_check_results = conf_parse_bool('print_mem_check_results') debug_sleep = conf_parse_bool('debug_sleep') low_memory_warnings_enabled = conf_parse_bool('low_memory_warnings_enabled') + + +if low_memory_warnings_enabled or post_action_gui_notifications: + from subprocess import Popen, TimeoutExpired + + post_action_gui_notifications = conf_parse_bool( 'post_action_gui_notifications') +debug_threading = conf_parse_bool('debug_threading') + + psi_checking_enabled = conf_parse_bool('psi_checking_enabled') ignore_psi = not psi_checking_enabled @@ -2694,6 +2839,20 @@ else: exit(1) +if 'env_cache_time' in config_dict: + env_cache_time = string_to_float_convert_test( + config_dict['env_cache_time']) + if env_cache_time is None: + errprint('Invalid env_cache_time value, not float\nExit') + exit(1) + if env_cache_time < 0: + errprint('fill_rate_mem MUST be >= 0\nExit') + exit(1) +else: + errprint('fill_rate_mem not in config\nExit') + exit(1) + + if 'fill_rate_mem' in config_dict: fill_rate_mem = string_to_float_convert_test(config_dict['fill_rate_mem']) if 
fill_rate_mem is None: @@ -3201,6 +3360,14 @@ log('Monitoring has started!') stdout.flush() +display_env = 'DISPLAY=' +dbus_env = 'DBUS_SESSION_BUS_ADDRESS=' +user_env = 'USER=' + +envd = dict() +envd['list_with_envs'] = envd['t'] = None + + ########################################################################## diff --git a/nohang-desktop.conf b/nohang-desktop.conf index e6d127c..7144fc1 100644 --- a/nohang-desktop.conf +++ b/nohang-desktop.conf @@ -297,6 +297,9 @@ warning_threshold_max_psi = 100 min_post_warning_delay = 30 +env_cache_time = 300 + + Ampersands (&) will be replaced with asterisks (*) in process names and in commands. @@ -347,6 +350,8 @@ debug_sleep = False separate_log = False +debug_threading = False + ############################################################################### 9. Misc diff --git a/nohang.conf b/nohang.conf index 1b13348..b36e0e9 100644 --- a/nohang.conf +++ b/nohang.conf @@ -293,6 +293,8 @@ warning_threshold_max_psi = 100 min_post_warning_delay = 20 +env_cache_time = 300 + Ampersands (&) will be replaced with asterisks (*) in process names and in commands. @@ -343,6 +345,8 @@ debug_sleep = False separate_log = False +debug_threading = False + ############################################################################### 9. Misc diff --git a/old/nohang b/old/nohang new file mode 100755 index 0000000..da010ea --- /dev/null +++ b/old/nohang @@ -0,0 +1,3360 @@ +#!/usr/bin/env python3 +"""A daemon that prevents OOM in Linux systems.""" + +import os +from ctypes import CDLL +from time import sleep, time +from operator import itemgetter +from sys import stdout, stderr, argv, exit +from re import search +from sre_constants import error as invalid_re +from signal import signal, SIGKILL, SIGTERM, SIGINT, SIGQUIT, SIGHUP +from threading import Thread + + +########################################################################## + +# define functions + + +def check_config(): + """ + """ + + log('#' * 79) + + log('0. 
Common zram settings') + + log(' zram_checking_enabled: {}'.format(zram_checking_enabled)) + + log('1. Thresholds below which a signal should be sent to the victim') + + log(' soft_threshold_min_mem: {} MiB, {} %'.format( + round(soft_threshold_min_mem_mb), round(soft_threshold_min_mem_percent, 1))) + log(' hard_threshold_min_mem: {} MiB, {} %'.format( + round(hard_threshold_min_mem_mb), round(hard_threshold_min_mem_percent, 1))) + log(' soft_threshold_min_swap: {}'.format(soft_threshold_min_swap)) + log(' hard_threshold_min_swap: {}'.format(hard_threshold_min_swap)) + log(' soft_threshold_max_zram: {} MiB, {} %'.format( + round(soft_threshold_max_zram_mb), round(soft_threshold_max_zram_percent, 1))) + log(' hard_threshold_max_zram: {} MiB, {} %'.format( + round(hard_threshold_max_zram_mb), round(hard_threshold_max_zram_percent, 1))) + + log('2. Response on PSI memory metrics') + + log(' psi_checking_enabled: {}'.format(psi_checking_enabled)) + log(' psi_path: {}'.format(psi_path)) + log(' psi_metrics: {}'.format(psi_metrics)) + log(' soft_threshold_max_psi: {}'.format(soft_threshold_max_psi)) + log(' hard_threshold_max_psi: {}'.format(hard_threshold_max_psi)) + log(' psi_excess_duration: {} sec'.format(psi_excess_duration)) + log(' psi_post_action_delay: {} sec'.format(psi_post_action_delay)) + + log('3. The frequency of checking the amount of available memory') + + log(' fill_rate_mem: {}'.format(fill_rate_mem)) + log(' fill_rate_swap: {}'.format(fill_rate_swap)) + log(' fill_rate_zram: {}'.format(fill_rate_zram)) + log(' max_sleep: {} sec'.format(max_sleep)) + log(' min_sleep: {} sec'.format(min_sleep)) + log(' over_sleep: {} sec'.format(over_sleep)) + + log('4. 
The prevention of killing innocent victims') + + log(' min_badness: {}'.format(min_badness)) + log(' post_soft_action_delay: {} sec'.format(post_soft_action_delay)) + log(' post_zombie_delay: {} sec'.format(post_zombie_delay)) + log(' victim_cache_time: {} sec'.format(victim_cache_time)) + log(' ignore_positive_oom_score_adj: {}'.format( + ignore_positive_oom_score_adj)) + + log('5. Impact on the badness of processes') + + log('5.1. Matching process names with RE patterns') + if len(badness_adj_re_name_list) > 0: + log(' regexp: badness_adj:') + for i in badness_adj_re_name_list: + log(' {} {}'.format(i[1], i[0])) + else: + log(' (not set)') + + log('5.2. Matching CGroup_v1-line with RE patterns') + if len(badness_adj_re_cgroup_v1_list) > 0: + log(' regexp: badness_adj:') + for i in badness_adj_re_cgroup_v1_list: + log(' {} {}'.format(i[1], i[0])) + else: + log(' (not set)') + + log('5.3. Matching CGroup_v2-line with RE patterns') + if len(badness_adj_re_cgroup_v2_list) > 0: + log(' regexp: badness_adj:') + for i in badness_adj_re_cgroup_v1_list: + log(' {} {}'.format(i[1], i[0])) + else: + log(' (not set)') + + log('5.4. Matching eUIDs with RE patterns') + if len(badness_adj_re_cgroup_v2_list) > 0: + log(' regexp: badness_adj:') + for i in badness_adj_re_uid_list: + log(' {} {}'.format(i[1], i[0])) + else: + log(' (not set)') + + log('5.5. Matching realpath with RE patterns') + if len(badness_adj_re_cgroup_v2_list) > 0: + log(' regexp: badness_adj:') + for i in badness_adj_re_realpath_list: + log(' {} {}'.format(i[1], i[0])) + else: + log(' (not set)') + + log('5.6. Matching cmdlines with RE patterns') + if len(badness_adj_re_cgroup_v2_list) > 0: + log(' regexp: badness_adj:') + for i in badness_adj_re_cmdline_list: + log(' {} {}'.format(i[1], i[0])) + else: + log(' (not set)') + + log('5.7. 
Matching environ with RE patterns') + if len(badness_adj_re_cgroup_v2_list) > 0: + log(' regexp: badness_adj:') + for i in badness_adj_re_environ_list: + log(' {} {}'.format(i[1], i[0])) + else: + log(' (not set)') + + log('6. Customize corrective actions') + + if len(soft_actions_list) > 0: + log(' Match by: regexp: command: ') + for i in soft_actions_list: + log(' {} {} {}'.format(i[0], i[1], i[2])) + else: + log(' (not set)') + + log('7. GUI notifications') + + log(' post_action_gui_notifications: {}'.format( + post_action_gui_notifications)) + log(' low_memory_warnings_enabled: {}'.format( + low_memory_warnings_enabled)) + log(' warning_exe: {}'.format(warning_exe)) + log(' warning_threshold_min_mem: {} MiB, {} %'.format(round( + warning_threshold_min_mem_mb), round(warning_threshold_min_mem_percent, 1))) + log(' warning_threshold_min_swap: {}'.format(warning_threshold_min_swap)) + log(' warning_threshold_max_zram: {} MiB, {} %'.format(round( + warning_threshold_max_zram_mb), round(warning_threshold_max_zram_percent, 1))) + log(' warning_threshold_max_psi: {}'.format(warning_threshold_max_psi)) + log(' min_post_warning_delay: {} sec'.format(min_post_warning_delay)) + + log('8. 
Verbosity') + + log(' print_config_at_startup: {}'.format(print_config_at_startup)) + log(' print_mem_check_results: {}'.format(print_mem_check_results)) + log(' min_mem_report_interval: {} sec'.format(min_mem_report_interval)) + log(' debug_sleep: {}'.format(debug_sleep)) + log(' print_statistics: {}'.format(print_statistics)) + log(' print_proc_table: {}'.format(print_proc_table)) + log(' extra_table_info: {}'.format(extra_table_info)) + log(' print_victim_status: {}'.format(print_victim_status)) + log(' print_victim_cmdline: {}'.format(print_victim_cmdline)) + log(' max_victim_ancestry_depth: {}'.format(max_victim_ancestry_depth)) + log(' debug_gui_notifications: {}'.format(debug_gui_notifications)) + log(' separate_log: {}'.format(separate_log)) + log(' debug_psi: {}'.format(debug_psi)) + + log('9. Misc') + + log(' max_soft_exit_time: {} sec'.format(max_soft_exit_time)) + log(' post_kill_exe: {}'.format(post_kill_exe)) + log(' forbid_negative_badness: {}'.format( + forbid_negative_badness)) + + # log(': {}'.format()) + log('#' * 79) + + if check_config_flag: + log('config is OK') + exit() + + +def encoder(string): + """ + """ + encoded = '' + for i in string: + encoded += str(ord(i)) + ':' + return encoded[:-1] + + +def get_swap_threshold_tuple(string): + # re (Num %, True) or (Num KiB, False) + """Returns KiB value if abs val was set in config, or tuple with %""" + # return tuple with abs and bool: (abs %, True) or (abs MiB, False) + + if string.endswith('%'): + valid = string_to_float_convert_test(string[:-1]) + if valid is None: + errprint('somewhere swap unit is not float_%') + exit(1) + + value = float(string[:-1].strip()) + if value < 0 or value > 100: + errprint('invalid value, must be from the range[0; 100] %') + exit(1) + + return value, True + + elif string.endswith('M'): + valid = string_to_float_convert_test(string[:-1]) + if valid is None: + errprint('somewhere swap unit is not float_M') + exit(1) + + value = float(string[:-1].strip()) * 1024 + if 
value < 0: + errprint('invalid unit in config (negative value)') + exit(1) + + return value, False + + else: + errprint( + 'Invalid config file. There are invalid units somewhere\nExit') + exit(1) + + +def find_cgroup_indexes(): + """ Find cgroup-line positions in /proc/*/cgroup file. + """ + + cgroup_v1_index = cgroup_v2_index = None + + with open('/proc/self/cgroup') as f: + for index, line in enumerate(f): + if ':name=' in line: + cgroup_v1_index = index + if line.startswith('0::'): + cgroup_v2_index = index + + return cgroup_v1_index, cgroup_v2_index + + +def pid_to_rss(pid): + """ + """ + try: + rss = int(rline1( + '/proc/{}/statm'.format(pid)).split(' ')[1]) * SC_PAGESIZE + except IndexError: + rss = None + except FileNotFoundError: + rss = None + except ProcessLookupError: + rss = None + return rss + + +def pid_to_vm_size(pid): + """ + """ + try: + vm_size = int(rline1( + '/proc/{}/statm'.format(pid)).partition(' ')[0]) * SC_PAGESIZE + except IndexError: + vm_size = None + except FileNotFoundError: + vm_size = None + except ProcessLookupError: + vm_size = None + return vm_size + + +def signal_handler(signum, frame): + """ + """ + for i in sig_list: + signal(i, signal_handler_inner) + log('Signal handler called with the {} signal '.format( + sig_dict[signum])) + update_stat_dict_and_print(None) + log('Exit') + exit() + + +def signal_handler_inner(signum, frame): + """ + """ + log('Signal handler called with the {} signal (ignored) '.format( + sig_dict[signum])) + + +def exe(cmd): + """ + """ + + log('Execute the command: {}'.format(cmd)) + t0 = time() + write_self_oom_score_adj(self_oom_score_adj_max) + err = os.system(cmd) + write_self_oom_score_adj(self_oom_score_adj_min) + dt = time() - t0 + log('Exit status: {}; exe duration: {} sec'.format(err, round(dt, 3))) + return err + + +def go(func, *a): + """ run func in new thread + """ + t1 = time() + try: + Thread(target=func, args=a).start() + except RuntimeError: + print('RuntimeError: cannot spawn a new 
thread') + return 1 + t2 = time() + log('New thread spawned in {} ms'.format( + round((t2 - t1) * 1000, 1) + )) + return 0 + + +def write(path, string): + """ + """ + with open(path, 'w') as f: + f.write(string) + + +def write_self_oom_score_adj(new_value): + """ + """ + if root: + write('/proc/self/oom_score_adj', new_value) + + +def valid_re(reg_exp): + """Validate regular expression. + """ + try: + search(reg_exp, '') + except invalid_re: + log('Invalid config: invalid regexp: {}'.format(reg_exp)) + exit(1) + + +def func_print_proc_table(): + """ + """ + print_proc_table = True + find_victim(print_proc_table) + exit() + + +def log(*msg): + """ + """ + try: + print(*msg) + except OSError: + sleep(0.01) + if separate_log: + try: + logging.info(*msg) + except OSError: + sleep(0.01) + + +def print_version(): + """ + """ + try: + v = rline1('/etc/nohang/version') + except FileNotFoundError: + v = None + if v is None: + print('nohang unknown version') + else: + print('nohang ' + v) + exit() + + +def pid_to_cgroup_v1(pid): + """ + """ + cgroup_v1 = '' + try: + with open('/proc/' + pid + '/cgroup') as f: + for index, line in enumerate(f): + if index == cgroup_v1_index: + cgroup_v1 = '/' + line.partition('/')[2][:-1] + return cgroup_v1 + except FileNotFoundError: + return '' + + +def pid_to_cgroup_v2(pid): + """ + """ + cgroup_v2 = '' + try: + with open('/proc/' + pid + '/cgroup') as f: + for index, line in enumerate(f): + if index == cgroup_v2_index: + cgroup_v2 = line[3:-1] + return cgroup_v2 + except FileNotFoundError: + return '' + + +def pid_to_starttime(pid): + """ handle FNF error! 
+ """ + try: + starttime = rline1('/proc/' + pid + '/stat').rpartition(')')[ + 2].split(' ')[20] + + except UnicodeDecodeError: + with open('/proc/' + pid + '/stat', 'rb') as f: + starttime = f.read().decode('utf-8', 'ignore').rpartition( + ')')[2].split(' ')[20] + + return float(starttime) / SC_CLK_TCK + + +def get_victim_id(pid): + """victim_id is starttime + pid""" + try: + return rline1('/proc/' + pid + '/stat').rpartition( + ')')[2].split(' ')[20] + '_pid' + pid + except FileNotFoundError: + return '' + except ProcessLookupError: + return '' + + +def pid_to_state(pid): + """ + """ + try: + with open('/proc/' + pid + '/stat', 'rb') as f: + return f.read(40).decode('utf-8', 'ignore').rpartition(')')[2][1] + except FileNotFoundError: + return '' + except ProcessLookupError: + return '' + except IndexError: + with open('/proc/' + pid + '/stat', 'rb') as f: + return f.read().decode('utf-8', 'ignore').rpartition(')')[2][1] + + +def pid_to_name(pid): + """ + """ + try: + with open('/proc/' + pid + '/comm', 'rb') as f: + return f.read().decode('utf-8', 'ignore')[:-1] + except FileNotFoundError: + return '' + except ProcessLookupError: + return '' + + +def pid_to_ppid(pid): + """ + """ + try: + with open('/proc/' + pid + '/status') as f: + for n, line in enumerate(f): + if n is ppid_index: + return line.split('\t')[1].strip() + except FileNotFoundError: + return '' + except ProcessLookupError: + return '' + except UnicodeDecodeError: + with open('/proc/' + pid + '/status', 'rb') as f: + f_list = f.read().decode('utf-8', 'ignore').split('\n') + for i in range(len(f_list)): + if i is ppid_index: + return f_list[i].split('\t')[1] + + +def pid_to_ancestry(pid, max_victim_ancestry_depth=1): + """ + """ + if max_victim_ancestry_depth == 1: + ppid = pid_to_ppid(pid) + pname = pid_to_name(ppid) + return '\n PPID: {} ({})'.format(ppid, pname) + if max_victim_ancestry_depth == 0: + return '' + anc_list = [] + for i in range(max_victim_ancestry_depth): + ppid = pid_to_ppid(pid) + 
pname = pid_to_name(ppid) + anc_list.append((ppid, pname)) + if ppid == '1': + break + pid = ppid + a = '' + for i in anc_list: + a = a + ' <= PID {} ({})'.format(i[0], i[1]) + return '\n Ancestry: ' + a[4:] + + +def pid_to_cmdline(pid): + """ + Get process cmdline by pid. + + pid: str pid of required process + returns string cmdline + """ + try: + with open('/proc/' + pid + '/cmdline') as f: + return f.read().replace('\x00', ' ').rstrip() + except FileNotFoundError: + return '' + + +def pid_to_environ(pid): + """ + Get process environ by pid. + + pid: str pid of required process + returns string environ + """ + try: + with open('/proc/' + pid + '/environ') as f: + return f.read().replace('\x00', ' ').rstrip() + except FileNotFoundError: + return '' + + +def pid_to_realpath(pid): + """ + """ + try: + return os.path.realpath('/proc/' + pid + '/exe') + except FileNotFoundError: + return '' + + +def pid_to_uid(pid): + """return euid""" + try: + with open('/proc/' + pid + '/status') as f: + for n, line in enumerate(f): + if n is uid_index: + return line.split('\t')[2] + except UnicodeDecodeError: + with open('/proc/' + pid + '/status', 'rb') as f: + f_list = f.read().decode('utf-8', 'ignore').split('\n') + return f_list[uid_index].split('\t')[2] + except FileNotFoundError: + return '' + + +def pid_to_badness(pid): + """Find and modify badness (if it needs).""" + + try: + + oom_score = int(rline1('/proc/' + pid + '/oom_score')) + badness = oom_score + + if ignore_positive_oom_score_adj: + oom_score_adj = int(rline1('/proc/' + pid + '/oom_score_adj')) + if oom_score_adj > 0: + badness = badness - oom_score_adj + + if regex_matching: + name = pid_to_name(pid) + for re_tup in badness_adj_re_name_list: + if search(re_tup[1], name) is not None: + badness += int(re_tup[0]) + + if re_match_cgroup_v1: + cgroup_v1 = pid_to_cgroup_v1(pid) + for re_tup in badness_adj_re_cgroup_v1_list: + if search(re_tup[1], cgroup_v1) is not None: + badness += int(re_tup[0]) + + if 
re_match_cgroup_v2: + cgroup_v2 = pid_to_cgroup_v2(pid) + for re_tup in badness_adj_re_cgroup_v2_list: + if search(re_tup[1], cgroup_v2) is not None: + badness += int(re_tup[0]) + + if re_match_realpath: + realpath = pid_to_realpath(pid) + for re_tup in badness_adj_re_realpath_list: + if search(re_tup[1], realpath) is not None: + badness += int(re_tup[0]) + + if re_match_cmdline: + cmdline = pid_to_cmdline(pid) + for re_tup in badness_adj_re_cmdline_list: + if search(re_tup[1], cmdline) is not None: + badness += int(re_tup[0]) + + if re_match_environ: + environ = pid_to_environ(pid) + for re_tup in badness_adj_re_environ_list: + if search(re_tup[1], environ) is not None: + badness += int(re_tup[0]) + + if re_match_uid: + uid = pid_to_uid(pid) + for re_tup in badness_adj_re_uid_list: + if search(re_tup[1], uid) is not None: + badness += int(re_tup[0]) + + if forbid_negative_badness: + if badness < 0: + badness = 0 + + return badness, oom_score + + except FileNotFoundError: + return None, None + except ProcessLookupError: + return None, None + + +def pid_to_status(pid): + """ + """ + + try: + + with open('/proc/' + pid + '/status') as f: + + for n, line in enumerate(f): + + if n == 0: + name = line.split('\t')[1][:-1] + + if n is state_index: + state = line.split('\t')[1][0] + continue + + if n is ppid_index: + ppid = line.split('\t')[1][:-1] + continue + + if n is uid_index: + uid = line.split('\t')[2] + continue + + if n is vm_size_index: + vm_size = kib_to_mib(int(line.split('\t')[1][:-4])) + continue + + if n is vm_rss_index: + vm_rss = kib_to_mib(int(line.split('\t')[1][:-4])) + continue + + if n is vm_swap_index: + vm_swap = kib_to_mib(int(line.split('\t')[1][:-4])) + break + + return name, state, ppid, uid, vm_size, vm_rss, vm_swap + + except UnicodeDecodeError: + return pid_to_status_unicode(pid) + + except FileNotFoundError: + return None + + except ProcessLookupError: + return None + + except ValueError: + return None + + +def pid_to_status_unicode(pid): + 
""" + """ + try: + + with open('/proc/' + pid + '/status', 'rb') as f: + f_list = f.read().decode('utf-8', 'ignore').split('\n') + + for i in range(len(f_list)): + + if i == 0: + name = f_list[i].split('\t')[1] + + if i is state_index: + state = f_list[i].split('\t')[1][0] + + if i is ppid_index: + ppid = f_list[i].split('\t')[1] + + if i is uid_index: + uid = f_list[i].split('\t')[2] + + if i is vm_size_index: + vm_size = kib_to_mib( + int(f_list[i].split('\t')[1][:-3])) + + if i is vm_rss_index: + vm_rss = kib_to_mib(int(f_list[i].split('\t')[1][:-3])) + + if i is vm_swap_index: + vm_swap = kib_to_mib(int(f_list[i].split('\t')[1][:-3])) + + return name, state, ppid, uid, vm_size, vm_rss, vm_swap + + except FileNotFoundError: + return None + + except ProcessLookupError: + return None + + except ValueError: + return None + + +def uptime(): + """ + """ + return float(rline1('/proc/uptime').split(' ')[0]) + + +def errprint(*text): + """ + """ + print(*text, file=stderr, flush=True) + + +def mlockall(): + """Lock all memory to prevent swapping nohang process.""" + + MCL_CURRENT = 1 + MCL_FUTURE = 2 + MCL_ONFAULT = 4 + + libc = CDLL('libc.so.6', use_errno=True) + + result = libc.mlockall( + MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT + ) + if result != 0: + result = libc.mlockall( + MCL_CURRENT | MCL_FUTURE + ) + if result != 0: + log('WARNING: cannot lock all memory') + else: + pass + # log('All memory locked with MCL_CURRENT | MCL_FUTURE') + else: + pass + # log('All memory locked with MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT') + + +def update_stat_dict_and_print(key): + """ + """ + + if key is not None: + + if key not in stat_dict: + + stat_dict.update({key: 1}) + + else: + + new_value = stat_dict[key] + 1 + stat_dict.update({key: new_value}) + + if print_statistics: + + stats_msg = 'Total stat (what happened in the last {}):'.format( + format_time(time() - start_time)) + + for i in stat_dict: + stats_msg += '\n {}: {}'.format(i, stat_dict[i]) + + log(stats_msg) + + +def 
find_psi_metrics_value(psi_path, psi_metrics): + """ + """ + + if psi_support: + + if psi_metrics == 'some_avg10': + return float(rline1(psi_path).split(' ')[1].split('=')[1]) + if psi_metrics == 'some_avg60': + return float(rline1(psi_path).split(' ')[2].split('=')[1]) + if psi_metrics == 'some_avg300': + return float(rline1(psi_path).split(' ')[3].split('=')[1]) + + if psi_metrics == 'full_avg10': + with open(psi_path) as f: + psi_list = f.readlines() + return float(psi_list[1].split(' ')[1].split('=')[1]) + if psi_metrics == 'full_avg60': + with open(psi_path) as f: + psi_list = f.readlines() + return float(psi_list[1].split(' ')[2].split('=')[1]) + if psi_metrics == 'full_avg300': + with open(psi_path) as f: + psi_list = f.readlines() + return float(psi_list[1].split(' ')[3].split('=')[1]) + + +def check_mem_and_swap(): + """find mem_available, swap_total, swap_free""" + with open('/proc/meminfo') as f: + for n, line in enumerate(f): + if n == 2: + mem_available = int(line.split(':')[1][:-4]) + continue + if n is swap_total_index: + swap_total = int(line.split(':')[1][:-4]) + continue + if n is swap_free_index: + swap_free = int(line.split(':')[1][:-4]) + break + return mem_available, swap_total, swap_free + + +def check_zram(): + """find MemUsedZram""" + disksize_sum = 0 + mem_used_total_sum = 0 + + for dev in os.listdir('/sys/block'): + if dev.startswith('zram'): + stat = zram_stat(dev) + disksize_sum += int(stat[0]) + mem_used_total_sum += int(stat[1]) + + # Means that when setting zram disksize = 1 GiB available memory + # decrease by 0.0042 GiB. + # Found experimentally, requires clarification with different kernaels and + # architectures. + # On small disk drives (up to gigabyte) it can be more, up to 0.0045. 
+ # The creator of the zram module claims that ZRAM_DISKSIZE_FACTOR should + # be 0.001: + # ("zram uses about 0.1% of the size of the disk" + # - https://www.kernel.org/doc/Documentation/blockdev/zram.txt), + # but this statement contradicts the experimental data. + # ZRAM_DISKSIZE_FACTOR = deltaMemAvailavle / disksize + # Found experimentally. + ZRAM_DISKSIZE_FACTOR = 0.0042 + + return (mem_used_total_sum + disksize_sum * ZRAM_DISKSIZE_FACTOR) / 1024.0 + + +''' +def format_time(t): + t = int(t) + if t < 60: + return '{} sec'.format(t) + if t >= 60 and t < 3600: + m = t // 60 + s = t % 60 + return '{} min {} sec'.format(m, s) + h = t // 3600 + s0 = t - h * 3600 + m = s0 // 60 + s = s0 % 60 + return '{} h {} min {} sec'.format(h, m, s) +''' + + +def format_time(t): + t = int(t) + + if t < 60: + return '{} sec'.format(t) + + if t > 3600: + h = t // 3600 + s0 = t - h * 3600 + m = s0 // 60 + s = s0 % 60 + return '{} h {} min {} sec'.format(h, m, s) + + m = t // 60 + s = t % 60 + return '{} min {} sec'.format(m, s) + + +def string_to_float_convert_test(string): + """Try to interprete string values as floats.""" + try: + return float(string) + except ValueError: + return None + + +def string_to_int_convert_test(string): + """Try to interpret string values as integers.""" + try: + return int(string) + except ValueError: + return None + + +def conf_parse_string(param): + """ + Get string parameters from the config dict. + + param: config_dict key + returns config_dict[param].strip() + """ + if param in config_dict: + return config_dict[param].strip() + else: + errprint('All the necessary parameters must be in the config') + errprint('There is no "{}" parameter in the config'.format(param)) + exit(1) + + +def conf_parse_bool(param): + """ + Get bool parameters from the config_dict. 
+ + param: config_dicst key + returns bool + """ + if param in config_dict: + param_str = config_dict[param] + if param_str == 'True': + return True + elif param_str == 'False': + return False + else: + errprint('Invalid value of the "{}" parameter.'.format(param)) + errprint('Valid values are True and False.') + errprint('Exit') + exit(1) + else: + errprint('All the necessary parameters must be in the config') + errprint('There is no "{}" parameter in the config'.format(param)) + exit(1) + + +def rline1(path): + """read 1st line from path.""" + try: + with open(path) as f: + for line in f: + return line[:-1] + except UnicodeDecodeError: + with open(path, 'rb') as f: + return f.read(999).decode( + 'utf-8', 'ignore').split('\n')[0] # use partition()! + + +def kib_to_mib(num): + """Convert KiB values to MiB values.""" + return round(num / 1024.0) + + +def percent(num): + """Interprete num as percentage.""" + return round(num * 100, 1) + + +def just_percent_mem(num): + """convert num to percent and justify""" + return str(round(num * 100, 1)).rjust(4, ' ') + + +def just_percent_swap(num): + """ + """ + return str(round(num * 100, 1)).rjust(5, ' ') + + +def human(num, lenth): + """Convert KiB values to MiB values with right alignment""" + return str(round(num / 1024)).rjust(lenth, ' ') + + +def zram_stat(zram_id): + """ + Get zram state. 
+ + zram_id: str zram block-device id + returns bytes disksize, str mem_used_total + """ + try: + disksize = rline1('/sys/block/' + zram_id + '/disksize') + except FileNotFoundError: + return '0', '0' + if disksize == ['0\n']: + return '0', '0' + try: + mm_stat = rline1('/sys/block/' + zram_id + '/mm_stat').split(' ') + mm_stat_list = [] + for i in mm_stat: + if i != '': + mm_stat_list.append(i) + mem_used_total = mm_stat_list[2] + except FileNotFoundError: + mem_used_total = rline1('/sys/block/' + zram_id + '/mem_used_total') + return disksize, mem_used_total # BYTES, str + + +def send_notify_warn(): + """ + Look for process with maximum 'badness' and warn user with notification. + (implement Low memory warnings) + """ + log('Warning threshold exceeded') + + if check_warning_exe: + exe(warning_exe) + + else: + + title = 'Low memory' + + body = 'MemAvail: {}%\nSwapFree: {}%'.format( + round(mem_available / mem_total * 100), + round(swap_free / (swap_total + 0.1) * 100) + ) + + send_notification(title, body) + + +def send_notify(threshold, name, pid): + """ + Notificate about OOM Preventing. + + threshold: key for notify_sig_dict + name: str process name + pid: str process pid + """ + + title = 'Freeze prevention' + body = '{} [{}] {}'.format( + notify_sig_dict[threshold], + pid, + name.replace( + # symbol '&' can break notifications in some themes, + # therefore it is replaced by '*' + '&', '*' + ) + ) + + send_notification(title, body) + + +def send_notify_etc(pid, name, command): + """ + Notificate about OOM Preventing. 
+ + command: str command that will be executed + name: str process name + pid: str process pid + """ + title = 'Freeze prevention' + body = 'Victim is [{}] {}\nExecute the co' \ + 'mmand:\n{}'.format( + pid, name.replace('&', '*'), command.replace('&', '*')) + + send_notification(title, body) + + +def send_notification(title, body): + """ + """ + cmd = '{} "--euid={}" "--debug={}" "--title={}" "--body={}" &'.format( + notify_helper_path, + self_uid, + debug_gui_notifications, + title, + encoder(body)) + + go(exe, cmd) + + +def get_pid_list(): + """ + Find pid list expect kthreads and zombies + """ + pid_list = [] + for pid in os.listdir('/proc'): + if os.path.exists('/proc/' + pid + '/exe'): + pid_list.append(pid) + return pid_list + + +def get_non_decimal_pids(): + """ + """ + non_decimal_list = [] + for pid in pid_list: + if pid[0].isdecimal() is False: + non_decimal_list.append(pid) + return non_decimal_list + + +def find_victim(_print_proc_table): + """ + Find the process with highest badness and its badness adjustment + Return pid and badness + """ + + ft1 = time() + + pid_list = get_pid_list() + + pid_list.remove(self_pid) + + if '1' in pid_list: + pid_list.remove('1') + + non_decimal_list = get_non_decimal_pids() + + for i in non_decimal_list: + if i in pid_list: + pid_list.remove(i) + + pid_badness_list = [] + + if _print_proc_table: + + if extra_table_info == 'None': + extra_table_title = '' + + elif extra_table_info == 'cgroup_v1': + extra_table_title = 'CGroup_v1' + + elif extra_table_info == 'cgroup_v2': + extra_table_title = 'CGroup_v2' + + elif extra_table_info == 'cmdline': + extra_table_title = 'cmdline' + + elif extra_table_info == 'environ': + extra_table_title = 'environ' + + elif extra_table_info == 'realpath': + extra_table_title = 'realpath' + + else: + extra_table_title = '' + + hr = '#' * 107 + + log(hr) + log('# PID PPID badness oom_score oom_score_adj e' + 'UID S VmSize VmRSS VmSwap Name {}'.format( + extra_table_title)) + log('#------- 
------- ------- --------- ------------- -------' + '--- - ------ ----- ------ ---------------') + + for pid in pid_list: + + badness = pid_to_badness(pid)[0] + + if badness is None: + continue + + if _print_proc_table: + + try: + oom_score = rline1('/proc/' + pid + '/oom_score') + oom_score_adj = rline1('/proc/' + pid + '/oom_score_adj') + except FileNotFoundError: + continue + + if pid_to_status(pid) is None: + continue + else: + (name, state, ppid, uid, vm_size, vm_rss, + vm_swap) = pid_to_status(pid) + + if extra_table_info == 'None': + extra_table_line = '' + + elif extra_table_info == 'cgroup_v1': + extra_table_line = pid_to_cgroup_v1(pid) + + elif extra_table_info == 'cgroup_v2': + extra_table_line = pid_to_cgroup_v2(pid) + + elif extra_table_info == 'cmdline': + extra_table_line = pid_to_cmdline(pid) + + elif extra_table_info == 'environ': + extra_table_line = pid_to_environ(pid) + + elif extra_table_info == 'realpath': + extra_table_line = pid_to_realpath(pid) + + else: + extra_table_line = '' + + log('#{} {} {} {} {} {} {} {} {} {} {} {}'.format( + pid.rjust(7), + ppid.rjust(7), + str(badness).rjust(7), + oom_score.rjust(9), + oom_score_adj.rjust(13), + uid.rjust(10), + state, + str(vm_size).rjust(6), + str(vm_rss).rjust(5), + str(vm_swap).rjust(6), + name.ljust(15), + extra_table_line + ) + ) + + pid_badness_list.append((pid, badness)) + + real_proc_num = len(pid_badness_list) + + # Make list of (pid, badness) tuples, sorted by 'badness' values + # print(pid_badness_list) + pid_tuple_list = sorted( + pid_badness_list, + key=itemgetter(1), + reverse=True + )[0] + + pid = pid_tuple_list[0] + victim_id = get_victim_id(pid) + + # Get maximum 'badness' value + victim_badness = pid_tuple_list[1] + victim_name = pid_to_name(pid) + + if _print_proc_table: + log(hr) + + log('Found {} processes with existing /proc/[pid]/exe realpath'.format( + real_proc_num)) + + log( + 'Process with highest badness (found in {} ms):\n PID: {}, Na' + 'me: {}, badness: {}'.format( + 
round((time() - ft1) * 1000), + pid, + victim_name, + victim_badness + ) + ) + + return pid, victim_badness, victim_name, victim_id + + +def find_victim_info(pid, victim_badness, name): + """ + """ + status0 = time() + + try: + + with open('/proc/' + pid + '/status') as f: + + for n, line in enumerate(f): + + if n is state_index: + state = line.split('\t')[1].rstrip() + continue + + """ + if n is ppid_index: + # ppid = line.split('\t')[1] + continue + """ + + if n is uid_index: + uid = line.split('\t')[2] + continue + + if n is vm_size_index: + vm_size = kib_to_mib(int(line.split('\t')[1][:-4])) + continue + + if n is vm_rss_index: + vm_rss = kib_to_mib(int(line.split('\t')[1][:-4])) + continue + + if detailed_rss: + + if n is anon_index: + anon_rss = kib_to_mib( + int(line.split('\t')[1][:-4])) + continue + + if n is file_index: + file_rss = kib_to_mib( + int(line.split('\t')[1][:-4])) + continue + + if n is shmem_index: + shmem_rss = kib_to_mib( + int(line.split('\t')[1][:-4])) + continue + + if n is vm_swap_index: + vm_swap = kib_to_mib(int(line.split('\t')[1][:-4])) + break + + if print_victim_cmdline: + cmdline = pid_to_cmdline(pid) + oom_score = rline1('/proc/' + pid + '/oom_score') + oom_score_adj = rline1('/proc/' + pid + '/oom_score_adj') + + except FileNotFoundError: + log('The victim died in the search process: FileNotFoundError') + update_stat_dict_and_print( + 'The victim died in the search process: FileNotFoundError') + return None + except ProcessLookupError: + log('The victim died in the search process: ProcessLookupError') + update_stat_dict_and_print( + 'The victim died in the search process: ProcessLookupError') + return None + except UnicodeDecodeError: + + with open('/proc/' + pid + '/status', 'rb') as f: + f_list = f.read().decode('utf-8', 'ignore').split('\n') + + for i in range(len(f_list)): + + if i is state_index: + state = f_list[i].split('\t')[1].rstrip() + + """ + if i is ppid_index: + pass + # ppid = f_list[i].split('\t')[1] + """ + + 
if i is uid_index: + uid = f_list[i].split('\t')[2] + + if i is vm_size_index: + vm_size = kib_to_mib( + int(f_list[i].split('\t')[1][:-3])) + + if i is vm_rss_index: + vm_rss = kib_to_mib(int(f_list[i].split('\t')[1][:-3])) + + if detailed_rss: + + if i is anon_index: + anon_rss = kib_to_mib( + int(f_list[i].split('\t')[1][:-3])) + + if i is file_index: + file_rss = kib_to_mib( + int(f_list[i].split('\t')[1][:-3])) + + if i is shmem_index: + shmem_rss = kib_to_mib( + int(f_list[i].split('\t')[1][:-3])) + + if i is vm_swap_index: + vm_swap = kib_to_mib( + int(f_list[i].split('\t')[1][:-3])) + + if print_victim_cmdline: + cmdline = pid_to_cmdline(pid) + oom_score = rline1('/proc/' + pid + '/oom_score') + oom_score_adj = rline1('/proc/' + pid + '/oom_score_adj') + + except IndexError: + log('The victim died in the search process: IndexError') + update_stat_dict_and_print( + 'The victim died in the search process: IndexError') + return None + except ValueError: + log('The victim died in the search process: ValueError') + update_stat_dict_and_print( + 'The victim died in the search process: ValueError') + return None + except FileNotFoundError: + log('The victim died in the search process: FileNotFoundError') + update_stat_dict_and_print( + 'The victim died in the search process: FileNotFoundError') + return None + except ProcessLookupError: + log('The victim died in the search process: ProcessLookupError') + update_stat_dict_and_print( + 'The victim died in the search process: ProcessLookupError') + return None + + len_vm = len(str(vm_size)) + + try: + realpath = os.path.realpath('/proc/' + pid + '/exe') + victim_lifetime = format_time(uptime() - pid_to_starttime(pid)) + victim_cgroup_v1 = pid_to_cgroup_v1(pid) + victim_cgroup_v2 = pid_to_cgroup_v2(pid) + + except FileNotFoundError: + log('The victim died in the search process: FileNotFoundError') + update_stat_dict_and_print( + 'The victim died in the search process: FileNotFoundError') + return None + + ancestry = 
pid_to_ancestry(pid, max_victim_ancestry_depth) + + if print_victim_cmdline is False: + cmdline = '' + c1 = '' + else: + c1 = '\n Cmdline: ' + + if detailed_rss: + detailed_rss_info = ' (' \ + 'Anon: {} MiB, ' \ + 'File: {} MiB, ' \ + 'Shmem: {} MiB)'.format( + anon_rss, + file_rss, + shmem_rss) + else: + detailed_rss_info = '' + + victim_info = 'Victim status (found in {} ms):' \ + '\n Name: {}' \ + '\n State: {}' \ + '\n PID: {}' \ + '{}' \ + '\n EUID: {}' \ + '\n badness: {}, ' \ + 'oom_score: {}, ' \ + 'oom_score_adj: {}' \ + '\n VmSize: {} MiB' \ + '\n VmRSS: {} MiB {}' \ + '\n VmSwap: {} MiB' \ + '\n CGroup_v1: {}' \ + '\n CGroup_v2: {}' \ + '\n Realpath: {}' \ + '{}{}' \ + '\n Lifetime: {}'.format( + round((time() - status0) * 1000), + name, + state, + pid, + ancestry, + uid, + victim_badness, + oom_score, + oom_score_adj, + vm_size, + str(vm_rss).rjust(len_vm), + detailed_rss_info, + str(vm_swap).rjust(len_vm), + victim_cgroup_v1, + victim_cgroup_v2, + realpath, + c1, cmdline, + victim_lifetime) + + return victim_info + + +def check_mem_swap_ex(): + """ + Check: is mem and swap threshold exceeded? 
+ Return: None, (SIGTERM, meminfo), (SIGKILL, meminfo) + """ + + mem_available, swap_total, swap_free = check_mem_and_swap() + + # if hard_threshold_min_swap is set in percent + if swap_kill_is_percent: + hard_threshold_min_swap_kb = swap_total * \ + hard_threshold_min_swap_percent / 100.0 + else: + hard_threshold_min_swap_kb = swap_kb_dict['hard_threshold_min_swap_kb'] + + if swap_term_is_percent: + soft_threshold_min_swap_kb = swap_total * \ + soft_threshold_min_swap_percent / 100.0 + else: + soft_threshold_min_swap_kb = swap_kb_dict['soft_threshold_min_swap_kb'] + + if swap_warn_is_percent: + warning_threshold_min_swap_kb = swap_total * \ + warning_threshold_min_swap_percent / 100.0 + else: + warning_threshold_min_swap_kb = swap_kb_dict['warning_threshold_min_swap_kb'] + + if swap_total > hard_threshold_min_swap_kb: + swap_sigkill_pc = percent( + hard_threshold_min_swap_kb / (swap_total + 0.1)) + else: + swap_sigkill_pc = '-' + + if swap_total > soft_threshold_min_swap_kb: + swap_sigterm_pc = percent( + soft_threshold_min_swap_kb / (swap_total + 0.1)) + else: + swap_sigterm_pc = '-' + + if (mem_available <= hard_threshold_min_mem_kb and + swap_free <= hard_threshold_min_swap_kb): + + mem_info = 'Memory status that requ' \ + 'ires corrective actions (hard threshold exceeded):' \ + '\n MemAvailable [{} MiB, {} %] <= mem_min_sig' \ + 'kill [{} MiB, {} %]\n SwapFree [{} MiB, {} %] <= swa' \ + 'p_min_sigkill [{} MiB, {} %]'.format( + kib_to_mib(mem_available), + percent(mem_available / mem_total), + kib_to_mib(hard_threshold_min_mem_kb), + percent(hard_threshold_min_mem_kb / mem_total), + kib_to_mib(swap_free), + percent(swap_free / (swap_total + 0.1)), + kib_to_mib(hard_threshold_min_swap_kb), + swap_sigkill_pc) + + return (SIGKILL, mem_info, mem_available, hard_threshold_min_swap_kb, + soft_threshold_min_swap_kb, swap_free, swap_total) + + if (mem_available <= soft_threshold_min_mem_kb and + swap_free <= soft_threshold_min_swap_kb): + + mem_info = 'Memory status 
that requi' \ + 'res corrective actions (soft threshold exceeded):' \ + '\n MemAvailable [{} MiB, {} %] <= mem_min_sig' \ + 'term [{} MiB, {} %]\n SwapFree [{} MiB, {} %] <= swa' \ + 'p_min_sigterm [{} MiB, {} %]'.format( + kib_to_mib(mem_available), + percent(mem_available / mem_total), + kib_to_mib(soft_threshold_min_mem_kb), + round(soft_threshold_min_mem_percent, 1), + kib_to_mib(swap_free), + percent(swap_free / (swap_total + 0.1)), + kib_to_mib(soft_threshold_min_swap_kb), + swap_sigterm_pc) + + return (SIGTERM, mem_info, mem_available, hard_threshold_min_swap_kb, + soft_threshold_min_swap_kb, swap_free, swap_total) + + if low_memory_warnings_enabled: + + if (mem_available <= warning_threshold_min_mem_kb and swap_free <= + warning_threshold_min_swap_kb + 0.1): + return ('WARN', None, mem_available, hard_threshold_min_swap_kb, + soft_threshold_min_swap_kb, swap_free, swap_total) + + return (None, None, mem_available, hard_threshold_min_swap_kb, + soft_threshold_min_swap_kb, swap_free, swap_total) + + +def check_zram_ex(): + """ + """ + mem_used_zram = check_zram() + + if mem_used_zram >= hard_threshold_max_zram_kb: + + mem_info = 'Memory status that requir' \ + 'es corrective actions (hard threshold exceeded):' \ + '\n MemUsedZram [{} MiB, {} %] >= zram_max_sig' \ + 'kill [{} MiB, {} %]'.format( + kib_to_mib(mem_used_zram), + percent(mem_used_zram / mem_total), + kib_to_mib(hard_threshold_max_zram_kb), + percent(hard_threshold_max_zram_kb / mem_total)) + + return SIGKILL, mem_info, mem_used_zram + + if mem_used_zram >= soft_threshold_max_zram_kb: + + mem_info = 'Memory status that requires corrective actions (soft th' \ + 'reshold exceeded):\n MemUsedZram [{} MiB, {} %] >= zram_max_s' \ + 'igterm [{} M, {} %]'.format( + kib_to_mib(mem_used_zram), + percent(mem_used_zram / mem_total), + kib_to_mib(soft_threshold_max_zram_kb), + percent(soft_threshold_max_zram_kb / mem_total)) + + return SIGTERM, mem_info, mem_used_zram + + if low_memory_warnings_enabled: + if 
mem_used_zram >= warning_threshold_max_zram_kb: + return 'WARN', None, mem_used_zram + + return None, None, mem_used_zram + + +def check_psi_ex(psi_t0, psi_kill_exceeded_timer, psi_term_exceeded_timer, x0): + """ + """ + + delta0 = time() - x0 + x0 = time() + + psi_avg_value = find_psi_metrics_value(psi_path, psi_metrics) + # print(psi_avg_value) + + psi_post_action_delay_timer = time() - last_action_dict['t'] # psi_t0 + + if psi_post_action_delay_timer >= psi_post_action_delay: + psi_post_action_delay_exceeded = True + else: + psi_post_action_delay_exceeded = False + + if psi_avg_value >= hard_threshold_max_psi: + sigkill_psi_exceeded = True + psi_kill_exceeded_timer += delta0 + else: + sigkill_psi_exceeded = False + psi_kill_exceeded_timer = 0 + + if debug_psi: + + log('psi_post_action_delay_timer: {}'.format( + round(psi_post_action_delay_timer, 3))) + + log('psi_post_action_delay_exceeded: {}\nsigkill_psi_exceeded' + ': {}\npsi_kill_exceeded_timer: {}'.format( + psi_post_action_delay_exceeded, + sigkill_psi_exceeded, + round(psi_kill_exceeded_timer, 1) + ) + ) + + if (psi_kill_exceeded_timer >= psi_excess_duration and + psi_post_action_delay_exceeded): + + mem_info = 'PSI avg ({}) > hard_threshold_max_psi ({})\n' \ + 'PSI avg exceeded psi_excess_duration (value' \ + ' = {} sec) for {} seconds'.format( + psi_avg_value, + hard_threshold_max_psi, + psi_excess_duration, + round(psi_kill_exceeded_timer, 1) + ) + + return (SIGKILL, mem_info, psi_t0, psi_kill_exceeded_timer, + psi_term_exceeded_timer, x0) + + if psi_avg_value >= soft_threshold_max_psi: + sigterm_psi_exceeded = True + psi_term_exceeded_timer += delta0 + else: + sigterm_psi_exceeded = False + psi_term_exceeded_timer = 0 + + if debug_psi: + + log('sigterm_psi_exceeded: {}\n' + 'psi_term_exceeded_timer: {}\n'.format( + sigterm_psi_exceeded, + round(psi_term_exceeded_timer, 1) + ) + ) + + if (psi_term_exceeded_timer >= psi_excess_duration and + psi_post_action_delay_exceeded): + + mem_info = 'PSI avg ({}) 
> soft_threshold_max_psi ({})\n' \ + 'PSI avg exceeded psi_excess_duration (value' \ + ' = {} sec) for {} seconds'.format( + psi_avg_value, + soft_threshold_max_psi, + psi_excess_duration, + round(psi_term_exceeded_timer, 1) + ) + + return (SIGTERM, mem_info, psi_t0, psi_kill_exceeded_timer, + psi_term_exceeded_timer, x0) + + if low_memory_warnings_enabled: + + if psi_avg_value >= warning_threshold_max_psi: + return ('WARN', None, psi_t0, psi_kill_exceeded_timer, + psi_term_exceeded_timer, x0) + + return (None, None, psi_t0, psi_kill_exceeded_timer, + psi_term_exceeded_timer, x0) + + +def is_victim_alive(victim_id): + """ + We do not have a reliable sign of the end of the release of memory: + https://github.com/rfjakob/earlyoom/issues/128#issuecomment-507023717 + + Варианты возврата: + 0 X, nonexist, другой процесс (полн конец имплементации, можно не делать POST SIGKILL DELAY) + 1 rp true + 2 R освобождает память. Ждем смерти. + 3 Z возможно уже освободил память. Конец отслеживания + """ + + # Проверка целостности жертвы + starttime, pid = victim_id.split('_pid') + new_victim_id = get_victim_id(pid) + if victim_id != new_victim_id: + return 0 + + # Жива ли жертва? + exe_exists = os.path.exists('/proc/{}/exe'.format(pid)) + if exe_exists: + return 1 + + # далее жертва смертельно ранена. Дифференцируемся по State. + # R -> 2 # отслеживать жертву дальше + # X, FNFE, PLE -> 0 + + state = pid_to_state(pid) + + if state == 'R': + return 2 + + if state == 'Z': + return 3 + + if state == 'X' or state == '': + return 0 + + return 0 + + +def implement_corrective_action( + threshold, + mem_info_list, + psi_t0, + psi_kill_exceeded_timer, + psi_term_exceeded_timer, + x0, + psi_threshold, + zram_threshold, + zram_info, + psi_info): + + log('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>') + + debug_corrective_action = True + + time0 = time() + + # 1. Очистка словаря от мертвых. Итерация по словарю, отслеживание умирающих. + # 2. 
Итерация по оставшемуся словарю. Поиск дельт. Если хоть у одного + # дельта НЕ истекла - ЖДЕМ, выход из фции. + + # print(v_dict) + nu = [] + + for victim_id in v_dict: + iva = is_victim_alive(victim_id) + #print(iva, victim_id) + if iva == 0 or iva == 3: + nu.append(victim_id) + """ + continue + if iva == 1: + continue + if iva == 2: + pass # быстро отследить умирающего + """ + + for i in nu: + if debug_corrective_action: + log('Remove {} from v_dict'.format(i)) + v_dict.pop(i) + + x = False + cache_list = [] + #cache_list.append(('foo', 0.01)) + #cache_list.append(('boo', 1111.01)) + # 2 + # print(v_dict) + + for victim_id in v_dict: + tx = v_dict[victim_id]['time'] + ddt = time() - tx + if ddt < victim_cache_time: + + if debug_corrective_action: + log( + 'victim_cache_time is not exceeded for {} ({} < {})'.format( + victim_id, round(ddt, 3), victim_cache_time + ) + ) + x = True + cache_list.append((victim_id, ddt)) + break + + if x: + # print(cache_list) + e = sorted(cache_list, key=itemgetter(1), reverse=False) + cached_victim_id = e[0][0] + + for i in mem_info_list: + log(i) + + if x: + victim_id = cached_victim_id + pid = victim_id.partition('_pid')[2] + victim_badness = pid_to_badness(pid)[0] + name = v_dict[victim_id]['name'] + log('New victim is cached victim {} ({})'.format(pid, name)) + else: + pid, victim_badness, name, victim_id = find_victim(print_proc_table) + + log('Recheck memory levels...') + + (masf_threshold, masf_info, mem_available, hard_threshold_min_swap_kb, + soft_threshold_min_swap_kb, swap_free, swap_total) = check_mem_swap_ex() + + if CHECK_ZRAM: + zram_threshold, zram_info, mem_used_zram = check_zram_ex() + + if CHECK_PSI: + (psi_threshold, psi_info, psi_t0, psi_kill_exceeded_timer, + psi_term_exceeded_timer, x0) = check_psi_ex( + psi_t0, psi_kill_exceeded_timer, psi_term_exceeded_timer, x0) + + if (masf_threshold is SIGKILL or zram_threshold is SIGKILL or + psi_threshold is SIGKILL): + + new_threshold = SIGKILL + mem_info_list = [] + + 
if masf_threshold is SIGKILL or masf_threshold is SIGTERM: + mem_info_list.append(masf_info) + + if zram_threshold is SIGKILL or zram_threshold is SIGTERM: + mem_info_list.append(zram_info) + + if psi_threshold is SIGKILL or psi_threshold is SIGTERM: + mem_info_list.append(psi_info) + + elif (masf_threshold is SIGTERM or zram_threshold is SIGTERM or + psi_threshold is SIGTERM): + + new_threshold = SIGTERM + mem_info_list = [] + + if masf_threshold is SIGKILL or masf_threshold is SIGTERM: + mem_info_list.append(masf_info) + + if zram_threshold is SIGKILL or zram_threshold is SIGTERM: + mem_info_list.append(zram_info) + + if psi_threshold is SIGKILL or psi_threshold is SIGTERM: + mem_info_list.append(psi_info) + + else: + log('Thresholds is not exceeded now') + return psi_t0 + + for i in mem_info_list: + log(i) + + if new_threshold is None or new_threshold == 'WARN': + log('Thresholds is not exceeded now') + return psi_t0 + + threshold = new_threshold + + vwd = None # Victim Will Die + + if victim_badness >= min_badness: + + if threshold is SIGTERM: + if victim_id in v_dict: + dt = time() - v_dict[victim_id]['time'] + if dt > max_soft_exit_time: + log('max_soft_exit_time is exceeded: the ' + 'victim will get SIGKILL') + threshold = SIGKILL + else: + log('max_soft_exit_time is not exceeded (' + '{} < {}) for the victim'.format(round( + dt, 1), max_soft_exit_time)) + + if debug_sleep: + log('Sleep {} sec (over_sleep)'.format(over_sleep)) + sleep(over_sleep) + + log('<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<') + + return psi_t0 + + # log('Try to implement a corrective action...') + + if print_victim_status: + # victim badness ищи снова, не полагайся на старое + victim_info = find_victim_info(pid, victim_badness, name) + log(victim_info) + + soft_match = False + if soft_actions and threshold is SIGTERM: + name = pid_to_name(pid) + cgroup_v1 = pid_to_cgroup_v1(pid) + service = '' + cgroup_v1_tail = cgroup_v1.rpartition('/')[2] + if 
cgroup_v1_tail.endswith('.service'): + service = cgroup_v1_tail + for i in soft_actions_list: + unit = i[0] + if unit == 'name': + u = name + else: + u = cgroup_v1 + regexp = i[1] + command = i[2] + + if search(regexp, u) is not None: + log("Regexp '{}' matches with {} '{}'".format( + regexp, unit, u)) + soft_match = True + break + + if soft_match: + + cmd = command.replace('$PID', pid).replace('$NAME', pid_to_name( + pid)).replace('$SERVICE', service) + go(exe, cmd) + + """ + if exit_status == 0: + success = True + else: + success = False + """ + + response_time = time() - time0 + + exit_status = None + + preventing_oom_message = 'Implement a corrective act' \ + 'ion:\n Run the command: {}' \ + '\n Exit status: {}; total response ' \ + 'time: {} ms'.format( + cmd, + exit_status, + round(response_time * 1000)) + + else: + + try: + os.kill(int(pid), threshold) + + response_time = time() - time0 + + send_result = 'total response time: {} ms'.format( + round(response_time * 1000)) + + preventing_oom_message = 'Implement a corrective action:' \ + '\n Send {} to the victim; {}'.format( + sig_dict[threshold], send_result) + + # success = True + + if threshold is SIGKILL: + vwd = True + + except FileNotFoundError: + vwd = True + # success = False + # response_time = time() - time0 + # send_result = 'no such process; response time: {} ms'.format(round(response_time * 1000)) + key = 'The victim died in the search process: ' \ + 'FileNotFoundError' + except ProcessLookupError: + vwd = True + # success = False + # response_time = time() - time0 + # send_result = 'no such process; response time: {} ms'.format(round(response_time * 1000)) + key = 'The victim died in the search process: ' \ + 'ProcessLookupError' + + try: + log(preventing_oom_message) + except UnboundLocalError: + pass + # preventing_oom_message = key + + if not vwd: + if victim_id not in v_dict: + v_dict[victim_id] = dict() + v_dict[victim_id]['time'] = time() + v_dict[victim_id]['name'] = name + else: + pass + 
+ last_action_dict['t'] = kill_timestamp = time() + + # print(v_dict) + + # response_time = time() - time0 + + # log('success: ' + str(success)) + # log('victim will die: ' + str(vwd)) + # log('response_time: ' + str(response_time) + ' sec') + + # НАЧАЛО ОТСЛЕЖИВАНИЯ СОСТОЯНИЯ ЖЕРТВЫ. Можно вынести в отд фц. Приним + # айди, логирует, возвращает что-то. + + # Далее поработать со словарями. Жертва тут умерла - сброс таймера. Все + # старые жертвы умерли до 3х секунд с следующих циклах - сброс таймера. + # После этого все должно быть супер охуенно. + + while True: + sleep(0.005) + d = time() - kill_timestamp + #print('Прошло времени:', d) + iva = is_victim_alive(victim_id) + + if iva == 0: + + log('The victim died in {} sec'.format(round(d, 3))) + + if victim_id in v_dict: + v_dict.pop(victim_id) + break + + elif iva == 1: + #print('Жива и занимает память') + if not vwd and d > sensitivity_test_time: + + log("The victim doesn't respond on corrective action in {} sec".format( + round(d, 3))) + + break + + elif iva == 2: + pass + #print('Смертельно ранена и освобождает память. Дождаться окончания освобождения памяти.') + + else: # 3 + #print('Z и быстро освобождает память, если еще не. 
Поспать немножно и выйти из цикла.') + + log('The victim became a zombie in {} sec'.format(round(d, 3))) + + if victim_id in v_dict: + v_dict.pop(victim_id) + sleep(post_zombie_delay) + break + + mem_available, swap_total, swap_free = check_mem_and_swap() + ma_mib = int(mem_available) / 1024.0 + sf_mib = int(swap_free) / 1024.0 + log('Memory status after implementing a corrective act' + 'ion:\n MemAvailable' + ': {} MiB, SwapFree: {} MiB'.format( + round(ma_mib, 1), round(sf_mib, 1))) + + if soft_match is False: + key = 'Send {} to {}'.format(sig_dict[threshold], name) + update_stat_dict_and_print(key) + else: + key = "Run the command '{}'".format(command) + update_stat_dict_and_print(key) + + if threshold is SIGKILL and post_kill_exe != '': + + cmd = post_kill_exe.replace('$PID', pid).replace( + '$NAME', pid_to_name(pid)) + + log('Execute post_kill_exe') + + go(exe, cmd) + + if post_action_gui_notifications: + if soft_match: + send_notify_etc(pid, name, cmd) + else: + send_notify(threshold, name, pid) + + else: + + response_time = time() - time0 + victim_badness_is_too_small = 'victim badness ({}) < min_b' \ + 'adness ({}); nothing to do; response time: {} ms'.format( + victim_badness, + min_badness, + round(response_time * 1000)) + + log(victim_badness_is_too_small) + + # update stat_dict + key = 'victim badness < min_badness' + update_stat_dict_and_print(key) + + if vwd is None: + + if debug_sleep: + log('Sleep {} sec (over_sleep)'.format(over_sleep)) + sleep(over_sleep) + + log('<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<') + + return psi_t0 + + +def sleep_after_check_mem(): + """Specify sleep times depends on rates and avialable memory.""" + + if stable_sleep: + + if debug_sleep: + log('Sleep {} sec'.format(min_sleep)) + stdout.flush() + sleep(min_sleep) + return None + + if hard_threshold_min_mem_kb < soft_threshold_min_mem_kb: + mem_point = mem_available - soft_threshold_min_mem_kb + else: + mem_point = mem_available - 
hard_threshold_min_mem_kb + + if hard_threshold_min_swap_kb < soft_threshold_min_swap_kb: + swap_point = swap_free - soft_threshold_min_swap_kb + else: + swap_point = swap_free - hard_threshold_min_swap_kb + + if swap_point < 0: + swap_point = 0 + + if mem_point < 0: + mem_point = 0 + + t_mem = mem_point / fill_rate_mem + t_swap = swap_point / fill_rate_swap + + if CHECK_ZRAM: + t_zram = (mem_total * 0.8 - mem_used_zram) / fill_rate_zram + if t_zram < 0: + t_zram = 0 + t_mem_zram = t_mem + t_zram + z = ', t_zram={}'.format(round(t_zram, 2)) + else: + z = '' + + t_mem_swap = t_mem + t_swap + + if CHECK_ZRAM: + + if t_mem_swap <= t_mem_zram: + t = t_mem_swap + else: + t = t_mem_zram + else: + t = t_mem_swap + + if t > max_sleep: + t = max_sleep + elif t < min_sleep: + t = min_sleep + else: + pass + + if debug_sleep: + log('Sleep {} sec (t_mem={}, t_swap={}{})'.format(round(t, 2), round( + t_mem, 2), round(t_swap, 2), z)) + + try: + stdout.flush() + except OSError: + pass + + sleep(t) + + +def calculate_percent(arg_key): + """ + parse conf dict + Calculate mem_min_KEY_percent. + + Try use this one) + arg_key: str key for config_dict + returns int mem_min_percent or NoneType if got some error + """ + + if arg_key in config_dict: + mem_min = config_dict[arg_key] + + if mem_min.endswith('%'): + # truncate percents, so we have a number + mem_min_percent = mem_min[:-1].strip() + # then 'float test' + mem_min_percent = string_to_float_convert_test(mem_min_percent) + if mem_min_percent is None: + errprint('Invalid {} value, not float\nExit'.format(arg_key)) + exit(1) + # Final validations... + if mem_min_percent < 0 or mem_min_percent > 100: + errprint( + '{}, as percents value, out of ran' + 'ge [0; 100]\nExit'.format(arg_key)) + exit(1) + + # soft_threshold_min_mem_percent is clean and valid float percentage. 
Can + # translate into Kb + mem_min_kb = mem_min_percent / 100 * mem_total + mem_min_mb = round(mem_min_kb / 1024) + + elif mem_min.endswith('M'): + mem_min_mb = string_to_float_convert_test(mem_min[:-1].strip()) + if mem_min_mb is None: + errprint('Invalid {} value, not float\nExit'.format(arg_key)) + exit(1) + mem_min_kb = mem_min_mb * 1024 + if mem_min_kb > mem_total: + errprint( + '{} value can not be greater then MemT' + 'otal ({} MiB)\nExit'.format( + arg_key, round( + mem_total / 1024))) + exit(1) + mem_min_percent = mem_min_kb / mem_total * 100 + + else: + log('Invalid {} units in config.\n Exit'.format(arg_key)) + exit(1) + mem_min_percent = None + + else: + log('{} not in config\nExit'.format(arg_key)) + exit(1) + mem_min_percent = None + + return mem_min_kb, mem_min_mb, mem_min_percent + + +########################################################################## + + +# {victim_id : {'time': timestamp, 'name': name} +v_dict = dict() + + +start_time = time() + + +help_mess = """usage: nohang [-h] [-v] [-p] [-c CONFIG] [-cc CONFIG] + +optional arguments: + -h, --help show this help message and exit + -v, --version print version + -p, --print-proc-table + print table of processes with their badness values + -c CONFIG, --config CONFIG + path to the config file, default values: + ./nohang.conf, /etc/nohang/nohang.conf + -cc CONFIG, --check-config CONFIG + check and print config""" + + +SC_CLK_TCK = os.sysconf(os.sysconf_names['SC_CLK_TCK']) + +SC_PAGESIZE = os.sysconf(os.sysconf_names['SC_PAGESIZE']) + +conf_err_mess = 'Invalid config. Exit.' 
+ +sig_list = [SIGTERM, SIGINT, SIGQUIT, SIGHUP] + +sig_dict = { + SIGKILL: 'SIGKILL', + SIGINT: 'SIGINT', + SIGQUIT: 'SIGQUIT', + SIGHUP: 'SIGHUP', + SIGTERM: 'SIGTERM' +} + +self_pid = str(os.getpid()) + +self_uid = os.geteuid() + +if self_uid == 0: + root = True +else: + root = False + + +if os.path.exists('./nohang_notify_helper'): + notify_helper_path = './nohang_notify_helper' +else: + notify_helper_path = 'nohang_notify_helper' + + +last_action_dict = dict() + +last_action_dict['t'] = time() + + +# will store corrective actions stat +stat_dict = dict() + + +separate_log = False # will be overwritten after parse config + + +cgroup_v1_index, cgroup_v2_index = find_cgroup_indexes() + + +self_oom_score_adj_min = '-600' +self_oom_score_adj_max = '-6' + + +write_self_oom_score_adj(self_oom_score_adj_min) + + +pid_list = get_pid_list() + + +print_proc_table_flag = False + +check_config_flag = False + + +if os.path.exists('./nohang.conf'): + config = os.getcwd() + '/nohang.conf' +else: + config = '/etc/nohang/nohang.conf' + + +if len(argv) == 1: + pass +elif len(argv) == 2: + if argv[1] == '--help' or argv[1] == '-h': + print(help_mess) + exit() + elif argv[1] == '--check-config' or argv[1] == '-cc': + check_config_flag = True + elif argv[1] == '--version' or argv[1] == '-v': + print_version() + elif argv[1] == '--print-proc-table' or argv[1] == '-p': + print_proc_table_flag = True + if os.path.exists('./nohang.conf'): + config = os.getcwd() + '/nohang.conf' + else: + config = '/etc/nohang/nohang.conf' + else: + errprint('Unknown option: {}'.format(argv[1])) + exit(1) +elif len(argv) == 3: + if argv[1] == '--config' or argv[1] == '-c': + config = argv[2] + elif argv[1] == '--check-config' or argv[1] == '-cc': + config = argv[2] + check_config_flag = True + else: + errprint('Unknown option: {}'.format(argv[1])) + exit(1) +else: + errprint('Invalid CLI input: too many options') + exit(1) + + +# find mem_total +# find positions of SwapFree and SwapTotal in 
/proc/meminfo + +with open('/proc/meminfo') as f: + mem_list = f.readlines() + +mem_list_names = [] +for s in mem_list: + mem_list_names.append(s.split(':')[0]) + +if mem_list_names[2] != 'MemAvailable': + errprint('WARNING: Your Linux kernel is too old, Linux 3.14+ requied') + exit(1) + +swap_total_index = mem_list_names.index('SwapTotal') +swap_free_index = swap_total_index + 1 + +mem_total = int(mem_list[0].split(':')[1][:-4]) + +# Get names from /proc/*/status to be able to get VmRSS and VmSwap values + +with open('/proc/self/status') as file: + status_list = file.readlines() + +status_names = [] +for s in status_list: + status_names.append(s.split(':')[0]) + +ppid_index = status_names.index('PPid') +vm_size_index = status_names.index('VmSize') +vm_rss_index = status_names.index('VmRSS') +vm_swap_index = status_names.index('VmSwap') +uid_index = status_names.index('Uid') +state_index = status_names.index('State') + + +try: + anon_index = status_names.index('RssAnon') + file_index = status_names.index('RssFile') + shmem_index = status_names.index('RssShmem') + detailed_rss = True + # print(detailed_rss, 'detailed_rss') +except ValueError: + detailed_rss = False + # print('It is not Linux 4.5+') + + +log('config: ' + config) + + +########################################################################## + +# parsing the config with obtaining the parameters dictionary + +# conf_parameters_dict +# conf_restart_dict + +# dictionary with config options +config_dict = dict() + +badness_adj_re_name_list = [] +badness_adj_re_cmdline_list = [] +badness_adj_re_environ_list = [] +badness_adj_re_uid_list = [] +badness_adj_re_cgroup_v1_list = [] +badness_adj_re_cgroup_v2_list = [] +badness_adj_re_realpath_list = [] + +soft_actions_list = [] + +# separator for optional parameters (that starts with @) +opt_separator = '///' + +# stupid conf parsing, need refactoring +try: + with open(config) as f: + + for line in f: + + a = line.startswith('#') + b = line.startswith('\n') + c 
= line.startswith('\t') + d = line.startswith(' ') + + etc = line.startswith('@SOFT_ACTION_RE_NAME') + etc2 = line.startswith('@SOFT_ACTION_RE_CGROUP_V1') + + if not a and not b and not c and not d and not etc and not etc2: + a = line.partition('=') + + key = a[0].strip() + value = a[2].strip() + + if key not in config_dict: + config_dict[key] = value + else: + log('ERROR: config key duplication: {}'.format(key)) + exit(1) + + if etc: + + a = line.partition('@SOFT_ACTION_RE_NAME')[ + 2].partition(opt_separator) + + a1 = 'name' + + a2 = a[0].strip() + valid_re(a2) + + a3 = a[2].strip() + + zzz = (a1, a2, a3) + + soft_actions_list.append(zzz) + + if etc2: + + a = line.partition('@SOFT_ACTION_RE_CGROUP_V1')[ + 2].partition(opt_separator) + + a1 = 'cgroup_v1' + + a2 = a[0].strip() + valid_re(a2) + + a3 = a[2].strip() + + zzz = (a1, a2, a3) + + soft_actions_list.append(zzz) + + if line.startswith('@BADNESS_ADJ_RE_NAME'): + a = line.partition('@BADNESS_ADJ_RE_NAME')[2].strip( + ' \n').partition(opt_separator) + badness_adj = a[0].strip(' ') + reg_exp = a[2].strip(' ') + valid_re(reg_exp) + badness_adj_re_name_list.append((badness_adj, reg_exp)) + + if line.startswith('@BADNESS_ADJ_RE_CMDLINE'): + a = line.partition('@BADNESS_ADJ_RE_CMDLINE')[2].strip( + ' \n').partition(opt_separator) + badness_adj = a[0].strip(' ') + reg_exp = a[2].strip(' ') + valid_re(reg_exp) + badness_adj_re_cmdline_list.append((badness_adj, reg_exp)) + + if line.startswith('@BADNESS_ADJ_RE_UID'): + a = line.partition('@BADNESS_ADJ_RE_UID')[2].strip( + ' \n').partition(opt_separator) + badness_adj = a[0].strip(' ') + reg_exp = a[2].strip(' ') + valid_re(reg_exp) + badness_adj_re_uid_list.append((badness_adj, reg_exp)) + + if line.startswith('@BADNESS_ADJ_RE_CGROUP_V1'): + a = line.partition('@BADNESS_ADJ_RE_CGROUP_V1')[2].strip( + ' \n').partition(opt_separator) + badness_adj = a[0].strip(' ') + reg_exp = a[2].strip(' ') + valid_re(reg_exp) + badness_adj_re_cgroup_v1_list.append((badness_adj, 
reg_exp)) + + if line.startswith('@BADNESS_ADJ_RE_CGROUP_V2'): + a = line.partition('@BADNESS_ADJ_RE_CGROUP_V2')[2].strip( + ' \n').partition(opt_separator) + badness_adj = a[0].strip(' ') + reg_exp = a[2].strip(' ') + valid_re(reg_exp) + badness_adj_re_cgroup_v2_list.append((badness_adj, reg_exp)) + + if line.startswith('@BADNESS_ADJ_RE_REALPATH'): + a = line.partition('@BADNESS_ADJ_RE_REALPATH')[2].strip( + ' \n').partition(opt_separator) + badness_adj = a[0].strip(' ') + reg_exp = a[2].strip(' ') + valid_re(reg_exp) + badness_adj_re_realpath_list.append((badness_adj, reg_exp)) + + if line.startswith('@BADNESS_ADJ_RE_ENVIRON'): + a = line.partition('@BADNESS_ADJ_RE_ENVIRON')[2].strip( + ' \n').partition(opt_separator) + badness_adj = a[0].strip(' ') + reg_exp = a[2].strip(' ') + valid_re(reg_exp) + badness_adj_re_environ_list.append((badness_adj, reg_exp)) + + +except PermissionError: + errprint('PermissionError', conf_err_mess) + exit(1) +except UnicodeDecodeError: + errprint('UnicodeDecodeError', conf_err_mess) + exit(1) +except IsADirectoryError: + errprint('IsADirectoryError', conf_err_mess) + exit(1) +except IndexError: + errprint('IndexError', conf_err_mess) + exit(1) +except FileNotFoundError: + errprint('FileNotFoundError', conf_err_mess) + exit(1) + + +if badness_adj_re_name_list == []: + regex_matching = False +else: + regex_matching = True + + +if badness_adj_re_cmdline_list == []: + re_match_cmdline = False +else: + re_match_cmdline = True + + +if badness_adj_re_uid_list == []: + re_match_uid = False +else: + re_match_uid = True + + +if badness_adj_re_environ_list == []: + re_match_environ = False +else: + re_match_environ = True + + +if badness_adj_re_realpath_list == []: + re_match_realpath = False +else: + re_match_realpath = True + + +if badness_adj_re_cgroup_v1_list == []: + re_match_cgroup_v1 = False +else: + re_match_cgroup_v1 = True + + +if badness_adj_re_cgroup_v2_list == []: + re_match_cgroup_v2 = False +else: + re_match_cgroup_v2 = True + + 
+if soft_actions_list == []: + soft_actions = False +else: + soft_actions = True + + +########################################################################## + + +# post_zombie_delay = 0.1 + +# victim_cache_time = 50 + + +# extracting parameters from the dictionary +# check for all necessary parameters +# validation of all parameters +debug_psi = conf_parse_bool('debug_psi') +print_statistics = conf_parse_bool('print_statistics') +print_proc_table = conf_parse_bool('print_proc_table') +forbid_negative_badness = conf_parse_bool('forbid_negative_badness') +print_victim_status = conf_parse_bool('print_victim_status') +print_victim_cmdline = conf_parse_bool('print_victim_cmdline') +print_config_at_startup = conf_parse_bool('print_config_at_startup') +print_mem_check_results = conf_parse_bool('print_mem_check_results') +debug_sleep = conf_parse_bool('debug_sleep') +low_memory_warnings_enabled = conf_parse_bool('low_memory_warnings_enabled') +post_action_gui_notifications = conf_parse_bool( + 'post_action_gui_notifications') + + +psi_checking_enabled = conf_parse_bool('psi_checking_enabled') +ignore_psi = not psi_checking_enabled + +zram_checking_enabled = conf_parse_bool('zram_checking_enabled') +ignore_zram = not zram_checking_enabled + + +debug_gui_notifications = conf_parse_bool('debug_gui_notifications') +ignore_positive_oom_score_adj = conf_parse_bool( + 'ignore_positive_oom_score_adj') + + +(soft_threshold_min_mem_kb, soft_threshold_min_mem_mb, + soft_threshold_min_mem_percent) = calculate_percent('soft_threshold_min_mem') + +(hard_threshold_min_mem_kb, hard_threshold_min_mem_mb, + hard_threshold_min_mem_percent) = calculate_percent('hard_threshold_min_mem') + +(soft_threshold_max_zram_kb, soft_threshold_max_zram_mb, + soft_threshold_max_zram_percent) = calculate_percent('soft_threshold_max_zram') + +(hard_threshold_max_zram_kb, hard_threshold_max_zram_mb, + hard_threshold_max_zram_percent) = calculate_percent('hard_threshold_max_zram') + 
+(warning_threshold_min_mem_kb, warning_threshold_min_mem_mb, + warning_threshold_min_mem_percent) = calculate_percent('warning_threshold_min_mem') + +(warning_threshold_max_zram_kb, warning_threshold_max_zram_mb, + warning_threshold_max_zram_percent) = calculate_percent('warning_threshold_max_zram') + + +if 'post_zombie_delay' in config_dict: + post_zombie_delay = string_to_float_convert_test( + config_dict['post_zombie_delay']) + if post_zombie_delay is None: + errprint('Invalid post_zombie_delay, not float\nExit') + exit(1) + if post_zombie_delay < 0: + errprint('post_zombie_delay MUST be >= 0\nExit') + exit(1) +else: + errprint('post_zombie_delay not in config\nExit') + exit(1) + + +if 'victim_cache_time' in config_dict: + victim_cache_time = string_to_float_convert_test( + config_dict['victim_cache_time']) + if victim_cache_time is None: + errprint('Invalid victim_cache_time, not float\nExit') + exit(1) + if victim_cache_time < 0: + errprint('victim_cache_time MUST be >= 0\nExit') + exit(1) +else: + errprint('victim_cache_time not in config\nExit') + exit(1) + + +if 'fill_rate_mem' in config_dict: + fill_rate_mem = string_to_float_convert_test(config_dict['fill_rate_mem']) + if fill_rate_mem is None: + errprint('Invalid fill_rate_mem value, not float\nExit') + exit(1) + if fill_rate_mem <= 0: + errprint('fill_rate_mem MUST be > 0\nExit') + exit(1) +else: + errprint('fill_rate_mem not in config\nExit') + exit(1) + + +if 'fill_rate_swap' in config_dict: + fill_rate_swap = string_to_float_convert_test( + config_dict['fill_rate_swap']) + if fill_rate_swap is None: + errprint('Invalid fill_rate_swap value, not float\nExit') + exit(1) + if fill_rate_swap <= 0: + errprint('fill_rate_swap MUST be > 0\nExit') + exit(1) +else: + errprint('fill_rate_swap not in config\nExit') + exit(1) + + +if 'fill_rate_zram' in config_dict: + fill_rate_zram = string_to_float_convert_test( + config_dict['fill_rate_zram']) + if fill_rate_zram is None: + errprint('Invalid fill_rate_zram 
value, not float\nExit') + exit(1) + if fill_rate_zram <= 0: + errprint('fill_rate_zram MUST be > 0\nExit') + exit(1) +else: + errprint('fill_rate_zram not in config\nExit') + exit(1) + + +if 'soft_threshold_min_swap' in config_dict: + soft_threshold_min_swap = config_dict['soft_threshold_min_swap'] +else: + errprint('soft_threshold_min_swap not in config\nExit') + exit(1) + + +if 'hard_threshold_min_swap' in config_dict: + hard_threshold_min_swap = config_dict['hard_threshold_min_swap'] +else: + errprint('hard_threshold_min_swap not in config\nExit') + exit(1) + + +if 'post_soft_action_delay' in config_dict: + post_soft_action_delay = string_to_float_convert_test( + config_dict['post_soft_action_delay']) + if post_soft_action_delay is None: + errprint('Invalid post_soft_action_delay value, not float\nExit') + exit(1) + if post_soft_action_delay < 0: + errprint('post_soft_action_delay must be positiv\nExit') + exit(1) +else: + errprint('post_soft_action_delay not in config\nExit') + exit(1) + + +if 'psi_post_action_delay' in config_dict: + psi_post_action_delay = string_to_float_convert_test( + config_dict['psi_post_action_delay']) + if psi_post_action_delay is None: + errprint('Invalid psi_post_action_delay value, not float\nExit') + exit(1) + if psi_post_action_delay < 0: + errprint('psi_post_action_delay must be positive\nExit') + exit(1) +else: + errprint('psi_post_action_delay not in config\nExit') + exit(1) + + +if 'hard_threshold_max_psi' in config_dict: + hard_threshold_max_psi = string_to_float_convert_test( + config_dict['hard_threshold_max_psi']) + if hard_threshold_max_psi is None: + errprint('Invalid hard_threshold_max_psi value, not float\nExit') + exit(1) + if hard_threshold_max_psi < 0 or hard_threshold_max_psi > 100: + errprint('hard_threshold_max_psi must be in the range [0; 100]\nExit') + exit(1) +else: + errprint('hard_threshold_max_psi not in config\nExit') + exit(1) + + +if 'soft_threshold_max_psi' in config_dict: + soft_threshold_max_psi = 
string_to_float_convert_test( + config_dict['soft_threshold_max_psi']) + if soft_threshold_max_psi is None: + errprint('Invalid soft_threshold_max_psi value, not float\nExit') + exit(1) + if soft_threshold_max_psi < 0 or soft_threshold_max_psi > 100: + errprint('soft_threshold_max_psi must be in the range [0; 100]\nExit') + exit(1) +else: + errprint('soft_threshold_max_psi not in config\nExit') + exit(1) + + +if 'warning_threshold_max_psi' in config_dict: + warning_threshold_max_psi = string_to_float_convert_test( + config_dict['warning_threshold_max_psi']) + if warning_threshold_max_psi is None: + errprint('Invalid warning_threshold_max_psi value, not float\nExit') + exit(1) + if warning_threshold_max_psi < 0 or warning_threshold_max_psi > 100: + errprint( + 'warning_threshold_max_psi must be in the range [0; 100]\nExit') + exit(1) +else: + errprint('warning_threshold_max_psi not in config\nExit') + exit(1) + + +if 'min_badness' in config_dict: + min_badness = string_to_int_convert_test( + config_dict['min_badness']) + if min_badness is None: + errprint('Invalid min_badness value, not integer\nExit') + exit(1) + if min_badness < 0 or min_badness > 1000: + errprint('Invalud min_badness value\nExit') + exit(1) +else: + errprint('min_badness not in config\nExit') + exit(1) + + +if 'min_post_warning_delay' in config_dict: + min_post_warning_delay = string_to_float_convert_test( + config_dict['min_post_warning_delay']) + if min_post_warning_delay is None: + errprint('Invalid min_post_warning_delay value, not float\nExit') + exit(1) + if min_post_warning_delay < 1 or min_post_warning_delay > 300: + errprint('min_post_warning_delay value out of range [1; 300]\nExit') + exit(1) +else: + errprint('min_post_warning_delay not in config\nExit') + exit(1) + + +if 'warning_threshold_min_swap' in config_dict: + warning_threshold_min_swap = config_dict['warning_threshold_min_swap'] +else: + errprint('warning_threshold_min_swap not in config\nExit') + exit(1) + + +if 
'max_victim_ancestry_depth' in config_dict: + max_victim_ancestry_depth = string_to_int_convert_test( + config_dict['max_victim_ancestry_depth']) + if max_victim_ancestry_depth is None: + errprint('Invalid max_victim_ancestry_depth value, not integer\nExit') + exit(1) + if max_victim_ancestry_depth < 1: + errprint('Invalud max_victim_ancestry_depth value\nExit') + exit(1) +else: + errprint('max_victim_ancestry_depth is not in config\nExit') + exit(1) + + +if 'max_soft_exit_time' in config_dict: + max_soft_exit_time = string_to_float_convert_test( + config_dict['max_soft_exit_time']) + if max_soft_exit_time is None: + errprint('Invalid max_soft_exit_time val' + 'ue, not float\nExit') + exit(1) + if max_soft_exit_time < 0: + errprint('max_soft_exit_time must be non-n' + 'egative number\nExit') + exit(1) +else: + errprint('max_soft_exit_time is not in config\nExit') + exit(1) + + +if 'post_kill_exe' in config_dict: + post_kill_exe = config_dict['post_kill_exe'] +else: + errprint('post_kill_exe is not in config\nExit') + exit(1) + + +if 'psi_path' in config_dict: + psi_path = config_dict['psi_path'] +else: + errprint('psi_path is not in config\nExit') + exit(1) + + +if 'psi_metrics' in config_dict: + psi_metrics = config_dict['psi_metrics'] +else: + errprint('psi_metrics is not in config\nExit') + exit(1) + + +if 'warning_exe' in config_dict: + warning_exe = config_dict['warning_exe'] + if warning_exe != '': + check_warning_exe = True + else: + check_warning_exe = False +else: + errprint('warning_exe is not in config\nExit') + exit(1) + + +if 'extra_table_info' in config_dict: + extra_table_info = config_dict['extra_table_info'] + if (extra_table_info != 'None' and + extra_table_info != 'cgroup_v1' and + extra_table_info != 'cgroup_v2' and + extra_table_info != 'cmdline' and + extra_table_info != 'environ' and + extra_table_info != 'realpath'): + + errprint('Invalid config: invalid extra_table_info value\nExit') + exit(1) +else: + errprint('Invalid config: extra_table_info is not in
config\nExit') + exit(1) + + +separate_log = conf_parse_bool('separate_log') + +if separate_log: + + import logging + + log_dir = '/var/log/nohang' + + try: + os.mkdir(log_dir) + except PermissionError: + print('ERROR: can not create log dir') + except FileExistsError: + pass + + logfile = log_dir + '/nohang.log' + + try: + with open(logfile, 'a') as f: + pass + except FileNotFoundError: + print('ERROR: log FileNotFoundError') + except PermissionError: + print('ERROR: log PermissionError') + + try: + logging.basicConfig( + filename=logfile, + level=logging.INFO, + format="%(asctime)s: %(message)s") + except PermissionError: + errprint('ERROR: Permission denied: {}'.format(logfile)) + except FileNotFoundError: + errprint('ERROR: FileNotFoundError: {}'.format(logfile)) + + +if 'min_mem_report_interval' in config_dict: + min_mem_report_interval = string_to_float_convert_test( + config_dict['min_mem_report_interval']) + if min_mem_report_interval is None: + errprint('Invalid min_mem_report_interval value, not float\nExit') + exit(1) + if min_mem_report_interval < 0: + errprint('min_mem_report_interval must be non-negative number\nExit') + exit(1) +else: + errprint('min_mem_report_interval is not in config\nExit') + exit(1) + + +if 'psi_excess_duration' in config_dict: + psi_excess_duration = string_to_float_convert_test( + config_dict['psi_excess_duration']) + if psi_excess_duration is None: + errprint('Invalid psi_excess_duration value, not float\nExit') + exit(1) + if psi_excess_duration < 0: + errprint('psi_excess_duration must be non-negative number\nExit') + exit(1) +else: + errprint('psi_excess_duration is not in config\nExit') + exit(1) + + +if 'max_sleep' in config_dict: + max_sleep = string_to_float_convert_test( + config_dict['max_sleep']) + if max_sleep is None: + errprint('Invalid max_sleep value, not float\nExit') + exit(1) + if max_sleep <= 0: + errprint('max_sleep must be positive number\nExit') + exit(1) +else: + errprint('max_sleep is not in 
config\nExit') + exit(1) + + +if 'min_sleep' in config_dict: + min_sleep = string_to_float_convert_test( + config_dict['min_sleep']) + if min_sleep is None: + errprint('Invalid min_sleep value, not float\nExit') + exit(1) + if min_sleep <= 0: + errprint('min_sleep must be positive number\nExit') + exit(1) +else: + errprint('min_sleep is not in config\nExit') + exit(1) + + +if 'over_sleep' in config_dict: + over_sleep = string_to_float_convert_test( + config_dict['over_sleep']) + if over_sleep is None: + errprint('Invalid over_sleep value, not float\nExit') + exit(1) + if over_sleep <= 0: + errprint('over_sleep must be positive number\nExit') + exit(1) +else: + errprint('over_sleep is not in config\nExit') + exit(1) + + +sensitivity_test_time = over_sleep / 2 + + +if max_sleep < min_sleep: + errprint('min_sleep value must not exceed max_sleep value.\nExit') + exit(1) + + +if min_sleep < over_sleep: + errprint('over_sleep value must not exceed min_sleep value.\nExit') + exit(1) + + +if max_sleep == min_sleep: + stable_sleep = True +else: + stable_sleep = False + + +if print_proc_table_flag: + + if not root: + log('WARNING: effective UID != 0; euid={}; processes with other e' + 'uids will be invisible for nohang'.format(self_uid)) + + func_print_proc_table() + + +########################################################################## + + +psi_support = os.path.exists(psi_path) + + +########################################################################## + +# Get KiB levels if it's possible. 
+ +soft_threshold_min_swap_tuple = get_swap_threshold_tuple( + soft_threshold_min_swap) +hard_threshold_min_swap_tuple = get_swap_threshold_tuple( + hard_threshold_min_swap) +warning_threshold_min_swap_tuple = get_swap_threshold_tuple( + warning_threshold_min_swap) + + +swap_kb_dict = dict() + +swap_term_is_percent = soft_threshold_min_swap_tuple[1] +if swap_term_is_percent: + soft_threshold_min_swap_percent = soft_threshold_min_swap_tuple[0] +else: + soft_threshold_min_swap_kb = soft_threshold_min_swap_tuple[0] + swap_kb_dict['soft_threshold_min_swap_kb'] = soft_threshold_min_swap_kb + +swap_kill_is_percent = hard_threshold_min_swap_tuple[1] +if swap_kill_is_percent: + hard_threshold_min_swap_percent = hard_threshold_min_swap_tuple[0] +else: + hard_threshold_min_swap_kb = hard_threshold_min_swap_tuple[0] + swap_kb_dict['hard_threshold_min_swap_kb'] = hard_threshold_min_swap_kb + + +swap_warn_is_percent = warning_threshold_min_swap_tuple[1] +if swap_warn_is_percent: + warning_threshold_min_swap_percent = warning_threshold_min_swap_tuple[0] +else: + warning_threshold_min_swap_kb = warning_threshold_min_swap_tuple[0] + swap_kb_dict['warning_threshold_min_swap_kb'] = warning_threshold_min_swap_kb + + +########################################################################## + + +if print_config_at_startup or check_config_flag: + check_config() + + +########################################################################## + + +# for calculating the column width when printing mem and zram +mem_len = len(str(round(mem_total / 1024.0))) + +if post_action_gui_notifications: + notify_sig_dict = {SIGKILL: 'Killing', + SIGTERM: 'Terminating'} + + +# convert rates from MiB/s to KiB/s +fill_rate_mem = fill_rate_mem * 1024 +fill_rate_swap = fill_rate_swap * 1024 +fill_rate_zram = fill_rate_zram * 1024 + + +warn_time_now = 0 +warn_time_delta = 1000 +warn_timer = 0 + + +########################################################################## + + +if not root: + log('WARNING: 
effective UID != 0; euid={}; processes with other e' + 'uids will be invisible for nohang'.format(self_uid)) + + +# Try to lock all memory + +mlockall() + +########################################################################## + + +# print_self_rss() + +psi_avg_string = '' # will be overwritten if PSI monitoring enabled + +mem_used_zram = 0 + + +if print_mem_check_results: + + # to find delta mem + wt2 = 0 + new_mem = 0 + + # init mem report interval + report0 = 0 + + +# handle signals +for i in sig_list: + signal(i, signal_handler) + + +x0 = time() +delta0 = 0 + + +threshold = None +mem_info = None + + +CHECK_PSI = False +if psi_support and not ignore_psi: + CHECK_PSI = True + +psi_kill_exceeded_timer = 0 +psi_term_exceeded_timer = 0 +psi_t0 = time() +psi_threshold = zram_threshold = zram_info = psi_info = None + + +CHECK_ZRAM = not ignore_zram + +log('Monitoring has started!') + +stdout.flush() + + +########################################################################## + + +while True: + + (masf_threshold, masf_info, mem_available, hard_threshold_min_swap_kb, + soft_threshold_min_swap_kb, swap_free, swap_total) = check_mem_swap_ex() + + if CHECK_ZRAM: + zram_threshold, zram_info, mem_used_zram = check_zram_ex() + + if CHECK_PSI: + (psi_threshold, psi_info, psi_t0, psi_kill_exceeded_timer, + psi_term_exceeded_timer, x0) = check_psi_ex( + psi_t0, psi_kill_exceeded_timer, psi_term_exceeded_timer, x0) + + if print_mem_check_results: + + if CHECK_PSI: + psi_avg_value = find_psi_metrics_value(psi_path, psi_metrics) + if time() - psi_t0 >= psi_post_action_delay: + psi_post_action_delay_exceeded = True + else: + psi_post_action_delay_exceeded = False + + if print_mem_check_results: + psi_avg_string = 'PSI avg: {} | '.format( + str(psi_avg_value).rjust(6)) + + wt1 = time() + + delta = (mem_available + swap_free) - new_mem + + t_cycle = wt1 - wt2 + + report_delta = wt1 - report0 + + if report_delta >= min_mem_report_interval: + + mem_report = True + new_mem = 
mem_available + swap_free + + report0 = wt1 + + else: + mem_report = False + + wt2 = time() + + if mem_report: + + speed = delta / 1024.0 / report_delta + speed_info = ' | dMem: {} M/s'.format( + str(round(speed)).rjust(5) + ) + + # Calculate 'swap-column' width + swap_len = len(str(round(swap_total / 1024.0))) + + # Output available mem sizes + if swap_total == 0 and mem_used_zram == 0: + log('{}MemAvail: {} M, {} %{}'.format( + psi_avg_string, + human(mem_available, mem_len), + just_percent_mem(mem_available / mem_total), + speed_info + ) + ) + + elif swap_total > 0 and mem_used_zram == 0: + log('{}MemAvail: {} M, {} % | SwapFree: {} M, {} %{}'.format( + psi_avg_string, + human(mem_available, mem_len), + just_percent_mem(mem_available / mem_total), + human(swap_free, swap_len), + just_percent_swap(swap_free / (swap_total + 0.1)), + speed_info + ) + ) + + else: + log('{}MemAvail: {} M, {} % | SwapFree: {} M, {} % | Mem' + 'UsedZram: {} M, {} %{}'.format( + psi_avg_string, + human(mem_available, mem_len), + just_percent_mem(mem_available / mem_total), + human(swap_free, swap_len), + just_percent_swap(swap_free / (swap_total + 0.1)), + human(mem_used_zram, mem_len), + just_percent_mem(mem_used_zram / mem_total), + speed_info + ) + ) + + if (masf_threshold is SIGKILL or zram_threshold is SIGKILL or + psi_threshold is SIGKILL): + + threshold = SIGKILL + mem_info_list = [] + + if masf_info is not None: + mem_info_list.append(masf_info) + + if zram_info is not None: + mem_info_list.append(zram_info) + + if psi_info is not None: + mem_info_list.append(psi_info) + + psi_t0 = implement_corrective_action( + threshold, + mem_info_list, + psi_t0, + psi_kill_exceeded_timer, + psi_term_exceeded_timer, + x0, psi_threshold, zram_threshold, zram_info, psi_info) + continue + + if (masf_threshold is SIGTERM or zram_threshold is SIGTERM or + psi_threshold is SIGTERM): + + threshold = SIGTERM + mem_info_list = [] + + if masf_info is not None: + mem_info_list.append(masf_info) + + if 
zram_info is not None: + mem_info_list.append(zram_info) + + if psi_info is not None: + mem_info_list.append(psi_info) + + psi_t0 = implement_corrective_action( + threshold, + mem_info_list, + psi_t0, + psi_kill_exceeded_timer, + psi_term_exceeded_timer, + x0, psi_threshold, zram_threshold, zram_info, psi_info) + continue + + if low_memory_warnings_enabled: + + if (masf_threshold == 'WARN' or zram_threshold == 'WARN' or + psi_threshold == 'WARN'): + + warn_time_delta = time() - warn_time_now + warn_time_now = time() + warn_timer += warn_time_delta + if warn_timer > min_post_warning_delay: + + send_notify_warn() + + warn_timer = 0 + + sleep_after_check_mem() diff --git a/old/nohang.conf b/old/nohang.conf new file mode 100644 index 0000000..1b13348 --- /dev/null +++ b/old/nohang.conf @@ -0,0 +1,359 @@ + This is nohang config file. + Lines starting with #, tabs and spaces are comments. + Lines starting with @ contain optional parameters. + All values are case sensitive. + Be careful: nohang doesn't forbid you to shoot yourself in the foot. + + The configuration includes the following sections: + + 0. Common zram settings + 1. Memory levels to respond to as an OOM threat + 2. Response on PSI memory metrics + 3. The frequency of checking the level of available memory + (and CPU usage) + 4. The prevention of killing innocent victims + 5. Impact on the badness of processes via matching their names, cgroups and + cmdlines with specified regular expressions + 6. Customize corrective actions: the execution of a specific command + instead of sending the SIGTERM signal + 7. GUI notifications: + - low memory warnings + - OOM prevention results + 8. Output verbosity + 9. Misc + + Just read the description of the parameters and edit the values. + Please restart the program after editing the config. + + More docs will be written later. + +############################################################################### + + 0. 
Common zram settings + + See https://www.kernel.org/doc/Documentation/blockdev/zram.txt + You may need to set `zram_checking_enabled = True` if you have a large zram disksize. + +zram_checking_enabled = False + +############################################################################### + + 1. Thresholds below which a signal should be sent to the victim + + Sets the available memory levels at or below which SIGTERM or SIGKILL + signals are sent. The signal is sent when MemAvailable and + SwapFree (in /proc/meminfo) are both below the + corresponding values. Can be specified in % (percent) and M (MiB). + Valid values are floating-point numbers from the range [0; 100] %. + + MemAvailable levels. + +soft_threshold_min_mem = 8 % +hard_threshold_min_mem = 4 % + + SwapFree levels. + +soft_threshold_min_swap = 10 % +hard_threshold_min_swap = 5 % + + Specifies the total share of zram in memory; if it is exceeded, the + corresponding signals are sent. As the share of zram in memory + increases, system responsiveness may degrade. 90 % is a + usual hang level; setting very high values is not recommended. + + Can be specified in % and M. Valid values are floating-point + numbers from the range [0; 90] %. + +soft_threshold_max_zram = 60 % +hard_threshold_max_zram = 65 % + + +############################################################################### + + 2. Response on PSI memory metrics (it needs Linux 4.20 and up) + + About PSI: + https://facebookmicrosites.github.io/psi/ + + Disabled by default (psi_checking_enabled = False). + +psi_checking_enabled = False + + Choose the path to the PSI file. + By default the system-wide file is monitored: /proc/pressure/memory + You can also set a file to monitor a single cgroup slice.
+ For example: + psi_path = /sys/fs/cgroup/unified/user.slice/memory.pressure + psi_path = /sys/fs/cgroup/unified/system.slice/memory.pressure + psi_path = /sys/fs/cgroup/unified/system.slice/foo.service/memory.pressure + + Execute the command + find /sys/fs/cgroup -name memory.pressure + to find available memory.pressure files (except /proc/pressure/memory). + (relevant for cgroup2) + +psi_path = /proc/pressure/memory + + Valid psi_metrics are: + some_avg10 + some_avg60 + some_avg300 + full_avg10 + full_avg60 + full_avg300 + + some_avg10 is the most sensitive. + +psi_metrics = some_avg10 + +soft_threshold_max_psi = 60 + +hard_threshold_max_psi = 90 + + >= 0, float +psi_excess_duration = 60 + +psi_post_action_delay = 60 + + +############################################################################### + + 3. The frequency of checking the amount of available memory + (and CPU usage) + + Coefficients that affect the intensity of monitoring. Reducing + the coefficients can reduce CPU usage and increase the periods + between memory checks. + + Why three coefficients instead of one? Because the swap fill rate + is usually lower than the RAM fill rate. + + It is possible to set a lower intensity of monitoring for swap + without compromising OOM prevention, and thus reduce the CPU load. + + Default values work well for desktops. On servers without rapid + fluctuations in memory levels the values can be reduced. + + Valid values are positive floating-point numbers. + +fill_rate_mem = 4000 +fill_rate_swap = 1500 +fill_rate_zram = 6000 + + See also https://github.com/rfjakob/earlyoom/issues/61 + +max_sleep = 3 +min_sleep = 0.1 + + Sleep time if the soft threshold is exceeded. + +over_sleep = 0.05 + +############################################################################### + + 4. The prevention of killing innocent victims + + Valid values are integers from the range [0; 1000]. + +min_badness = 10 + + Valid values are non-negative floating-point numbers. 
+ Min delay if a victim doesn't respond to SIGTERM in 10 ms. + +post_soft_action_delay = 3 + +post_zombie_delay = 0.1 + +victim_cache_time = 10 + + Valid values are True and False. + +ignore_positive_oom_score_adj = False + +############################################################################### + + 5. Impact on the badness of processes via matching their names, + cmdlines or UIDs with regular expressions using re.search(). + + See https://en.wikipedia.org/wiki/Regular_expression and + https://en.wikipedia.org/wiki/Perl_Compatible_Regular_Expressions + + Enabling these options slows down the search for the victim + because the names, cmdlines or UIDs of all processes + (except init and kthreads) are compared with the + specified regex patterns (in fact the slowdown is caused by + reading all /proc/*/cmdline and /proc/*/status files). + + Use the script `oom-sort` from the nohang package to view + names, cmdlines and UIDs of processes. + + 5.1. Matching process names with RE patterns + + Syntax: + + @BADNESS_ADJ_RE_NAME badness_adj /// RE_pattern + + New badness value will be += badness_adj + + It is possible to compare multiple patterns + with different badness_adj values. + + Example: + @BADNESS_ADJ_RE_NAME -500 /// ^sshd$ + + 5.2. Matching CGroup_v1-line with RE patterns + + @BADNESS_ADJ_RE_CGROUP_V1 -100 /// ^/system\.slice/ + + @BADNESS_ADJ_RE_CGROUP_V1 50 /// /foo\.service$ + + @BADNESS_ADJ_RE_CGROUP_V1 -50 /// ^/user\.slice/ + + 5.3. Matching CGroup_v2-line with RE patterns + + @BADNESS_ADJ_RE_CGROUP_V2 100 /// ^/workload + + 5.4. Matching eUIDs with RE patterns + + @BADNESS_ADJ_RE_UID -100 /// ^0$ + + 5.5. Matching realpath with RE patterns + + @BADNESS_ADJ_RE_REALPATH 20 /// ^/usr/bin/foo + + 5.6. Matching cmdlines with RE patterns + + A good option that allows fine adjustment. 
+ + Prefer chromium tabs and electron-based apps + @BADNESS_ADJ_RE_CMDLINE 200 /// --type=renderer + + Prefer firefox tabs (Web Content and WebExtensions) + @BADNESS_ADJ_RE_CMDLINE 300 /// -appomni + + @BADNESS_ADJ_RE_CMDLINE -200 /// ^/usr/lib/virtualbox + + 5.7. Matching environ with RE patterns + + @BADNESS_ADJ_RE_ENVIRON 100 /// USER=user + + Note that you can also control badness in systemd units via + OOMScoreAdjust, see + www.freedesktop.org/software/systemd/man/systemd.exec.html#OOMScoreAdjust= + +############################################################################### + + 6. Customize corrective actions. + + TODO: docs + + Syntax: + KEY REGEXP SEPARATOR COMMAND + + @SOFT_ACTION_RE_NAME ^foo$ /// kill -SEGV $PID + @SOFT_ACTION_RE_NAME ^bash$ /// kill -9 $PID + + @SOFT_ACTION_RE_CGROUP_V1 ^/system\.slice/ /// systemctl restart $SERVICE + @SOFT_ACTION_RE_CGROUP_V1 /foo\.service$ /// systemctl restart $SERVICE + + $PID will be replaced by the process PID. + $NAME will be replaced by the process name. + $SERVICE will be replaced by .service if it exists (otherwise it will be + replaced by an empty line) + +############################################################################### + + 7. GUI notifications & low memory warnings + +post_action_gui_notifications = False + + Enable GUI notifications about the low level of available memory. + Valid values are True and False. + +low_memory_warnings_enabled = False + + Execute the command instead of sending GUI notifications if the value is + not an empty line. For example: + warning_exe = cat /proc/meminfo & + +warning_exe = + + Can be specified in % (percent) and M (MiB). + Valid values are floating-point numbers from the range [0; 100] %. + +warning_threshold_min_mem = 20 % + +warning_threshold_min_swap = 25 % + +warning_threshold_max_zram = 50 % + +warning_threshold_max_psi = 100 + + Valid values are floating-point numbers from the range [1; 300]. 
+ +min_post_warning_delay = 20 + + Ampersands (&) will be replaced with asterisks (*) in process + names and in commands. + +############################################################################### + + 8. Verbosity + + Display the configuration when the program starts. + Valid values are True and False. + +print_config_at_startup = False + + Print memory check results. + Valid values are True and False. + +print_mem_check_results = False + +min_mem_report_interval = 60 + +print_proc_table = False + + Valid values: + None + cgroup_v1 + cgroup_v2 + realpath + cmdline + environ + +extra_table_info = None + +print_victim_status = True + +max_victim_ancestry_depth = 3 + +print_victim_cmdline = False + +print_statistics = True + + Print sleep periods between memory checks. + Valid values are True and False. + +debug_psi = False + +debug_gui_notifications = False + +debug_sleep = False + +separate_log = False + +############################################################################### + + 9. Misc + +max_soft_exit_time = 10 + +post_kill_exe = + +forbid_negative_badness = True + +############################################################################### + + Use cases, feature requests and any questions are welcome: + https://github.com/hakavlad/nohang/issues diff --git a/nohang_notify_helper b/old/nohang_notify_helper similarity index 100% rename from nohang_notify_helper rename to old/nohang_notify_helper diff --git a/test.conf b/test.conf index 28f837f..dc85b2b 100644 --- a/test.conf +++ b/test.conf @@ -291,6 +291,8 @@ warning_threshold_max_psi = 100 min_post_warning_delay = 20 +env_cache_time = 300 + Ampersands (&) will be replaced with asterisks (*) in process names and in commands. @@ -341,6 +343,9 @@ debug_sleep = True separate_log = True +debug_threading = True + + ############################################################################### 9. Misc