diff --git a/README.md b/README.md index 86db404..ceceaa8 100644 --- a/README.md +++ b/README.md @@ -176,7 +176,7 @@ See also `man journalctl`. ## Known problems - Awful documentation -- Slowly starting, slowly looking for a victim, especially when using swapspace +- Slowly starting, slowly looking for a victim, especially when using swapspace (although this should be enough for more than 95% of all cases, IMHO) - It is written in an interpreted language and is actually a prototype ## Contribution @@ -194,8 +194,11 @@ Please create [issues](https://github.com/hakavlad/nohang/issues). Use cases, fe - Display `UID`, `oom_score`, `oom_score_adj`, `VmSize`, `RssAnon`, `RssFile`, `RssShmem` and `cmdline` of the victim in corrective action reports - Print in terminal with colors - Print statistics on corrective actions after each corrective action - - Optimize limiting `oom_score_adj`: now it can works without UID=0 - - Optimize GUI warnings: find env without run `ps` and `env` + - Improve poll rate algorithm + - Improve limiting `oom_score_adj`: now it can works without UID=0 + - Improve GUI warnings: + - Find env without run `ps` and `env` + - Handle all timeouts when notify-send starts - Fix conf parsing: use of `line.partition('=')` instead of `line.split('=')` - Add `PSI` support (using `/proc/pressure/memory`, need Linux 4.20+) - Add `oom-sort` diff --git a/nohang b/nohang index a2221aa..fb74e67 100755 --- a/nohang +++ b/nohang @@ -21,10 +21,15 @@ self_pid = str(os.getpid()) self_uid = os.geteuid() if self_uid == 0: root = True +else: + root = False -wait_time = 2 -cache_time = 30 -cache_path = '/dev/shm/nohang_env_cache' +wait_time = 14 + +max_sleep_time = 2 +min_sleep_time = 0.1 + +notify_helper_path = '/usr/bin/nohang_notify_helper' psi_path = '/proc/pressure/memory' psi_support = os.path.exists(psi_path) @@ -119,79 +124,6 @@ def format_time(t): return '{} h {} min {} sec'.format(h, m, s) -def re_pid_environ(pid): - """ - read environ of 1 process - returns tuple with USER, DBUS, DISPLAY like follow: - ('user', 'DISPLAY=:0', - 'DBUS_SESSION_BUS_ADDRESS=unix:path=/run/user/1000/bus') - returns None if these vars is not in /proc/[pid]/environ - """ - display_env = 'DISPLAY=' - dbus_env = 'DBUS_SESSION_BUS_ADDRESS=' - user_env = 'USER=' - try: - env = str(rline1('/proc/' + pid + '/environ')) - if display_env in env and dbus_env in env and user_env in env: - env_list = env.split('\x00') - - # iterating over a list of process environment variables - for i in env_list: - if i.startswith(user_env): - user = i - continue - - if i.startswith(display_env): - display = i[:10] - continue - - if i.startswith(dbus_env): - #if ',guid=' in i: - # return None - dbus = i - continue - - if i.startswith('HOME='): - # exclude Display Manager's user - if i.startswith('HOME=/var'): - return None - - env = user.partition('USER=')[2], display, dbus - return env - - except FileNotFoundError: - return None - except ProcessLookupError: - return None - - -def root_notify_env(): - """return set(user, display, dbus)""" - unsorted_envs_list = [] - # iterates over processes, find processes with suitable env - for pid in os.listdir('/proc'): - if pid[0].isdecimal() is False: - continue - one_env = re_pid_environ(pid) - unsorted_envs_list.append(one_env) - env = set(unsorted_envs_list) - env.discard(None) - - new_env = [] - end = [] - for i in env: - #print(i) - key = i[0] + i[1] - #print(key) - if key not in end: - end.append(key) - new_env.append(i) - else: - continue - #print(new_env) - return new_env - - def string_to_float_convert_test(string): """Try to interprete string values as floats.""" try: @@ -351,9 +283,28 @@ def pid_to_uid(pid): return line.split('\t')[1] +def notify_send_wait(title, body): + with Popen(['notify-send', '--icon=dialog-warning', title, body]) as proc: + try: + proc.wait(timeout=wait_time) + except TimeoutExpired: + proc.kill() + print('TimeoutExpired: notify-send {} {}'.format(title, body)) + + +def notify_helper(title, body): + with Popen([notify_helper_path, title, body]) as proc: + try: + proc.wait(timeout=wait_time) + except TimeoutExpired: + proc.kill() + print('TimeoutExpired: nohang_notify_helper {} {}'.format(title, body)) + + def send_notify_warn(): """ Look for process with maximum 'badness' and warn user with notification. + (implement Low memory warnings) """ # find process with max badness fat_tuple = fattest() @@ -378,23 +329,10 @@ def send_notify_warn(): if root: # If nohang was started by root # send notification to all active users with special script - - # теперь можно напрямую уведомлять из кэша если он не устарел - - Popen([ - '/usr/bin/nohang_notify_low_mem', - '--mem', low_mem_percent, - '--pid', pid, - '--name', name - ]) - + notify_helper(title, body) else: # Or by regular user # send notification to user that runs this nohang - try: - Popen(['notify-send', '--icon=dialog-warning', - '{}'.format(title), '{}'.format(body)]).wait(wait_time) - except TimeoutExpired: - print('TimeoutExpired: ' + 'notify low mem') + notify_send_wait(title, body) def send_notify(signal, name, pid): @@ -413,26 +351,10 @@ def send_notify(signal, name, pid): '&', '*')) if root: # send notification to all active users with notify-send - b = root_notify_env() - if len(b) > 0: - for i in b: - username, display_env, dbus_env = i[0], i[1], i[2] - #if '1000' in dbus_env: - # continue - #print(username, display_env, dbus_env) - try: - Popen(['sudo', '-u', username, 'env', display_env, - dbus_env, 'notify-send', '--icon=dialog-warning', - '{}'.format(title), '{}'.format(body)]).wait(wait_time) - except TimeoutExpired: - print('TimeoutExpired: ' + 'notify send signal') + notify_helper(title, body) else: # send notification to user that runs this nohang - try: - Popen(['notify-send', '--icon=dialog-warning', - '{}'.format(title), '{}'.format(body)]).wait(wait_time) - except TimeoutExpired: - print('TimeoutExpired: ' + 'notify send signal') + notify_send_wait(title, body) def send_notify_etc(pid, name, command): @@ -448,20 +370,10 @@ def send_notify_etc(pid, name, command): pid, name.replace('&', '*'), command.replace('&', '*')) if root: # send notification to all active users with notify-send - b = root_notify_env() - if len(b) > 0: - for i in b: - username, display_env, dbus_env = i[0], i[1], i[2] - try: - Popen(['sudo', '-u', username, 'env', display_env, - dbus_env, 'notify-send', '--icon=dialog-warning', - '{}'.format(title), '{}'.format(body)]).wait(wait_time) - except TimeoutExpired: - print('TimeoutExpired: notify run command') + notify_send_wait(title, body) else: # send notification to user that runs this nohang - Popen(['notify-send', '--icon=dialog-warning', '{}'.format(title), '{}' - .format(body)]) + notify_send_wait(title, body) def sleep_after_send_signal(signal): @@ -789,9 +701,20 @@ def find_victim_and_send_signal(signal): def sleep_after_check_mem(): """Specify sleep times depends on rates and avialable memory.""" - t_mem = mem_available / rate_mem - t_swap = swap_free / rate_swap - t_zram = (mem_total - mem_used_zram) / rate_zram + + if mem_min_sigkill_kb < mem_min_sigterm_kb: + mem_point = mem_available - mem_min_sigterm_kb + else: + mem_point = mem_available - mem_min_sigkill_kb + + if swap_min_sigkill_kb < swap_min_sigterm_kb: + swap_point = swap_free - swap_min_sigterm_kb + else: + swap_point = swap_free - swap_min_sigkill_kb + + t_mem = mem_point / rate_mem + t_swap = swap_point / rate_swap + t_zram = (mem_total * 0.9 - mem_used_zram) / rate_zram t_mem_swap = t_mem + t_swap t_mem_zram = t_mem + t_zram @@ -801,17 +724,20 @@ def sleep_after_check_mem(): else: t = t_mem_zram - max_sleep_time = 1 if t > max_sleep_time: - t = 1 + t = max_sleep_time + elif t < min_sleep_time: + t = min_sleep_time + else: + pass try: if print_sleep_periods: - print('sleep', round(t, 2), - ' (t_mem={}, t_swap={}, t_zram={})'.format( - round(t_mem, 2), - round(t_swap, 2), - round(t_zram, 2))) + print('sleep', round(t, 2)) + # ' (t_mem={}, t_swap={}, t_zram={})'.format( + #round(t_mem, 2), + #round(t_swap, 2), + #round(t_zram, 2))) stdout.flush() sleep(t) except KeyboardInterrupt: @@ -1425,68 +1351,10 @@ print('Startup time:', print('Monitoring started!') - -def save_env_cache(): - z = '{}\n'.format(int(time())) - a = root_notify_env() - # print(a) - for i in a: - z = z + '{}\x00{}\x00{}\n'.format(i[0], i[1], i[2]) - write(cache_path, z) - os.chmod(cache_path, 0000) - return a - - -def read_env_cache(): - x, y = [], [] - try: - with open(cache_path) as f: - for n, line in enumerate(f): - if n is 0: - t = line[:-1] - y.append(t) - continue - if n > 0: - x.append(line[:-1].split('\x00')) - except FileNotFoundError: - return None - y.append(x) - return y - - -def root_env_cache(): - cache = read_env_cache() - if cache is None: - print('cache not found, get new env and cache it') - return save_env_cache() - delta_t = time() - int(cache[0]) - if delta_t > cache_time: - print('cache time: {}, delta: {}, ' - 'get new env and cache it'.format( - cache_time, round(delta_t))) - save_env_cache() - return root_notify_env() - else: - print('cache time: {}, delta: {}, ' - 'get cached env'.format( - cache_time, round(delta_t))) - return cache[1] - - -t1 = time() -# root_env_cache() -t2 = time() -# print(t2 - t1) - - stdout.flush() -# exit() - - sigterm_psi = sigterm_psi_avg10 sigkill_psi = sigkill_psi_avg10 -# avg_min_time = 4 psi_min_sleep_time_after_action = psi_avg10_sleep_time @@ -1494,12 +1362,9 @@ psi_min_sleep_time_after_action = psi_avg10_sleep_time if psi_support and not ignore_psi: - # ta0 = time() - # a0 = psi_mem_some_avg_total() kill_psi_t0 = time() + psi_avg10_sleep_time term_psi_t0 = time() + psi_avg10_sleep_time - avg_value = '' while True: diff --git a/nohang.conf b/nohang.conf index 8fdf791..be454b2 100644 --- a/nohang.conf +++ b/nohang.conf @@ -56,7 +56,7 @@ swap_min_sigkill = 5 % usual hang level, not recommended to set very high. Can be specified in % and M. Valid values are floating-point - numbers from the range [0; 100] %. + numbers from the range [0; 90] %. zram_max_sigterm = 50 % zram_max_sigkill = 55 % @@ -93,8 +93,8 @@ psi_avg10_sleep_time = 60 Valid values are positive floating-point numbers. -rate_mem = 6 -rate_swap = 3 +rate_mem = 4 +rate_swap = 2 rate_zram = 1 See also https://github.com/rfjakob/earlyoom/issues/61 @@ -261,7 +261,7 @@ gui_notifications = True Enable GUI notifications about the low level of available memory. Valid values are True and False. -gui_low_memory_warnings = True +gui_low_memory_warnings = False Минимальное время между отправками уведомлений в секундах. Valid values are floating-point numbers from the range [1; 300]. @@ -304,5 +304,5 @@ print_mem_check_results = True Print sleep periods between memory checks. Valid values are True and False. -print_sleep_periods = False +print_sleep_periods = True diff --git a/nohang_notify_low_mem b/nohang_notify_helper similarity index 64% rename from nohang_notify_low_mem rename to nohang_notify_helper index 72fdfa0..b40e381 100755 --- a/nohang_notify_low_mem +++ b/nohang_notify_helper @@ -1,53 +1,17 @@ #!/usr/bin/env python3 +# +# Usage: +# ./nohang_notify_helper "title" "body" -# nohang_notify_low_mem --mem '14% 12%' --name 'stress' --pid '6666' - -# need UID=0 - -# output: -# Low memory: 14% 12% -# Fattest process: 6666, stress - -# need to remove this slow and fat parser -from argparse import ArgumentParser - +from sys import argv from os import listdir from subprocess import Popen, TimeoutExpired -wait_time = 10 +if len(argv) < 2 or argv[1] == "-h" or argv[1] == "--help": + print('Usage: ./nohang_notify_helper "title" "body"') + exit(1) -parser = ArgumentParser() - -parser.add_argument( - '--mem', - help="""available memory percent (15%, for example)""", - default=None, - type=str -) - -parser.add_argument( - '--pid', - help="""pid""", - default=None, - type=str -) - -parser.add_argument( - '--name', - help="""process name""", - default=None, - type=str -) - -args = parser.parse_args() - -pid = args.pid -name = args.name -mem = args.mem - -title = 'Low memory: {}'.format(mem) - -body = 'Fattest process: {}, {}'.format(pid, name) +wait_time = 12 display_env = 'DISPLAY=' dbus_env = 'DBUS_SESSION_BUS_ADDRESS=' @@ -69,9 +33,6 @@ def re_pid_environ(pid): 'DBUS_SESSION_BUS_ADDRESS=unix:path=/run/user/1000/bus') returns None if these vars is not in /proc/[pid]/environ """ - display_env = 'DISPLAY=' - dbus_env = 'DBUS_SESSION_BUS_ADDRESS=' - user_env = 'USER=' try: env = str(rline1('/proc/' + pid + '/environ')) if display_env in env and dbus_env in env and user_env in env: @@ -88,8 +49,6 @@ def re_pid_environ(pid): continue if i.startswith(dbus_env): - #if ',guid=' in i: - # return None dbus = i continue @@ -119,27 +78,27 @@ def root_notify_env(): env = set(unsorted_envs_list) env.discard(None) + # deduplicate dbus new_env = [] end = [] for i in env: - #print(i) key = i[0] + i[1] - #print(key) if key not in end: end.append(key) new_env.append(i) else: continue - #print(new_env) + return new_env -b = root_notify_env() +list_with_envs = root_notify_env() + # if somebody logged in with GUI -if len(b) > 0: +if len(list_with_envs) > 0: # iterating over logged-in users - for i in b: + for i in list_with_envs: username, display_env, dbus_env = i[0], i[1], i[2] display_tuple = display_env.partition('=') dbus_tuple = dbus_env.partition('=') @@ -147,16 +106,16 @@ if len(b) > 0: dbus_key, dbus_value = dbus_tuple[0], dbus_tuple[2] with Popen(['sudo', '-u', username, - 'notify-send', '--icon=dialog-warning', - '{}'.format(title), '{}'.format(body) - ], env={ + 'notify-send', '--icon=dialog-warning', + argv[1], argv[2] + ], env={ display_key: display_value, dbus_key: dbus_value - }) as proc: - try: - proc.wait(timeout=wait_time) - except TimeoutExpired: - proc.kill() - print('TimeoutExpired: notify' + username) + }) as proc: + try: + proc.wait(timeout=wait_time) + except TimeoutExpired: + proc.kill() + print('TimeoutExpired: notify' + username) else: - print('Low memory warnings: nobody logged in with GUI. Nothing to do.') + print('Nobody logged-in with GUI. Nothing to do.')