From a6171e85b8403272fa8a0a0ff667eb2d55a2f1ba Mon Sep 17 00:00:00 2001 From: Alexey Avramov Date: Wed, 20 Mar 2019 17:47:14 +0900 Subject: [PATCH] fix logging; pep8 validation; add forbid_negative_badness and log_dir options --- Makefile | 16 ++- nohang | 284 +++++++++++++++++++++++++++++++--------------------- nohang.conf | 80 ++++++++++----- psi-monitor | 4 - 4 files changed, 232 insertions(+), 152 deletions(-) diff --git a/Makefile b/Makefile index d388da8..f77b5c4 100644 --- a/Makefile +++ b/Makefile @@ -4,29 +4,27 @@ PREFIX = / all: @ echo "Nothing to compile. Use: make install, make uninstall, make systemd" -install: +install: install -d $(DESTDIR)/$(PREFIX)/usr/sbin install -m0755 ./nohang $(DESTDIR)/$(PREFIX)/usr/sbin/nohang install -m0755 ./nohang_notify_helper $(DESTDIR)/$(PREFIX)/usr/sbin/nohang_notify_helper - + install -d $(DESTDIR)/$(PREFIX)/usr/bin install -m0755 ./oom-sort $(DESTDIR)/$(PREFIX)/usr/bin/oom-sort install -m0755 ./oom-trigger $(DESTDIR)/$(PREFIX)/usr/bin/oom-trigger - + install -d $(DESTDIR)/$(PREFIX)/etc/nohang install -m0644 ./nohang.conf $(DESTDIR)/$(PREFIX)/etc/nohang/$(VERSION) install -m0644 ./nohang.conf $(DESTDIR)/$(PREFIX)/etc/nohang/nohang.conf.default - - install -d $(DESTDIR)/$(PREFIX)/var/log/nohang - + install -d $(DESTDIR)/$(PREFIX)/usr/share/man/man1 gzip -k -c nohang.1 > $(DESTDIR)/$(PREFIX)/usr/share/man/man1/nohang.1.gz gzip -k -c oom-sort.1 > $(DESTDIR)/$(PREFIX)/usr/share/man/man1/oom-sort.1.gz gzip -k -c oom-trigger.1 > $(DESTDIR)/$(PREFIX)/usr/share/man/man1/oom-trigger.1.gz - + install -d $(DESTDIR)/$(PREFIX)/lib/systemd/system install -m0644 ./nohang.service $(DESTDIR)/$(PREFIX)/lib/systemd/system/nohang.service - + uninstall: # 'make uninstall' must not fail with error if systemctl is unavailable or returns error systemctl disable nohang.service || true @@ -40,7 +38,7 @@ uninstall: rm -fv $(PREFIX)/lib/systemd/system/nohang.service rm -fvr $(PREFIX)/etc/nohang/ rm -fvr $(PREFIX)/var/log/nohang/ - + systemd: 
systemctl daemon-reload systemctl enable nohang.service diff --git a/nohang b/nohang index b3984d9..bb85eda 100755 --- a/nohang +++ b/nohang @@ -9,32 +9,10 @@ from sys import stdout, stderr, argv, exit from signal import SIGKILL, SIGTERM import sys -import logging -from logging import basicConfig -from logging import info - start_time = time() -logfile = '/var/log/nohang/nohang.log' - - -basicConfig(filename=logfile, - level=logging.INFO, - format="%(asctime)s: %(message)s") - - -separate_log = False - - -def log(msg): - print(msg) - if separate_log: - info(msg) - - - help_mess = """usage: nohang [-h] [-c CONFIG] optional arguments: @@ -65,7 +43,6 @@ wait_time = 10 notify_helper_path = '/usr/sbin/nohang_notify_helper' - victim_dict = dict() @@ -80,8 +57,19 @@ stat_dict = dict() # define functions +def log(*msg): + """ + """ + print(*msg) + if separate_log: + info(*msg) + + def print_version(): - # сначала пытаться получ версию прямо из гита - вариант для неустановленых + """ + сначала пытаться получ версию прямо из гита - вариант для неустановленых, + для тех, кто еще не запускал make install + """ try: v = rline1('/etc/nohang/version') except FileNotFoundError: @@ -94,6 +82,8 @@ def print_version(): def test(): + """ + """ print(sys.version) print(sys.argv) @@ -155,11 +145,14 @@ def test(): def uptime(): + """ + """ return float(rline1('/proc/uptime').split(' ')[0]) def pid_to_starttime(pid): - + """ + """ try: starttime = rline1('/proc/' + pid + '/stat').rpartition(')')[ 2].split(' ')[20] @@ -180,6 +173,8 @@ def get_victim_id(pid): def errprint(*text): + """ + """ print(*text, file=stderr, flush=True) @@ -200,19 +195,22 @@ def mlockall(): MCL_CURRENT | MCL_FUTURE ) if result != 0: - print('Cannot lock all memory') + log('Cannot lock all memory') else: - print('All memory locked with MCL_CURRENT | MCL_FUTURE') + log('All memory locked with MCL_CURRENT | MCL_FUTURE') else: - print('All memory locked with MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT') + log('All memory 
locked with MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT') def pid_to_state(pid): + """ + """ return rline1('/proc/' + pid + '/stat').rpartition(')')[2][1] def update_stat_dict_and_print(key): - + """ + """ if key not in stat_dict: stat_dict.update({key: 1}) @@ -232,24 +230,10 @@ def update_stat_dict_and_print(key): print(stats_msg) -''' -def psi_mem_some_avg_total(): - if psi_support: - return float(rline1(psi_path).rpartition('=')[2]) -''' - -''' -def psi_mem_some_avg10(): - if psi_support: - return float(rline1(psi_path).split(' ')[1].split('=')[1]) -''' - - - - -# psi_metrics = 'some_avg10' def find_psi_metrics_value(psi_path, psi_metrics): + """ + """ if psi_support: @@ -309,9 +293,11 @@ def check_zram(): # Means that when setting zram disksize = 1 GiB available memory # decrease by 0.0042 GiB. - # Found experimentally, requires clarification with different kernaels and architectures. + # Found experimentally, requires clarification with different kernels and + # architectures. # On small disk drives (up to gigabyte) it can be more, up to 0.0045. - # The creator of the zram module claims that ZRAM_DISKSIZE_FACTOR should be 0.001: + # The creator of the zram module claims that ZRAM_DISKSIZE_FACTOR should + # be 0.001: # ("zram uses about 0.1% of the size of the disk" # - https://www.kernel.org/doc/Documentation/blockdev/zram.txt), # but this statement contradicts the experimental data. 
@@ -323,6 +309,8 @@ def check_zram(): def format_time(t): + """ + """ t = int(t) if t < 60: return '{} sec'.format(t) @@ -406,15 +394,6 @@ def rline1(path): 'utf-8', 'ignore').split('\n')[0] - - - - - - - - - def kib_to_mib(num): """Convert KiB values to MiB values.""" return round(num / 1024.0) @@ -431,6 +410,8 @@ def just_percent_mem(num): def just_percent_swap(num): + """ + """ return str(round(num * 100, 1)).rjust(5, ' ') @@ -488,6 +469,8 @@ def pid_to_name(pid): def pid_to_ppid(pid): + """ + """ try: with open('/proc/' + pid + '/status') as f: for n, line in enumerate(f): @@ -506,6 +489,8 @@ def pid_to_ppid(pid): def pid_to_ancestry(pid, max_ancestry_depth=1): + """ + """ if max_ancestry_depth == 1: ppid = pid_to_ppid(pid) pname = pid_to_name(ppid) @@ -545,7 +530,7 @@ def pid_to_realpath(pid): def pid_to_uid(pid): - '''return euid''' + """return euid""" try: with open('/proc/' + pid + '/status') as f: for n, line in enumerate(f): @@ -558,7 +543,7 @@ def pid_to_uid(pid): def notify_send_wait(title, body): - '''GUI notifications with UID != 0''' + """GUI notifications with UID != 0""" with Popen(['notify-send', '--icon=dialog-warning', title, body]) as proc: try: proc.wait(timeout=wait_time) @@ -568,7 +553,7 @@ def notify_send_wait(title, body): def notify_helper(title, body): - '''GUI notification with UID = 0''' + """GUI notification with UID = 0""" with Popen([notify_helper_path, title, body]) as proc: try: @@ -727,6 +712,8 @@ pid_list = get_pid_list() def get_non_decimal_pids(): + """ + """ non_decimal_list = [] for pid in pid_list: if pid[0].isdecimal() is False: @@ -765,6 +752,10 @@ def pid_to_badness(pid): if search(re_tup[1], uid) is not None: badness += int(re_tup[0]) + if forbid_negative_badness: + if badness < 0: + badness = 0 + return badness, oom_score except FileNotFoundError: @@ -796,9 +787,11 @@ def find_victim(): pid_badness_list = [] if print_proc_table: - log('===============================================================================') + 
log('==============================================================' + '=================') log(' PID badness Name eUID cmdline') - log('------- ------- --------------- ---------- ---------------------------------') + log('------- ------- --------------- ---------- -----------' + '----------------------') for pid in pid_list: @@ -834,10 +827,12 @@ def find_victim(): victim_name = pid_to_name(pid) if print_proc_table: - log('===============================================================================') + log('============================================================' + '===================') log( - 'Process with highest badness (found in {} ms):\n PID: {}, Name: {}, badness: {}'.format( + 'Process with highest badness (found in {} ms):\n PID: {}, Na' + 'me: {}, badness: {}'.format( round((time() - ft1) * 1000), pid, victim_name, @@ -849,6 +844,8 @@ def find_victim(): def find_victim_info(pid, victim_badness, name): + """ + """ status0 = time() @@ -1080,8 +1077,9 @@ def implement_corrective_action(signal): m = check_mem_and_swap() ma = round(int(m[0]) / 1024.0) sf = round(int(m[2]) / 1024.0) - log('Memory status before implementing a corrective action:\n MemAvailable' - ': {} MiB, SwapFree: {} MiB'.format(ma, sf)) + log('Memory status before implementing a corrective act' + 'ion:\n MemAvailable' + ': {} MiB, SwapFree: {} MiB'.format(ma, sf)) exit_status = os.system(etc_dict[name].replace( '$PID', pid).replace('$NAME', pid_to_name(pid))) @@ -1093,13 +1091,15 @@ def implement_corrective_action(signal): response_time = time() - time0 - etc_info = 'Implement a corrective action:\n Run the command: {}' \ - '\n Exit status: {}; total response time: {} ms'.format( - command.replace( - '$PID', pid).replace( - '$NAME', pid_to_name(pid)), - exit_status, - round(response_time * 1000)) + etc_info = 'Implement a corrective act' \ + 'ion:\n Run the command: {}' \ + '\n Exit status: {}; total response ' \ + 'time: {} ms'.format( + command.replace( + '$PID', pid).replace( + 
'$NAME', pid_to_name(pid)), + exit_status, + round(response_time * 1000)) print(etc_info) @@ -1110,7 +1110,8 @@ def implement_corrective_action(signal): send_notify_etc( pid, name, - command.replace('$PID', pid).replace('$NAME', pid_to_name(pid))) + command.replace('$PID', pid).replace( + '$NAME', pid_to_name(pid))) else: @@ -1119,8 +1120,9 @@ def implement_corrective_action(signal): m = check_mem_and_swap() ma = round(int(m[0]) / 1024.0) sf = round(int(m[2]) / 1024.0) - log('Memory status before implementing a corrective action:\n MemAvailable' - ': {} MiB, SwapFree: {} MiB'.format(ma, sf)) + log('Memory status before implementing a correct' + 'ive action:\n MemAvailable' + ': {} MiB, SwapFree: {} MiB'.format(ma, sf)) os.kill(int(pid), signal) response_time = time() - time0 @@ -1149,12 +1151,14 @@ def implement_corrective_action(signal): response_time = time() - time0 send_result = 'no such process; response time: {} ms'.format( round(response_time * 1000)) - key = 'FileNotFoundError (the victim died in the search process): ' + key = 'FileNotFoundError (the victim died in the se' \ + 'arch process): ' except ProcessLookupError: response_time = time() - time0 send_result = 'no such process; response time: {} ms'.format( round(response_time * 1000)) - key = 'ProcessLookupError (the victim died in the search process): ' + key = 'ProcessLookupError (the victim died in the se' \ + 'arch process): ' log(preventing_oom_message) @@ -1175,7 +1179,6 @@ def implement_corrective_action(signal): key = 'victim badness < min_badness' update_stat_dict_and_print(key) - sleep_after_send_signal(signal) @@ -1262,7 +1265,8 @@ def calculate_percent(arg_key): # Final validations... if mem_min_percent < 0 or mem_min_percent > 100: errprint( - '{}, as percents value, out of range [0; 100]\nExit'.format(arg_key)) + '{}, as percents value, out of ran' + 'ge [0; 100]\nExit'.format(arg_key)) exit(1) # mem_min_sigterm_percent is clean and valid float percentage. 
Can @@ -1278,7 +1282,8 @@ def calculate_percent(arg_key): mem_min_kb = mem_min_mb * 1024 if mem_min_kb > mem_total: errprint( - '{} value can not be greater then MemTotal ({} MiB)\nExit'.format( + '{} value can not be greater then MemT' + 'otal ({} MiB)\nExit'.format( arg_key, round( mem_total / 1024))) exit(1) @@ -1381,6 +1386,7 @@ except ValueError: print('Config:', config) +# todo: log it ########################################################################## @@ -1473,6 +1479,7 @@ except FileNotFoundError: # validation of all parameters +forbid_negative_badness = conf_parse_bool('forbid_negative_badness') print_victim_info = conf_parse_bool('print_victim_info') print_config = conf_parse_bool('print_config') print_mem_check_results = conf_parse_bool('print_mem_check_results') @@ -1491,20 +1498,23 @@ if regex_matching or re_match_cmdline or re_match_uid: from re import search import sre_constants -mem_min_sigterm_kb, mem_min_sigterm_mb, mem_min_sigterm_percent = calculate_percent( - 'mem_min_sigterm') -mem_min_sigkill_kb, mem_min_sigkill_mb, mem_min_sigkill_percent = calculate_percent( - 'mem_min_sigkill') +(mem_min_sigterm_kb, mem_min_sigterm_mb, mem_min_sigterm_percent + ) = calculate_percent('mem_min_sigterm') -zram_max_sigterm_kb, zram_max_sigterm_mb, zram_max_sigterm_percent = calculate_percent( - 'zram_max_sigterm') -zram_max_sigkill_kb, zram_max_sigkill_mb, zram_max_sigkill_percent = calculate_percent( - 'zram_max_sigkill') +(mem_min_sigkill_kb, mem_min_sigkill_mb, mem_min_sigkill_percent + ) = calculate_percent('mem_min_sigkill') -mem_min_warnings_kb, mem_min_warnings_mb, mem_min_warnings_percent = calculate_percent( - 'mem_min_warnings') -zram_max_warnings_kb, zram_max_warnings_mb, zram_max_warnings_percent = calculate_percent( - 'zram_max_warnings') +(zram_max_sigterm_kb, zram_max_sigterm_mb, zram_max_sigterm_percent + ) = calculate_percent('zram_max_sigterm') + +(zram_max_sigkill_kb, zram_max_sigkill_mb, zram_max_sigkill_percent + ) = 
calculate_percent('zram_max_sigkill') + +(mem_min_warnings_kb, mem_min_warnings_mb, mem_min_warnings_percent + ) = calculate_percent('mem_min_warnings') + +(zram_max_warnings_kb, zram_max_warnings_mb, zram_max_warnings_percent + ) = calculate_percent('zram_max_warnings') if 'rate_mem' in config_dict: @@ -1697,10 +1707,12 @@ if 'max_post_sigterm_victim_lifetime' in config_dict: max_post_sigterm_victim_lifetime = string_to_float_convert_test( config_dict['max_post_sigterm_victim_lifetime']) if max_post_sigterm_victim_lifetime is None: - errprint('Invalid max_post_sigterm_victim_lifetime value, not float\nExit') + errprint('Invalid max_post_sigterm_victim_lifetime val' + 'ue, not float\nExit') exit(1) if max_post_sigterm_victim_lifetime < 0: - errprint('max_post_sigterm_victim_lifetime must be non-negative number\nExit') + errprint('max_post_sigterm_victim_lifetime must be non-n' + 'egative number\nExit') exit(1) else: errprint('max_post_sigterm_victim_lifetime is not in config\nExit') @@ -1714,7 +1726,6 @@ else: exit(1) - if 'psi_path' in config_dict: psi_path = config_dict['psi_path'] else: @@ -1729,11 +1740,51 @@ else: exit(1) +if 'log_dir' in config_dict: + log_dir = config_dict['log_dir'] +else: + errprint('log_dir is not in config\nExit') + exit(1) print_total_stat = conf_parse_bool('print_total_stat') print_proc_table = conf_parse_bool('print_proc_table') +separate_log = conf_parse_bool('separate_log') + +if separate_log: + + import logging + from logging import basicConfig + from logging import info + + try: + os.mkdir(log_dir) + except PermissionError: + print('ERROR: can not create log dir') + except FileExistsError: + pass + + logfile = log_dir + '/nohang.log' + + try: + with open(logfile, 'a') as f: + pass + except FileNotFoundError: + print('ERROR: log FileNotFoundError') + except PermissionError: + print('ERROR: log PermissionError') + + try: + basicConfig( + filename=logfile, + level=logging.INFO, + format="%(asctime)s: %(message)s") + except 
PermissionError: + errprint('ERROR: Permission denied: {}'.format(logfile)) + except FileNotFoundError: + errprint('ERROR: FileNotFoundError: {}'.format(logfile)) + if 'min_mem_report_interval' in config_dict: min_mem_report_interval = string_to_float_convert_test( @@ -1790,12 +1841,9 @@ if max_sleep_time < min_sleep_time: psi_support = os.path.exists(psi_path) - ########################################################################## - - # Get KiB levels if it's possible. # получ кб. если не кб - то процент. Если процент - находим кб ниже на @@ -1834,7 +1882,8 @@ def get_swap_threshold_tuple(string): return value, False else: - errprint('Invalid config file. There are invalid units somewhere\nExit') + errprint( + 'Invalid config file. There are invalid units somewhere\nExit') exit(1) @@ -1869,7 +1918,8 @@ else: if print_config: print( - '\n1. Memory levels to respond to as an OOM threat\n[displaying these options need fix]\n') + '\n1. Memory levels to respond to as an OOM threat\n[display' + 'ing these options need fix]\n') print('mem_min_sigterm: {} MiB, {} %'.format( round(mem_min_sigterm_mb), round(mem_min_sigterm_percent, 1))) @@ -1884,7 +1934,8 @@ if print_config: print('zram_max_sigkill: {} MiB, {} %'.format( round(zram_max_sigkill_mb), round(zram_max_sigkill_percent, 1))) - print('\n2. The frequency of checking the level of available memory (and CPU usage)\n') + print('\n2. The frequency of checking the level of available m' + 'emory (and CPU usage)\n') print('rate_mem: {}'.format(rate_mem)) print('rate_swap: {}'.format(rate_swap)) print('rate_zram: {}'.format(rate_zram)) @@ -1906,19 +1957,22 @@ if print_config: print('(todo)') - print('\n5. The execution of a specific command instead of sending the\nSIGTERM signal\n') + print('\n5. 
The execution of a specific command instead of sen' + 'ding the\nSIGTERM signal\n') print('execute_the_command: {}'.format(execute_the_command)) if execute_the_command: print('\nPROCESS NAME COMMAND TO EXECUTE') for key in etc_dict: print('{} {}'.format(key.ljust(15), etc_dict[key])) - print('\n6. GUI notifications:\n- OOM prevention results and\n- low memory warnings\n') + print('\n6. GUI notifications:\n- OOM prevention results and\n- low m' + 'emory warnings\n') print('gui_notifications: {}'.format(gui_notifications)) print('gui_low_memory_warnings: {}'.format(gui_low_memory_warnings)) if gui_low_memory_warnings: - print('min_time_between_warnings: {}'.format(min_time_between_warnings)) + print('min_time_between_warnings: {}'.format( + min_time_between_warnings)) print('mem_min_warnings: {} MiB, {} %'.format( round(mem_min_warnings_mb), round(mem_min_warnings_percent, 1))) @@ -1973,7 +2027,7 @@ if print_proc_table: find_victim() print() -print('Monitoring started!') +log('Monitoring started!') stdout.flush() @@ -2011,14 +2065,16 @@ while True: if avg10 >= sigkill_psi and time() - psi_t0 >= psi_avg10_sleep_time: time0 = time() - mem_info = 'PSI avg value ({}) > sigkill_psi ({})'.format(avg10, sigkill_psi) + mem_info = 'PSI avg value ({}) > sigkill_psi ({})'.format( + avg10, sigkill_psi) implement_corrective_action(SIGKILL) psi_t0 = time() continue if avg10 >= sigterm_psi and time() - psi_t0 >= psi_avg10_sleep_time: time0 = time() - mem_info = 'PSI avg value ({}) > sigterm_psi ({})'.format(avg10, sigterm_psi) + mem_info = 'PSI avg value ({}) > sigterm_psi ({})'.format( + avg10, sigterm_psi) implement_corrective_action(SIGTERM) psi_t0 = time() continue @@ -2076,7 +2132,7 @@ while True: # Calculate 'swap-column' width swap_len = len(str(round(swap_total / 1024.0))) - # Output avialable mem sizes + # Output available mem sizes if swap_total == 0 and mem_used_zram == 0: log('{}MemAvail: {} M, {} %{}'.format( avg_value, @@ -2133,7 +2189,8 @@ while True: swap_free <= 
swap_min_sigkill_kb): time0 = time() - mem_info = 'Hard threshold exeeded\nMemory status that requires corrective actions:' \ + mem_info = 'Hard threshold exceeded\nMemory status that requ' \ + 'ires corrective actions:' \ '\n MemAvailable [{} MiB, {} %] <= mem_min_sig' \ 'kill [{} MiB, {} %]\n SwapFree [{} MiB, {} %] <= swa' \ 'p_min_sigkill [{} MiB, {} %]'.format( @@ -2154,7 +2211,8 @@ while True: if mem_used_zram >= zram_max_sigkill_kb: time0 = time() - mem_info = 'Hard threshold exeeded\nMemory status that requires corrective actions:' \ + mem_info = 'Hard threshold exceeded\nMemory status that requir' \ + 'es corrective actions:' \ '\n MemUsedZram [{} MiB, {} %] >= zram_max_sig' \ 'kill [{} MiB, {} %]'.format( kib_to_mib(mem_used_zram), @@ -2172,7 +2230,8 @@ while True: time0 = time() - mem_info = 'Soft threshold exeeded\nMemory status that requires corrective actions:' \ + mem_info = 'Soft threshold exceeded\nMemory status that requi' \ + 'res corrective actions:' \ '\n MemAvailable [{} MiB, {} %] <= mem_min_sig' \ 'term [{} MiB, {} %]\n SwapFree [{} MiB, {} %] <= swa' \ 'p_min_sigterm [{} MiB, {} %]'.format( @@ -2195,7 +2254,8 @@ while True: if mem_used_zram >= zram_max_sigterm_kb: time0 = time() - mem_info = 'Soft threshold exeeded\nMemory status that requires corrective actions:' \ + mem_info = 'Soft threshold exceeded\nMemory status that requ' \ + 'ires corrective actions:' \ '\n MemUsedZram [{} MiB, {} %] >= ' \ 'zram_max_sigterm [{} M, {} %]'.format( kib_to_mib(mem_used_zram), diff --git a/nohang.conf b/nohang.conf index 556a7d9..9a89942 100644 --- a/nohang.conf +++ b/nohang.conf @@ -12,20 +12,22 @@ The configuration includes the following sections: 1. Memory levels to respond to as an OOM threat - 2. The frequency of checking the level of available memory + 2. Response on PSI memory metrics + 3. The frequency of checking the level of available memory (and CPU usage) - 3. The prevention of killing innocent victims - 4. 
Impact on the badness of processes via matching their + 4. The prevention of killing innocent victims + 5. Impact on the badness of processes via matching their - names, - cmdlines and - UIDs with regular expressions - 5. The execution of a specific command instead of sending the + 6. The execution of a specific command instead of sending the SIGTERM signal - 6. GUI notifications: + 7. GUI notifications: - OOM prevention results and - low memory warnings - 7. Output verbosity + 8. Output verbosity + 9. Misc Just read the description of the parameters and edit the values. Please restart the program after editing the config. @@ -56,26 +58,42 @@ swap_min_sigkill = 5 % usual hang level, not recommended to set very high. Can be specified in % and M. Valid values are floating-point - numbers from the range [0; 90] %. + numbers from the range [0; 90] %. zram_max_sigterm = 50 % zram_max_sigkill = 55 % +##################################################################### - Response on PSI memory some/full avg10/avg60/avg300 value - (/proc/pressure/memory on systems with Linux 4.20+). + 2. Response on PSI memory metrics (it needs Linux 4.20 and up) + + About PSI: + https://facebookmicrosites.github.io/psi/ + + Disabled by default (ignore_psi = True). ignore_psi = True - Choose path to PSI file. - + Choose a path to PSI file. + By default it monitors system-wide file: /proc/pressure/memory + You also can set file to monitor one cgroup slice. + For example: psi_path = /sys/fs/cgroup/unified/user.slice/memory.pressure psi_path = /sys/fs/cgroup/unified/system.slice/memory.pressure psi_path = /sys/fs/cgroup/unified/system.slice/foo.service/memory.pressure - psi_path = ./psi_dummy psi_path = /proc/pressure/memory + Valid psi_metrics are: + some_avg10 + some_avg60 + some_avg300 + full_avg10 + full_avg60 + full_avg300 + + some_avg10 is most sensitive. 
+ psi_metrics = some_avg10 sigterm_psi_avg10 = 60 @@ -86,7 +104,7 @@ psi_avg10_sleep_time = 60 ##################################################################### - 2. The frequency of checking the amount of available memory + 3. The frequency of checking the amount of available memory (and CPU usage) Coefficients that affect the intensity of monitoring. Reducing @@ -124,9 +142,10 @@ min_sleep_time = 0.1 ##################################################################### - 3. The prevention of killing innocent victims + 4. The prevention of killing innocent victims - Минимальное значение oom_score, которым должен обладать + Минимальное значение badness (по умолчанию равно oom_score), + которым должен обладать процесс для того, чтобы ему был отправлен сигнал. Позволяет предотвратить убийство невиновных если что-то пойдет не так. @@ -163,7 +182,7 @@ oom_score_adj_max = 30 ##################################################################### - 4. Impact on the badness of processes via matching their names, + 5. Impact on the badness of processes via matching their names, cmdlines or UIDs with regular expressions using re.search(). See https://en.wikipedia.org/wiki/Regular_expression and @@ -179,7 +198,7 @@ oom_score_adj_max = 30 names, cmdlines and UIDs of processes. - 4.1 Matching process names with RE patterns + 5.1 Matching process names with RE patterns Valid values are True and False. @@ -203,7 +222,7 @@ regex_matching = False @PROCESSNAME_RE 300 /// ^(chromium|firefox)$ - 4.2 Matching cmdlines with RE patterns + 5.2 Matching cmdlines with RE patterns A good option that allows fine adjustment. @@ -214,7 +233,7 @@ re_match_cmdline = False @CMDLINE_RE -200 /// ^/usr/lib/virtualbox - 4.3 Matching UIDs with RE patterns + 5.3 Matching UIDs with RE patterns The most slow option @@ -227,7 +246,7 @@ re_match_uid = False ##################################################################### - 5. The execution of a specific command instead of sending the + 6. 
The execution of a specific command instead of sending the SIGTERM signal. For processes with a specific name you can specify a command to @@ -277,7 +296,7 @@ $ETC apache2 /// systemctl restart apache2 ##################################################################### - 6. GUI notifications: + 7. GUI notifications: - OOM prevention results and - low memory warnings @@ -323,7 +342,7 @@ zram_max_warnings = 40 % ##################################################################### - 7. Verbosity + 8. Verbosity Display the configuration when the program starts. Valid values are True and False. @@ -357,15 +376,20 @@ print_proc_table = False print_victim_info = True - Максимальная глубина показа родословной. По умолчанию (1) - показывается только родитель - PPID. + Максимальная глубина показа родословной жертвы. + По умолчанию (1) показывается только родитель - PPID. Целое положительное число. -max_ancestry_depth = 3 +max_ancestry_depth = 1 + +separate_log = False + +log_dir = /var/log/nohang + ##################################################################### - 8. Misc + 9. Misc Жертва может не реагировать на SIGTERM. max_post_sigterm_victim_lifetime - это время, при превышении @@ -378,5 +402,7 @@ max_post_sigterm_victim_lifetime = 10 Пустая строка - ничего не выполнять. Произвольная строка. -post_kill_exe = +post_kill_exe = + +forbid_negative_badness = True diff --git a/psi-monitor b/psi-monitor index 50fa0ea..80694f9 100755 --- a/psi-monitor +++ b/psi-monitor @@ -49,7 +49,3 @@ while True: stdout.flush() sleep(0.1) - - - -