diff --git a/ascii.conf b/ascii.conf new file mode 100644 index 0000000..5245163 --- /dev/null +++ b/ascii.conf @@ -0,0 +1,359 @@ + + This is the nohang config file. + + Redesign of this config is in progress. + + Lines starting with #, tabs and spaces are comments. + + Lines starting with $ contain obligatory parameters. + + Lines starting with @ contain optional parameters. + + The configuration includes the following sections: + + 1. Memory levels to respond to as an OOM threat + 2. Response on PSI memory metrics + 3. The frequency of checking the level of available memory + (and CPU usage) + 4. The prevention of killing innocent victims + 5. Impact on the badness of processes via matching their + - names, + - cmdlines and + - UIDs + with regular expressions + 6. The execution of a specific command instead of sending the + SIGTERM signal + 7. GUI notifications: + - OOM prevention results and + - low memory warnings + 8. Output verbosity + 9. Misc + + Just read the description of the parameters and edit the values. + Please restart the program after editing the config. + +##################################################################### + + 1. Thresholds below which a signal should be sent to the victim + + Sets the available memory levels at or below which SIGTERM or SIGKILL + signals are sent. The signal will be sent if MemAvailable and + SwapFree (in /proc/meminfo) both drop below the corresponding + values at the same time. Can be specified in % (percent) and M (MiB). + Valid values are floating-point numbers from the range [0; 100] %. + + MemAvailable levels. + +mem_min_sigterm = 10 % +mem_min_sigkill = 5 % + + SwapFree levels. + +swap_min_sigterm = 10 % +swap_min_sigkill = 5 % + + Specifies the total share of zram in memory; if it is exceeded, the + corresponding signals are sent. As the share of zram in memory + increases, the responsiveness of the system may fall. 90 % is a + usual hang level, so setting it very high is not recommended. + + Can be specified in % and M. 
Valid values are floating-point + numbers from the range [0; 90] %. + +zram_max_sigterm = 50 % +zram_max_sigkill = 55 % + +##################################################################### + + 2. Response on PSI memory metrics (it needs Linux 4.20 and up) + + About PSI: + https://facebookmicrosites.github.io/psi/ + + Disabled by default (ignore_psi = True). + +ignore_psi = True + + Choose a path to the PSI file. + By default the system-wide file is monitored: /proc/pressure/memory + You can also set a file to monitor a single cgroup slice. + For example: + psi_path = /sys/fs/cgroup/unified/user.slice/memory.pressure + psi_path = /sys/fs/cgroup/unified/system.slice/memory.pressure + psi_path = /sys/fs/cgroup/unified/system.slice/foo.service/memory.pressure + +psi_path = /proc/pressure/memory + + Valid psi_metrics are: + some_avg10 + some_avg60 + some_avg300 + full_avg10 + full_avg60 + full_avg300 + + some_avg10 is the most sensitive. + +psi_metrics = some_avg10 + +sigterm_psi_threshold = 80 +sigkill_psi_threshold = 90 + +psi_post_action_delay = 60 + +##################################################################### + + 3. The frequency of checking the amount of available memory + (and CPU usage) + + Coefficients that affect the intensity of monitoring. Reducing + the coefficients can reduce CPU usage and increase the periods + between memory checks. + + Why three coefficients instead of one? Because the swap fill rate + is usually lower than the RAM fill rate. + + It is possible to set a lower intensity of monitoring for swap + without compromising OOM prevention and thus reduce the CPU load. + + Default values work well for desktops. On servers without rapid + fluctuations in memory levels the values can be reduced. + + Valid values are positive floating-point numbers. 
+ +rate_mem = 4000 +rate_swap = 1500 +rate_zram = 500 + + See also https://github.com/rfjakob/earlyoom/issues/61 + +max_sleep_time = 3 + +min_sleep_time = 0.1 + +##################################################################### + + 4. The prevention of killing innocent victims + + Valid values are integers from the range [0; 1000]. + +min_badness = 20 + + Valid values are non-negative floating-point numbers. + +min_delay_after_sigterm = 0.2 +min_delay_after_sigkill = 1 + + Enabling the option requires root privileges. + Valid values are True and False. + Values are case sensitive. + +decrease_oom_score_adj = False + + Valid values are integers from the range [0; 1000]. + +oom_score_adj_max = 20 + +##################################################################### + + 5. Impact on the badness of processes via matching their names, + cmdlines or UIDs with regular expressions using re.search(). + + See https://en.wikipedia.org/wiki/Regular_expression and + https://en.wikipedia.org/wiki/Perl_Compatible_Regular_Expressions + + Enabling these options slows down the search for the victim + because the names, cmdlines or UIDs of all processes + (except init and kthreads) are compared with the + specified regex patterns (in fact the slowdown is caused by + reading all /proc/*/cmdline and /proc/*/status files). + + Use the `oom-sort` script from the nohang package to view + names, cmdlines and UIDs of processes. + + + 5.1 Matching process names with RE patterns + + Valid values are True and False. + +regex_matching = False + + Syntax: + + @PROCESSNAME_RE badness_adj /// RE_pattern + + New badness value will be += badness_adj + + It is possible to compare multiple patterns + with different badness_adj values. + + Example: + +@PROCESSNAME_RE -100 /// ^Xorg$ + +@PROCESSNAME_RE -500 /// ^sshd$ + + 5.2 Matching cmdlines with RE patterns + + A good option that allows fine adjustment. 
+ +re_match_cmdline = False + +@CMDLINE_RE 300 /// -childID|--type=renderer + +@CMDLINE_RE -200 /// ^/usr/lib/virtualbox + + 5.3 Matching UIDs with RE patterns + + The slowest option + +re_match_uid = False + +@UID_RE -100 /// ^0$ + + 5.4 Matching CGroup-line with RE patterns + +re_match_cgroup = True + + @CGROUP_RE -50 /// system.slice + + @CGROUP_RE 50 /// foo.service +@CGROUP_RE 2000 /// user.slice + + 5.5 Matching realpath with RE patterns + +re_match_realpath = False + +@REALPATH_RE 20 /// ^/usr/bin/foo + + Note that you can control badness also via systemd units via OOMScoreAdjust, see + https://www.freedesktop.org/software/systemd/man/systemd.exec.html#OOMScoreAdjust= + +##################################################################### + + 6. The execution of a specific command instead of sending the + SIGTERM signal. + + For processes with a specific name you can specify a command to + run instead of sending the SIGTERM signal. + + For example, if the process is running as a daemon, you can run + the restart command instead of sending SIGTERM. + + Valid values are True and False. + +execute_the_command = False + + The length of the process name can't exceed 15 characters. + The syntax is as follows: lines starting with keyword $ETC are + considered as the lines containing names of processes and + corresponding commands. After a name of process the triple slash + (///) follows. And then follows the command that will be + executed if the specified process is selected as a victim. The + ampersand (&) at the end of the command will allow nohang to + continue running without waiting for the end of the command + execution. + + For example: + $ETC mysqld /// systemctl restart mariadb.service & + $ETC php-fpm7.0 /// systemctl restart php7.0-fpm.service + + If the command contains the $PID pattern, this template ($PID) will + be replaced by the PID of the process whose name matches the RE pattern. 
+ + Example: + + $ETC bash /// kill -KILL $PID + + This is a way to send any signal instead of SIGTERM. + (run `kill -L` to see the list of all signals) + + $NAME will also be replaced by the process name. + + $ETC bash /// kill -9 $PID + +$ETC firefox-esr /// kill -SEGV $PID + +$ETC tail /// kill -9 $PID + +$ETC apache2 /// systemctl restart apache2 + + +##################################################################### + + 7. GUI notifications: + - OOM prevention results and + - low memory warnings + +gui_notifications = True + + Enable GUI notifications about the low level of available memory. + Valid values are True and False. + +gui_low_memory_warnings = True + + Execute the command instead of sending GUI notifications if the value is + not an empty line. For example: + warning_exe = cat /proc/meminfo & + +warning_exe = + + Can be specified in % (percent) and M (MiB). + Valid values are floating-point numbers from the range [0; 100] %. + +mem_min_warnings = 25 % + +swap_min_warnings = 25 % + +zram_max_warnings = 40 % + + Valid values are floating-point numbers from the range [1; 300]. + +min_time_between_warnings = 15 + + Ampersands (&) will be replaced with asterisks (*) in process + names and in commands. + +##################################################################### + + 8. Verbosity + + Display the configuration when the program starts. + Valid values are True and False. + +print_config = False + + Print memory check results. + Valid values are True and False. + +print_mem_check_results = False + +min_mem_report_interval = 60 + + Print sleep periods between memory checks. + Valid values are True and False. + +print_sleep_periods = False + +print_total_stat = True + +print_proc_table = True + +print_victim_info = True + +max_ancestry_depth = 1 + +separate_log = False + +psi_debug = False + +##################################################################### + + 9. 
Misc + +max_post_sigterm_victim_lifetime = 10 + +post_kill_exe = + +forbid_negative_badness = True + diff --git a/nohang b/nohang index b83c3b8..2fce26d 100755 --- a/nohang +++ b/nohang @@ -43,7 +43,7 @@ else: root = False -notify_helper_path = '/usr/sbin/nohang_notify_helper' +notify_helper_path = './nohang_notify_helper' victim_dict = dict() @@ -63,6 +63,23 @@ separate_log = False # will be overwritten after parse config # define functions + + + +def exe(cmd): + """Log cmd, run it with os.system(), log and return its result. + """ + log('Execute the command: {}'.format(cmd)) + t0 = time() + err = os.system(cmd) + dt = time() - t0 + log('Exit status: {}; exe duration: {} sec'.format(err, round(dt, 3))) + return err + + + + + def valid_re(reg_exp): """Validate regular expression. """ @@ -652,9 +669,7 @@ def send_notify_warn(): if check_warning_exe: print('Warning threshold exceeded') - print('Execute the command: {}'.format(warning_exe)) - err = os.system(warning_exe) - print('Exit code: {}'.format(err)) + exe(warning_exe) else: @@ -746,7 +761,7 @@ def send_notification(title, body): cmd = '{} --uid {} --time {} &'.format(notify_helper_path, self_uid, t000) - os.system(cmd) + exe(cmd) def sleep_after_send_signal(signal): @@ -1177,13 +1192,27 @@ def implement_corrective_action(signal): 'ion:\n MemAvailable' ': {} MiB, SwapFree: {} MiB'.format(ma, sf)) - exit_status = os.system(etc_dict[name].replace( - '$PID', pid).replace('$NAME', pid_to_name(pid))) - if exit_status == 0: - exit_status = '0' - else: - exit_status = '{}'.format(exit_status) + + cmd = etc_dict[name].replace('$PID', pid).replace( + '$NAME', pid_to_name(pid)) + + + exit_status = exe(cmd) + + + + + + + + exit_status = str(exit_status) + + + + + + response_time = time() - time0 @@ -1237,10 +1266,10 @@ def implement_corrective_action(signal): cmd = post_kill_exe.replace('$PID', pid).replace( '$NAME', pid_to_name(pid)) - print('exe:', cmd) - err = os.system(cmd) + log('Execute post_kill_exe') + + exe(cmd) - print('post_kill_exe exit status:', err) if 
gui_notifications: @@ -1476,8 +1505,8 @@ for s in mem_list: mem_list_names.append(s.split(':')[0]) if mem_list_names[2] != 'MemAvailable': - errprint('Your Linux kernel is too old, Linux 3.14+ requied\nExit') - exit(1) + errprint('WARNING: Your Linux kernel is too old, Linux 3.14+ requied\nExit') + # exit(1) swap_total_index = mem_list_names.index('SwapTotal') swap_free_index = swap_total_index + 1 diff --git a/nohang.conf b/nohang.conf index 464f530..9542f1c 100644 --- a/nohang.conf +++ b/nohang.conf @@ -238,17 +238,18 @@ re_match_uid = False 5.4 Matching CGroup-line with RE patterns -re_match_cgroup = False +re_match_cgroup = True -@CGROUP_RE -50 /// system.slice + @CGROUP_RE -50 /// system.slice -@CGROUP_RE 50 /// foo.service + @CGROUP_RE 50 /// foo.service +@CGROUP_RE 2000 /// user.slice 5.5 Matching realpath with RE patterns re_match_realpath = False -@REALPATH_RE 20 /// ^/usr/bin +@REALPATH_RE 20 /// ^/usr/bin/foo Note that you can control badness also via systemd units via OOMScoreAdjust, see https://www.freedesktop.org/software/systemd/man/systemd.exec.html#OOMScoreAdjust= @@ -318,12 +319,12 @@ $ETC apache2 /// systemctl restart apache2 See also wiki.archlinux.org/index.php/Desktop_notifications Valid values are True and False. -gui_notifications = False +gui_notifications = True Enable GUI notifications about the low level of available memory. Valid values are True and False. -gui_low_memory_warnings = False +gui_low_memory_warnings = True Execute the command instead of sending GUI notifications if the value is not empty line. 
For example: diff --git a/nohang_notify_helper b/nohang_notify_helper index 99653c4..b945ad8 100755 --- a/nohang_notify_helper +++ b/nohang_notify_helper @@ -41,7 +41,11 @@ path_to_cache = '/dev/shm/nohang_notify_cache_uid{}_time{}'.format( uid, t000 ) -title, body = rfile(path_to_cache).split(split_by) +try: + title, body = rfile(path_to_cache).split(split_by) +except FileNotFoundError: + print('nohang_notify_helper: FileNotFoundError') + exit(1) remove(path_to_cache) @@ -75,7 +79,7 @@ def re_pid_environ(pid): if i.startswith('HOME='): # exclude Display Manager's user - if i.startswith('HOME=/var'): + if i.startswith('HOME=/var') or i.startswith('HOME=/root'): return None try: