From 448a60c3f01e4304d32dbdb6a9ccffff9117002f Mon Sep 17 00:00:00 2001 From: Alexey Avramov Date: Mon, 22 Apr 2019 19:51:02 +0900 Subject: [PATCH] customize corrective actions: re match with cgroups and names --- nohang | 170 ++++++++++++++++++++++++---------- nohang.conf | 72 ++++---------- trash/memleak/install.sh | 4 + trash/memleak/memleak | 12 +++ trash/memleak/memleak.service | 9 ++ 5 files changed, 166 insertions(+), 101 deletions(-) create mode 100755 trash/memleak/install.sh create mode 100755 trash/memleak/memleak create mode 100644 trash/memleak/memleak.service diff --git a/nohang b/nohang index fdcb721..52ee7e5 100755 --- a/nohang +++ b/nohang @@ -1145,6 +1145,8 @@ def find_victim(_print_proc_table): pid_badness_list.append((pid, badness)) + real_proc_num = len(pid_badness_list) + # Make list of (pid, badness) tuples, sorted by 'badness' values # print(pid_badness_list) pid_tuple_list = sorted( @@ -1162,6 +1164,8 @@ def find_victim(_print_proc_table): if _print_proc_table: log(hr) + log('Found {} processes with exists realpath'.format(real_proc_num)) + log( 'Process with highest badness (found in {} ms):\n PID: {}, Na' 'me: {}, badness: {}'.format( @@ -1172,6 +1176,7 @@ def find_victim(_print_proc_table): ) ) + return pid, victim_badness, victim_name @@ -1413,9 +1418,37 @@ def implement_corrective_action(signal): ) signal = SIGKILL - if execute_the_command and signal is SIGTERM and name in etc_dict: + soft_match = False - command = etc_dict[name] + if soft_actions and signal is SIGTERM: + # если мягкий порог И список мягких не пуст: + # итерируемся по списку, ища мэтчинги. Есть совпадения - выполн + # команду и выход из цикла. 
+ name = pid_to_name(pid) + cgroup_v1 = pid_to_cgroup_v1(pid) + service = '' + cgroup_v1_tail = cgroup_v1.rpartition('/')[2] + print(cgroup_v1_tail) + if cgroup_v1_tail.endswith('.service'): + service = cgroup_v1_tail + print('$SERVICE:', [service]) + print('ИЩЕМ СОВПАДЕНИЯ ДЛЯ МЯГКИХ ДЕЙСТВИЙ') + # итерируемся по списку кортежей + for i in soft_actions_list: + unit = i[0] + if unit == 'name': + u = name + else: + u = cgroup_v1 + regexp = i[1] + command = i[2] + print([u, regexp, command]) + if search(regexp, u) is not None: + print('СОВПАДЕНИЕ НАЙДЕНО') + soft_match = True + break + + if soft_match: # todo: make new func m = check_mem_and_swap() @@ -1428,8 +1461,13 @@ def implement_corrective_action(signal): ) ) - cmd = etc_dict[name].replace('$PID', pid).replace( - '$NAME', pid_to_name(pid)) + cmd = command.replace( + '$PID', + pid).replace( + '$NAME', + pid_to_name(pid)).replace( + '$SERVICE', + service) exit_status = exe(cmd) @@ -1438,14 +1476,12 @@ def implement_corrective_action(signal): response_time = time() - time0 etc_info = 'Implement a corrective act' \ - 'ion:\n Run the command: {}' \ - '\n Exit status: {}; total response ' \ - 'time: {} ms'.format( - command.replace( - '$PID', pid).replace( - '$NAME', pid_to_name(pid)), - exit_status, - round(response_time * 1000)) + 'ion:\n Run the command: {}' \ + '\n Exit status: {}; total response ' \ + 'time: {} ms'.format( + cmd, + exit_status, + round(response_time * 1000)) print(etc_info) @@ -1699,6 +1735,7 @@ if len(argv) == 1: config = os.getcwd() + '/nohang.conf' else: config = '/etc/nohang/nohang.conf' + elif len(argv) == 2: if argv[1] == '--help' or argv[1] == '-h': print(help_mess) @@ -1716,12 +1753,14 @@ elif len(argv) == 2: else: errprint('Unknown option: {}'.format(argv[1])) exit(1) + elif len(argv) == 3: if argv[1] == '--config' or argv[1] == '-c': config = argv[2] else: errprint('Unknown option: {}'.format(argv[1])) exit(1) + else: errprint('Invalid CLI input: too many options') exit(1) @@ -1794,8 
+1833,6 @@ print('Config:', config) config_dict = dict() processname_re_list = [] - - cmdline_re_list = [] environ_re_list = [] uid_re_list = [] @@ -1803,12 +1840,14 @@ cgroup_v1_re_list = [] cgroup_v2_re_list = [] realpath_re_list = [] -# dictionary with names and commands for the parameter -# execute_the_command -# тут тоже список нужен, а не словарь -etc_dict = dict() +soft_actions_list = [] +# separator for optional parameters (that starts with @) +opt_separator = '///' + + +# stupid conf parsing, need refactoring try: with open(config) as f: @@ -1819,9 +1858,10 @@ try: c = line.startswith('\t') d = line.startswith(' ') - etc = line.startswith('$ETC') + etc = line.startswith('@SOFT_ACTION_RE_NAME') + etc2 = line.startswith('@SOFT_ACTION_RE_CGROUP_V1') - if not a and not b and not c and not d and not etc: + if not a and not b and not c and not d and not etc and not etc2: a = line.partition('=') key = a[0].strip() @@ -1834,18 +1874,48 @@ try: exit(1) if etc: - a = line[4:].split('///') - etc_name = a[0].strip() - etc_command = a[1].strip() - if len(etc_name) > 15: - errprint('Invalid config, the length of the process ' - 'name must not exceed 15 characters\nExit') - exit(1) - etc_dict[etc_name] = etc_command + + # это остаток строки без первого ключа. Содержит: регулярка /// + # команда + a = line.partition('@SOFT_ACTION_RE_NAME')[ + 2].partition(opt_separator) + + a1 = 'name' + + a2 = a[0].strip() + valid_re(a2) + + a3 = a[2].strip() + + zzz = (a1, a2, a3) + + # print(zzz) + + soft_actions_list.append(zzz) + + if etc2: + + # это остаток строки без первого ключа. 
Содержит: регулярка /// + # команда + a = line.partition('@SOFT_ACTION_RE_CGROUP_V1')[ + 2].partition(opt_separator) + + a1 = 'cgroup_v1' + + a2 = a[0].strip() + valid_re(a2) + + a3 = a[2].strip() + + zzz = (a1, a2, a3) + + # print(zzz) + + soft_actions_list.append(zzz) if line.startswith('@PROCESSNAME_RE'): a = line.partition( - '@PROCESSNAME_RE')[2].strip(' \n').partition('///') + '@PROCESSNAME_RE')[2].strip(' \n').partition(opt_separator) badness_adj = a[0].strip(' ') reg_exp = a[2].strip(' ') valid_re(reg_exp) @@ -1853,7 +1923,7 @@ try: if line.startswith('@CMDLINE_RE'): a = line.partition( - '@CMDLINE_RE')[2].strip(' \n').partition('///') + '@CMDLINE_RE')[2].strip(' \n').partition(opt_separator) badness_adj = a[0].strip(' ') reg_exp = a[2].strip(' ') valid_re(reg_exp) @@ -1861,7 +1931,7 @@ try: if line.startswith('@UID_RE'): a = line.partition( - '@UID_RE')[2].strip(' \n').partition('///') + '@UID_RE')[2].strip(' \n').partition(opt_separator) badness_adj = a[0].strip(' ') reg_exp = a[2].strip(' ') valid_re(reg_exp) @@ -1869,7 +1939,7 @@ try: if line.startswith('@CGROUP_V1_RE'): a = line.partition( - '@CGROUP_V1_RE')[2].strip(' \n').partition('///') + '@CGROUP_V1_RE')[2].strip(' \n').partition(opt_separator) badness_adj = a[0].strip(' ') reg_exp = a[2].strip(' ') valid_re(reg_exp) @@ -1877,7 +1947,7 @@ try: if line.startswith('@CGROUP_V2_RE'): a = line.partition( - '@CGROUP_V2_RE')[2].strip(' \n').partition('///') + '@CGROUP_V2_RE')[2].strip(' \n').partition(opt_separator) badness_adj = a[0].strip(' ') reg_exp = a[2].strip(' ') valid_re(reg_exp) @@ -1885,7 +1955,7 @@ try: if line.startswith('@REALPATH_RE'): a = line.partition( - '@REALPATH_RE')[2].strip(' \n').partition('///') + '@REALPATH_RE')[2].strip(' \n').partition(opt_separator) badness_adj = a[0].strip(' ') reg_exp = a[2].strip(' ') valid_re(reg_exp) @@ -1893,7 +1963,7 @@ try: if line.startswith('@ENVIRON_RE'): a = line.partition( - '@ENVIRON_RE')[2].strip(' \n').partition('///') + 
'@ENVIRON_RE')[2].strip(' \n').partition(opt_separator) badness_adj = a[0].strip(' ') reg_exp = a[2].strip(' ') valid_re(reg_exp) @@ -1917,7 +1987,6 @@ except FileNotFoundError: exit(1) - if processname_re_list == []: regex_matching = False else: @@ -1959,10 +2028,6 @@ else: re_match_cgroup_v2 = True - - - - # print(processname_re_list) # print(cmdline_re_list) # print(uid_re_list) @@ -1972,6 +2037,15 @@ else: # print(cgroup_v2_re_list) +print(soft_actions_list) + +if soft_actions_list == []: + soft_actions = False +else: + soft_actions = True + +print('soft_actions:', soft_actions) + ########################################################################## @@ -1989,7 +2063,6 @@ print_sleep_periods = conf_parse_bool('print_sleep_periods') gui_low_memory_warnings = conf_parse_bool('gui_low_memory_warnings') gui_notifications = conf_parse_bool('gui_notifications') decrease_oom_score_adj = conf_parse_bool('decrease_oom_score_adj') -execute_the_command = conf_parse_bool('execute_the_command') ignore_psi = conf_parse_bool('ignore_psi') @@ -2365,6 +2438,11 @@ if max_sleep_time < min_sleep_time: if print_proc_table_flag: + + if not root: + log('WARNING: effective UID != 0; euid={}; processes with other e' + 'uids will be invisible for nohang'.format(self_uid)) + func_print_proc_table() @@ -2494,11 +2572,6 @@ if print_config: print('\n5. The execution of a specific command instead of sen' 'ding the\nSIGTERM signal\n') - print('execute_the_command: {}'.format(execute_the_command)) - if execute_the_command: - print('\nPROCESS NAME COMMAND TO EXECUTE') - for key in etc_dict: - print('{} {}'.format(key.ljust(15), etc_dict[key])) print('\n6. 
GUI notifications:\n- OOM prevention results and\n- low m' 'emory warnings\n') @@ -2553,6 +2626,9 @@ mlockall() ########################################################################## +if not root: + log('WARNING: effective UID != 0; euid={}; processes with other e' + 'uids will be invisible for nohang'.format(self_uid)) # if print_proc_table: # find_victim(print_proc_table) @@ -2561,10 +2637,8 @@ log('Monitoring started!') stdout.flush() - ########################################################################## - psi_avg_string = '' # will be overwritten if PSI monitoring enabled diff --git a/nohang.conf b/nohang.conf index e974920..2bd8d72 100644 --- a/nohang.conf +++ b/nohang.conf @@ -171,6 +171,7 @@ oom_score_adj_max = 20 Use script `oom-sort` from nohang package to view names, cmdlines and UIDs of processes. + 5.1 Matching process names with RE patterns Syntax: @@ -196,13 +197,13 @@ oom_score_adj_max = 20 @CMDLINE_RE -200 /// ^/usr/lib/virtualbox - 5.3 Matching effective UIDs with RE patterns + 5.3 Matching UIDs with RE patterns The most slow option @UID_RE -100 /// ^0$ - 5.4 Matching CGroup_v1-line with RE patterns + 5.4 Matching CGroup-line with RE patterns @CGROUP_V1_RE -50 /// ^/system.slice @@ -210,15 +211,13 @@ oom_score_adj_max = 20 @CGROUP_V1_RE -50 /// ^/user.slice - 5.5 Matching CGroup_v2-line with RE patterns - @CGROUP_V2_RE 100 /// ^/workload - 5.6 Matching realpath with RE patterns + 5.5 Matching realpath with RE patterns @REALPATH_RE 20 /// ^/usr/bin/foo - 5.7 Matching environ with RE patterns + 5.6 Matching environ with RE patterns @ENVIRON_RE 100 /// USER=user @@ -227,55 +226,22 @@ oom_score_adj_max = 20 ##################################################################### - 6. The execution of a specific command instead of sending the - SIGTERM signal. + 6. Customize corrective actions. 
- [this section should be remaked] + TODO: docs - For processes with a specific name you can specify a command to - run instead of sending the SIGTERM signal. + Syntax: + KEY REGEXP SEPARATOR COMMAND - For example, if the process is running as a daemon, you can run - the restart command instead of sending SIGTERM. + @SOFT_ACTION_RE_NAME ^foo$ /// kill -SEGV $PID + @SOFT_ACTION_RE_NAME ^bash$ /// kill -9 $PID - Valid values are True and False. - -execute_the_command = False - - The length of the process name can't exceed 15 characters. - The syntax is as follows: lines starting with keyword $ETC are - considered as the lines containing names of processes and - corresponding commands. After a name of process the triple slash - (///) follows. And then follows the command that will be - executed if the specified process is selected as a victim. The - ampersand (&) at the end of the command will allow nohang to - continue runing without waiting for the end of the command - execution. - - For example: - $ETC mysqld /// systemctl restart mariadb.service & - $ETC php-fpm7.0 /// systemctl restart php7.0-fpm.service - - If command will contain $PID pattern, this template ($PID) will - be replaced by PID of process which name match with RE pattern. - - Exmple: - - $ETC bash /// kill -KILL $PID - - It is way to send any signal instead of SIGTERM. - (run `kill -L` to see list of all signals) - - Also $NAME will be replaced by process name. - - $ETC bash /// kill -9 $PID - -$ETC firefox-esr /// kill -SEGV $PID - -$ETC tail /// kill -9 $PID - -$ETC apache2 /// systemctl restart apache2 +@SOFT_ACTION_RE_CGROUP_V1 ^/system.slice/ /// systemctl restart $SERVICE + @SOFT_ACTION_RE_CGROUP_V1 foo.service$ /// systemctl restart $SERVICE + $PID will be replaced by process PID. + $NAME will be replaced by process name. + $SERVICE will be replaced by the name of the .service unit taken from the end of the process's cgroup_v1 path, if there is one (otherwise it will be replaced by an empty string).
##################################################################### @@ -283,7 +249,7 @@ $ETC apache2 /// systemctl restart apache2 - OOM prevention results and - low memory warnings -gui_notifications = False +gui_notifications = True Enable GUI notifications about the low level of available memory. Valid values are True and False. @@ -324,9 +290,9 @@ print_config = False Print memory check results. Valid values are True and False. -print_mem_check_results = False +print_mem_check_results = True -min_mem_report_interval = 60 +min_mem_report_interval = 30 Print sleep periods between memory checks. Valid values are True and False. diff --git a/trash/memleak/install.sh b/trash/memleak/install.sh new file mode 100755 index 0000000..0f765ee --- /dev/null +++ b/trash/memleak/install.sh @@ -0,0 +1,4 @@ +#!/bin/sh +cp ./memleak /usr/sbin/memleak +cp ./memleak.service /lib/systemd/system/memleak.service +systemctl daemon-reload diff --git a/trash/memleak/memleak b/trash/memleak/memleak new file mode 100755 index 0000000..df3fb93 --- /dev/null +++ b/trash/memleak/memleak @@ -0,0 +1,12 @@ +#!/usr/bin/env python3 + +from os import system +from time import sleep + +x = [] + +while True: + x.append('#' * 9999999) + sleep(0.1) + system('sleep 99 &') + diff --git a/trash/memleak/memleak.service b/trash/memleak/memleak.service new file mode 100644 index 0000000..0a7f3c0 --- /dev/null +++ b/trash/memleak/memleak.service @@ -0,0 +1,9 @@ +[Unit] +Description=Memory leak daemon +After=sysinit.target + +[Service] +ExecStart=/usr/sbin/memleak + +[Install] +WantedBy=multi-user.target