customize corrective actions: re match with cgroups and names

This commit is contained in:
Alexey Avramov 2019-04-22 19:51:02 +09:00
parent 6af3191e89
commit 448a60c3f0
5 changed files with 166 additions and 101 deletions

160
nohang
View File

@ -1145,6 +1145,8 @@ def find_victim(_print_proc_table):
pid_badness_list.append((pid, badness)) pid_badness_list.append((pid, badness))
real_proc_num = len(pid_badness_list)
# Make list of (pid, badness) tuples, sorted by 'badness' values # Make list of (pid, badness) tuples, sorted by 'badness' values
# print(pid_badness_list) # print(pid_badness_list)
pid_tuple_list = sorted( pid_tuple_list = sorted(
@ -1162,6 +1164,8 @@ def find_victim(_print_proc_table):
if _print_proc_table: if _print_proc_table:
log(hr) log(hr)
log('Found {} processes with exists realpath'.format(real_proc_num))
log( log(
'Process with highest badness (found in {} ms):\n PID: {}, Na' 'Process with highest badness (found in {} ms):\n PID: {}, Na'
'me: {}, badness: {}'.format( 'me: {}, badness: {}'.format(
@ -1172,6 +1176,7 @@ def find_victim(_print_proc_table):
) )
) )
return pid, victim_badness, victim_name return pid, victim_badness, victim_name
@ -1413,9 +1418,37 @@ def implement_corrective_action(signal):
) )
signal = SIGKILL signal = SIGKILL
if execute_the_command and signal is SIGTERM and name in etc_dict: soft_match = False
command = etc_dict[name] if soft_actions and signal is SIGTERM:
# если мягкий порог И список мягких не пуст:
# итерируемся по списку, ища мэтчинги. Есть совпадения - выполн
# команду и выход из цикла.
name = pid_to_name(pid)
cgroup_v1 = pid_to_cgroup_v1(pid)
service = ''
cgroup_v1_tail = cgroup_v1.rpartition('/')[2]
print(cgroup_v1_tail)
if cgroup_v1_tail.endswith('.service'):
service = cgroup_v1_tail
print('$SERVICE:', [service])
print('ИЩЕМ СОВПАДЕНИЯ ДЛЯ МЯГКИХ ДЕЙСТВИЙ')
# итерируемся по списку кортежей
for i in soft_actions_list:
unit = i[0]
if unit == 'name':
u = name
else:
u = cgroup_v1
regexp = i[1]
command = i[2]
print([u, regexp, command])
if search(regexp, u) is not None:
print('СОВПАДЕНИЕ НАЙДЕНО')
soft_match = True
break
if soft_match:
# todo: make new func # todo: make new func
m = check_mem_and_swap() m = check_mem_and_swap()
@ -1428,8 +1461,13 @@ def implement_corrective_action(signal):
) )
) )
cmd = etc_dict[name].replace('$PID', pid).replace( cmd = command.replace(
'$NAME', pid_to_name(pid)) '$PID',
pid).replace(
'$NAME',
pid_to_name(pid)).replace(
'$SERVICE',
service)
exit_status = exe(cmd) exit_status = exe(cmd)
@ -1441,9 +1479,7 @@ def implement_corrective_action(signal):
'ion:\n Run the command: {}' \ 'ion:\n Run the command: {}' \
'\n Exit status: {}; total response ' \ '\n Exit status: {}; total response ' \
'time: {} ms'.format( 'time: {} ms'.format(
command.replace( cmd,
'$PID', pid).replace(
'$NAME', pid_to_name(pid)),
exit_status, exit_status,
round(response_time * 1000)) round(response_time * 1000))
@ -1699,6 +1735,7 @@ if len(argv) == 1:
config = os.getcwd() + '/nohang.conf' config = os.getcwd() + '/nohang.conf'
else: else:
config = '/etc/nohang/nohang.conf' config = '/etc/nohang/nohang.conf'
elif len(argv) == 2: elif len(argv) == 2:
if argv[1] == '--help' or argv[1] == '-h': if argv[1] == '--help' or argv[1] == '-h':
print(help_mess) print(help_mess)
@ -1716,12 +1753,14 @@ elif len(argv) == 2:
else: else:
errprint('Unknown option: {}'.format(argv[1])) errprint('Unknown option: {}'.format(argv[1]))
exit(1) exit(1)
elif len(argv) == 3: elif len(argv) == 3:
if argv[1] == '--config' or argv[1] == '-c': if argv[1] == '--config' or argv[1] == '-c':
config = argv[2] config = argv[2]
else: else:
errprint('Unknown option: {}'.format(argv[1])) errprint('Unknown option: {}'.format(argv[1]))
exit(1) exit(1)
else: else:
errprint('Invalid CLI input: too many options') errprint('Invalid CLI input: too many options')
exit(1) exit(1)
@ -1794,8 +1833,6 @@ print('Config:', config)
config_dict = dict() config_dict = dict()
processname_re_list = [] processname_re_list = []
cmdline_re_list = [] cmdline_re_list = []
environ_re_list = [] environ_re_list = []
uid_re_list = [] uid_re_list = []
@ -1803,12 +1840,14 @@ cgroup_v1_re_list = []
cgroup_v2_re_list = [] cgroup_v2_re_list = []
realpath_re_list = [] realpath_re_list = []
# dictionary with names and commands for the parameter soft_actions_list = []
# execute_the_command
# тут тоже список нужен, а не словарь
etc_dict = dict()
# separator for optional parameters (that starts with @)
opt_separator = '///'
# stupid conf parsing, need refactoring
try: try:
with open(config) as f: with open(config) as f:
@ -1819,9 +1858,10 @@ try:
c = line.startswith('\t') c = line.startswith('\t')
d = line.startswith(' ') d = line.startswith(' ')
etc = line.startswith('$ETC') etc = line.startswith('@SOFT_ACTION_RE_NAME')
etc2 = line.startswith('@SOFT_ACTION_RE_CGROUP_V1')
if not a and not b and not c and not d and not etc: if not a and not b and not c and not d and not etc and not etc2:
a = line.partition('=') a = line.partition('=')
key = a[0].strip() key = a[0].strip()
@ -1834,18 +1874,48 @@ try:
exit(1) exit(1)
if etc: if etc:
a = line[4:].split('///')
etc_name = a[0].strip() # это остаток строки без первого ключа. Содержит: регулярка ///
etc_command = a[1].strip() # команда
if len(etc_name) > 15: a = line.partition('@SOFT_ACTION_RE_NAME')[
errprint('Invalid config, the length of the process ' 2].partition(opt_separator)
'name must not exceed 15 characters\nExit')
exit(1) a1 = 'name'
etc_dict[etc_name] = etc_command
a2 = a[0].strip()
valid_re(a2)
a3 = a[2].strip()
zzz = (a1, a2, a3)
# print(zzz)
soft_actions_list.append(zzz)
if etc2:
# это остаток строки без первого ключа. Содержит: регулярка ///
# команда
a = line.partition('@SOFT_ACTION_RE_CGROUP_V1')[
2].partition(opt_separator)
a1 = 'cgroup_v1'
a2 = a[0].strip()
valid_re(a2)
a3 = a[2].strip()
zzz = (a1, a2, a3)
# print(zzz)
soft_actions_list.append(zzz)
if line.startswith('@PROCESSNAME_RE'): if line.startswith('@PROCESSNAME_RE'):
a = line.partition( a = line.partition(
'@PROCESSNAME_RE')[2].strip(' \n').partition('///') '@PROCESSNAME_RE')[2].strip(' \n').partition(opt_separator)
badness_adj = a[0].strip(' ') badness_adj = a[0].strip(' ')
reg_exp = a[2].strip(' ') reg_exp = a[2].strip(' ')
valid_re(reg_exp) valid_re(reg_exp)
@ -1853,7 +1923,7 @@ try:
if line.startswith('@CMDLINE_RE'): if line.startswith('@CMDLINE_RE'):
a = line.partition( a = line.partition(
'@CMDLINE_RE')[2].strip(' \n').partition('///') '@CMDLINE_RE')[2].strip(' \n').partition(opt_separator)
badness_adj = a[0].strip(' ') badness_adj = a[0].strip(' ')
reg_exp = a[2].strip(' ') reg_exp = a[2].strip(' ')
valid_re(reg_exp) valid_re(reg_exp)
@ -1861,7 +1931,7 @@ try:
if line.startswith('@UID_RE'): if line.startswith('@UID_RE'):
a = line.partition( a = line.partition(
'@UID_RE')[2].strip(' \n').partition('///') '@UID_RE')[2].strip(' \n').partition(opt_separator)
badness_adj = a[0].strip(' ') badness_adj = a[0].strip(' ')
reg_exp = a[2].strip(' ') reg_exp = a[2].strip(' ')
valid_re(reg_exp) valid_re(reg_exp)
@ -1869,7 +1939,7 @@ try:
if line.startswith('@CGROUP_V1_RE'): if line.startswith('@CGROUP_V1_RE'):
a = line.partition( a = line.partition(
'@CGROUP_V1_RE')[2].strip(' \n').partition('///') '@CGROUP_V1_RE')[2].strip(' \n').partition(opt_separator)
badness_adj = a[0].strip(' ') badness_adj = a[0].strip(' ')
reg_exp = a[2].strip(' ') reg_exp = a[2].strip(' ')
valid_re(reg_exp) valid_re(reg_exp)
@ -1877,7 +1947,7 @@ try:
if line.startswith('@CGROUP_V2_RE'): if line.startswith('@CGROUP_V2_RE'):
a = line.partition( a = line.partition(
'@CGROUP_V2_RE')[2].strip(' \n').partition('///') '@CGROUP_V2_RE')[2].strip(' \n').partition(opt_separator)
badness_adj = a[0].strip(' ') badness_adj = a[0].strip(' ')
reg_exp = a[2].strip(' ') reg_exp = a[2].strip(' ')
valid_re(reg_exp) valid_re(reg_exp)
@ -1885,7 +1955,7 @@ try:
if line.startswith('@REALPATH_RE'): if line.startswith('@REALPATH_RE'):
a = line.partition( a = line.partition(
'@REALPATH_RE')[2].strip(' \n').partition('///') '@REALPATH_RE')[2].strip(' \n').partition(opt_separator)
badness_adj = a[0].strip(' ') badness_adj = a[0].strip(' ')
reg_exp = a[2].strip(' ') reg_exp = a[2].strip(' ')
valid_re(reg_exp) valid_re(reg_exp)
@ -1893,7 +1963,7 @@ try:
if line.startswith('@ENVIRON_RE'): if line.startswith('@ENVIRON_RE'):
a = line.partition( a = line.partition(
'@ENVIRON_RE')[2].strip(' \n').partition('///') '@ENVIRON_RE')[2].strip(' \n').partition(opt_separator)
badness_adj = a[0].strip(' ') badness_adj = a[0].strip(' ')
reg_exp = a[2].strip(' ') reg_exp = a[2].strip(' ')
valid_re(reg_exp) valid_re(reg_exp)
@ -1917,7 +1987,6 @@ except FileNotFoundError:
exit(1) exit(1)
if processname_re_list == []: if processname_re_list == []:
regex_matching = False regex_matching = False
else: else:
@ -1959,10 +2028,6 @@ else:
re_match_cgroup_v2 = True re_match_cgroup_v2 = True
# print(processname_re_list) # print(processname_re_list)
# print(cmdline_re_list) # print(cmdline_re_list)
# print(uid_re_list) # print(uid_re_list)
@ -1972,6 +2037,15 @@ else:
# print(cgroup_v2_re_list) # print(cgroup_v2_re_list)
print(soft_actions_list)
if soft_actions_list == []:
soft_actions = False
else:
soft_actions = True
print('soft_actions:', soft_actions)
########################################################################## ##########################################################################
@ -1989,7 +2063,6 @@ print_sleep_periods = conf_parse_bool('print_sleep_periods')
gui_low_memory_warnings = conf_parse_bool('gui_low_memory_warnings') gui_low_memory_warnings = conf_parse_bool('gui_low_memory_warnings')
gui_notifications = conf_parse_bool('gui_notifications') gui_notifications = conf_parse_bool('gui_notifications')
decrease_oom_score_adj = conf_parse_bool('decrease_oom_score_adj') decrease_oom_score_adj = conf_parse_bool('decrease_oom_score_adj')
execute_the_command = conf_parse_bool('execute_the_command')
ignore_psi = conf_parse_bool('ignore_psi') ignore_psi = conf_parse_bool('ignore_psi')
@ -2365,6 +2438,11 @@ if max_sleep_time < min_sleep_time:
if print_proc_table_flag: if print_proc_table_flag:
if not root:
log('WARNING: effective UID != 0; euid={}; processes with other e'
'uids will be invisible for nohang'.format(self_uid))
func_print_proc_table() func_print_proc_table()
@ -2494,11 +2572,6 @@ if print_config:
print('\n5. The execution of a specific command instead of sen' print('\n5. The execution of a specific command instead of sen'
'ding the\nSIGTERM signal\n') 'ding the\nSIGTERM signal\n')
print('execute_the_command: {}'.format(execute_the_command))
if execute_the_command:
print('\nPROCESS NAME COMMAND TO EXECUTE')
for key in etc_dict:
print('{} {}'.format(key.ljust(15), etc_dict[key]))
print('\n6. GUI notifications:\n- OOM prevention results and\n- low m' print('\n6. GUI notifications:\n- OOM prevention results and\n- low m'
'emory warnings\n') 'emory warnings\n')
@ -2553,6 +2626,9 @@ mlockall()
########################################################################## ##########################################################################
if not root:
log('WARNING: effective UID != 0; euid={}; processes with other e'
'uids will be invisible for nohang'.format(self_uid))
# if print_proc_table: # if print_proc_table:
# find_victim(print_proc_table) # find_victim(print_proc_table)
@ -2561,10 +2637,8 @@ log('Monitoring started!')
stdout.flush() stdout.flush()
########################################################################## ##########################################################################
psi_avg_string = '' # will be overwritten if PSI monitoring enabled psi_avg_string = '' # will be overwritten if PSI monitoring enabled

View File

@ -171,6 +171,7 @@ oom_score_adj_max = 20
Use script `oom-sort` from nohang package to view Use script `oom-sort` from nohang package to view
names, cmdlines and UIDs of processes. names, cmdlines and UIDs of processes.
5.1 Matching process names with RE patterns 5.1 Matching process names with RE patterns
Syntax: Syntax:
@ -196,13 +197,13 @@ oom_score_adj_max = 20
@CMDLINE_RE -200 /// ^/usr/lib/virtualbox @CMDLINE_RE -200 /// ^/usr/lib/virtualbox
5.3 Matching effective UIDs with RE patterns 5.3 Matching UIDs with RE patterns
The most slow option The most slow option
@UID_RE -100 /// ^0$ @UID_RE -100 /// ^0$
5.4 Matching CGroup_v1-line with RE patterns 5.4 Matching CGroup-line with RE patterns
@CGROUP_V1_RE -50 /// ^/system.slice @CGROUP_V1_RE -50 /// ^/system.slice
@ -210,15 +211,13 @@ oom_score_adj_max = 20
@CGROUP_V1_RE -50 /// ^/user.slice @CGROUP_V1_RE -50 /// ^/user.slice
5.5 Matching CGroup_v2-line with RE patterns
@CGROUP_V2_RE 100 /// ^/workload @CGROUP_V2_RE 100 /// ^/workload
5.6 Matching realpath with RE patterns 5.5 Matching realpath with RE patterns
@REALPATH_RE 20 /// ^/usr/bin/foo @REALPATH_RE 20 /// ^/usr/bin/foo
5.7 Matching environ with RE patterns 5.6 Matching environ with RE patterns
@ENVIRON_RE 100 /// USER=user @ENVIRON_RE 100 /// USER=user
@ -227,55 +226,22 @@ oom_score_adj_max = 20
##################################################################### #####################################################################
6. The execution of a specific command instead of sending the 6. Customize corrective actions.
SIGTERM signal.
[this section should be remaked] TODO: docs
For processes with a specific name you can specify a command to Syntax:
run instead of sending the SIGTERM signal. KEY REGEXP SEPARATOR COMMAND
For example, if the process is running as a daemon, you can run @SOFT_ACTION_RE_NAME ^foo$ /// kill -SEGV $PID
the restart command instead of sending SIGTERM. @SOFT_ACTION_RE_NAME ^bash$ /// kill -9 $PID
Valid values are True and False. @SOFT_ACTION_RE_CGROUP_V1 ^/system.slice/ /// systemctl restart $SERVICE
@SOFT_ACTION_RE_CGROUP_V1 foo.service$ /// systemctl restart $SERVICE
execute_the_command = False
The length of the process name can't exceed 15 characters.
The syntax is as follows: lines starting with keyword $ETC are
considered as the lines containing names of processes and
corresponding commands. After a name of process the triple slash
(///) follows. And then follows the command that will be
executed if the specified process is selected as a victim. The
ampersand (&) at the end of the command will allow nohang to
continue running without waiting for the end of the command
execution.
For example:
$ETC mysqld /// systemctl restart mariadb.service &
$ETC php-fpm7.0 /// systemctl restart php7.0-fpm.service
If the command contains the $PID pattern, this template ($PID) will
be replaced by the PID of the process whose name matches the RE pattern.
Example:
$ETC bash /// kill -KILL $PID
It is way to send any signal instead of SIGTERM.
(run `kill -L` to see list of all signals)
Also $NAME will be replaced by process name.
$ETC bash /// kill -9 $PID
$ETC firefox-esr /// kill -SEGV $PID
$ETC tail /// kill -9 $PID
$ETC apache2 /// systemctl restart apache2
$PID will be replaced by process PID.
$NAME will be replaced by process name.
$SERVICE will be replaced by the name of the .service unit if it exists (otherwise it will be replaced by an empty string).
##################################################################### #####################################################################
@ -283,7 +249,7 @@ $ETC apache2 /// systemctl restart apache2
- OOM prevention results and - OOM prevention results and
- low memory warnings - low memory warnings
gui_notifications = False gui_notifications = True
Enable GUI notifications about the low level of available memory. Enable GUI notifications about the low level of available memory.
Valid values are True and False. Valid values are True and False.
@ -324,9 +290,9 @@ print_config = False
Print memory check results. Print memory check results.
Valid values are True and False. Valid values are True and False.
print_mem_check_results = False print_mem_check_results = True
min_mem_report_interval = 60 min_mem_report_interval = 30
Print sleep periods between memory checks. Print sleep periods between memory checks.
Valid values are True and False. Valid values are True and False.

4
trash/memleak/install.sh Executable file
View File

@ -0,0 +1,4 @@
#!/bin/sh
# Install the memleak test daemon and its systemd unit, then ask systemd to
# re-read its unit files so the new service becomes visible.
#
# Run from the directory that contains ./memleak and ./memleak.service.
# NOTE(review): writing to /usr/sbin and /lib/systemd/system presumably
# requires root -- confirm how this script is expected to be invoked.

# Abort on the first failing command; without this a failed cp would still
# fall through to daemon-reload and the script would exit 0 after a partial
# install.
set -e

cp ./memleak /usr/sbin/memleak
cp ./memleak.service /lib/systemd/system/memleak.service
systemctl daemon-reload

12
trash/memleak/memleak Executable file
View File

@ -0,0 +1,12 @@
#!/usr/bin/env python3
"""Deliberately leak memory forever; a test workload for low-memory handling.

Each iteration keeps one more 9,999,999-character string alive, pauses for
0.1 s, and starts a background ``sleep 99`` through the shell.
"""
from os import system
from time import sleep


def main():
    """Grow an ever-larger list of big strings and never return."""
    hoard = []

    # NOTE(review): all three statements are taken to be the loop body --
    # anything placed after an endless loop would be unreachable.
    while True:
        # Keeping a reference in the list prevents the string from being
        # garbage-collected, so memory use only ever grows.
        hoard.append('#' * 9999999)
        sleep(0.1)
        # The trailing '&' backgrounds the command in the shell, so system()
        # returns without waiting for the 99-second sleep to finish.
        system('sleep 99 &')


if __name__ == '__main__':
    main()

View File

@ -0,0 +1,9 @@
# systemd unit that runs the memleak test daemon.
# NOTE(review): the file name is not shown here; presumably memleak.service.
[Unit]
Description=Memory leak daemon
# Order this unit's start-up after sysinit.target.
After=sysinit.target
[Service]
# Command executed as the service's main process.
ExecStart=/usr/sbin/memleak
[Install]
# When enabled, the unit is pulled in by multi-user.target.
WantedBy=multi-user.target