customize corrective actions: re match with cgroups and names

This commit is contained in:
Alexey Avramov 2019-04-22 19:51:02 +09:00
parent 6af3191e89
commit 448a60c3f0
5 changed files with 166 additions and 101 deletions

160
nohang
View File

@ -1145,6 +1145,8 @@ def find_victim(_print_proc_table):
pid_badness_list.append((pid, badness))
real_proc_num = len(pid_badness_list)
# Make list of (pid, badness) tuples, sorted by 'badness' values
# print(pid_badness_list)
pid_tuple_list = sorted(
@ -1162,6 +1164,8 @@ def find_victim(_print_proc_table):
if _print_proc_table:
log(hr)
log('Found {} processes with exists realpath'.format(real_proc_num))
log(
'Process with highest badness (found in {} ms):\n PID: {}, Na'
'me: {}, badness: {}'.format(
@ -1172,6 +1176,7 @@ def find_victim(_print_proc_table):
)
)
return pid, victim_badness, victim_name
@ -1413,9 +1418,37 @@ def implement_corrective_action(signal):
)
signal = SIGKILL
if execute_the_command and signal is SIGTERM and name in etc_dict:
soft_match = False
command = etc_dict[name]
if soft_actions and signal is SIGTERM:
# if the soft threshold is hit AND the list of soft actions is not empty:
# iterate over the list looking for matches. If there is a match, run
# the command and leave the loop.
name = pid_to_name(pid)
cgroup_v1 = pid_to_cgroup_v1(pid)
service = ''
cgroup_v1_tail = cgroup_v1.rpartition('/')[2]
print(cgroup_v1_tail)
if cgroup_v1_tail.endswith('.service'):
service = cgroup_v1_tail
print('$SERVICE:', [service])
print('ИЩЕМ СОВПАДЕНИЯ ДЛЯ МЯГКИХ ДЕЙСТВИЙ')
# iterate over the list of tuples
for i in soft_actions_list:
unit = i[0]
if unit == 'name':
u = name
else:
u = cgroup_v1
regexp = i[1]
command = i[2]
print([u, regexp, command])
if search(regexp, u) is not None:
print('СОВПАДЕНИЕ НАЙДЕНО')
soft_match = True
break
if soft_match:
# todo: make new func
m = check_mem_and_swap()
@ -1428,8 +1461,13 @@ def implement_corrective_action(signal):
)
)
cmd = etc_dict[name].replace('$PID', pid).replace(
'$NAME', pid_to_name(pid))
cmd = command.replace(
'$PID',
pid).replace(
'$NAME',
pid_to_name(pid)).replace(
'$SERVICE',
service)
exit_status = exe(cmd)
@ -1441,9 +1479,7 @@ def implement_corrective_action(signal):
'ion:\n Run the command: {}' \
'\n Exit status: {}; total response ' \
'time: {} ms'.format(
command.replace(
'$PID', pid).replace(
'$NAME', pid_to_name(pid)),
cmd,
exit_status,
round(response_time * 1000))
@ -1699,6 +1735,7 @@ if len(argv) == 1:
config = os.getcwd() + '/nohang.conf'
else:
config = '/etc/nohang/nohang.conf'
elif len(argv) == 2:
if argv[1] == '--help' or argv[1] == '-h':
print(help_mess)
@ -1716,12 +1753,14 @@ elif len(argv) == 2:
else:
errprint('Unknown option: {}'.format(argv[1]))
exit(1)
elif len(argv) == 3:
if argv[1] == '--config' or argv[1] == '-c':
config = argv[2]
else:
errprint('Unknown option: {}'.format(argv[1]))
exit(1)
else:
errprint('Invalid CLI input: too many options')
exit(1)
@ -1794,8 +1833,6 @@ print('Config:', config)
config_dict = dict()
processname_re_list = []
cmdline_re_list = []
environ_re_list = []
uid_re_list = []
@ -1803,12 +1840,14 @@ cgroup_v1_re_list = []
cgroup_v2_re_list = []
realpath_re_list = []
# dictionary with names and commands for the parameter
# execute_the_command
# a list is needed here too, not a dictionary
etc_dict = dict()
soft_actions_list = []
# separator for optional parameters (that starts with @)
opt_separator = '///'
# stupid conf parsing, need refactoring
try:
with open(config) as f:
@ -1819,9 +1858,10 @@ try:
c = line.startswith('\t')
d = line.startswith(' ')
etc = line.startswith('$ETC')
etc = line.startswith('@SOFT_ACTION_RE_NAME')
etc2 = line.startswith('@SOFT_ACTION_RE_CGROUP_V1')
if not a and not b and not c and not d and not etc:
if not a and not b and not c and not d and not etc and not etc2:
a = line.partition('=')
key = a[0].strip()
@ -1834,18 +1874,48 @@ try:
exit(1)
if etc:
a = line[4:].split('///')
etc_name = a[0].strip()
etc_command = a[1].strip()
if len(etc_name) > 15:
errprint('Invalid config, the length of the process '
'name must not exceed 15 characters\nExit')
exit(1)
etc_dict[etc_name] = etc_command
# this is the rest of the line without the leading key. It contains:
# regexp /// command
a = line.partition('@SOFT_ACTION_RE_NAME')[
2].partition(opt_separator)
a1 = 'name'
a2 = a[0].strip()
valid_re(a2)
a3 = a[2].strip()
zzz = (a1, a2, a3)
# print(zzz)
soft_actions_list.append(zzz)
if etc2:
# this is the rest of the line without the leading key. It contains:
# regexp /// command
a = line.partition('@SOFT_ACTION_RE_CGROUP_V1')[
2].partition(opt_separator)
a1 = 'cgroup_v1'
a2 = a[0].strip()
valid_re(a2)
a3 = a[2].strip()
zzz = (a1, a2, a3)
# print(zzz)
soft_actions_list.append(zzz)
if line.startswith('@PROCESSNAME_RE'):
a = line.partition(
'@PROCESSNAME_RE')[2].strip(' \n').partition('///')
'@PROCESSNAME_RE')[2].strip(' \n').partition(opt_separator)
badness_adj = a[0].strip(' ')
reg_exp = a[2].strip(' ')
valid_re(reg_exp)
@ -1853,7 +1923,7 @@ try:
if line.startswith('@CMDLINE_RE'):
a = line.partition(
'@CMDLINE_RE')[2].strip(' \n').partition('///')
'@CMDLINE_RE')[2].strip(' \n').partition(opt_separator)
badness_adj = a[0].strip(' ')
reg_exp = a[2].strip(' ')
valid_re(reg_exp)
@ -1861,7 +1931,7 @@ try:
if line.startswith('@UID_RE'):
a = line.partition(
'@UID_RE')[2].strip(' \n').partition('///')
'@UID_RE')[2].strip(' \n').partition(opt_separator)
badness_adj = a[0].strip(' ')
reg_exp = a[2].strip(' ')
valid_re(reg_exp)
@ -1869,7 +1939,7 @@ try:
if line.startswith('@CGROUP_V1_RE'):
a = line.partition(
'@CGROUP_V1_RE')[2].strip(' \n').partition('///')
'@CGROUP_V1_RE')[2].strip(' \n').partition(opt_separator)
badness_adj = a[0].strip(' ')
reg_exp = a[2].strip(' ')
valid_re(reg_exp)
@ -1877,7 +1947,7 @@ try:
if line.startswith('@CGROUP_V2_RE'):
a = line.partition(
'@CGROUP_V2_RE')[2].strip(' \n').partition('///')
'@CGROUP_V2_RE')[2].strip(' \n').partition(opt_separator)
badness_adj = a[0].strip(' ')
reg_exp = a[2].strip(' ')
valid_re(reg_exp)
@ -1885,7 +1955,7 @@ try:
if line.startswith('@REALPATH_RE'):
a = line.partition(
'@REALPATH_RE')[2].strip(' \n').partition('///')
'@REALPATH_RE')[2].strip(' \n').partition(opt_separator)
badness_adj = a[0].strip(' ')
reg_exp = a[2].strip(' ')
valid_re(reg_exp)
@ -1893,7 +1963,7 @@ try:
if line.startswith('@ENVIRON_RE'):
a = line.partition(
'@ENVIRON_RE')[2].strip(' \n').partition('///')
'@ENVIRON_RE')[2].strip(' \n').partition(opt_separator)
badness_adj = a[0].strip(' ')
reg_exp = a[2].strip(' ')
valid_re(reg_exp)
@ -1917,7 +1987,6 @@ except FileNotFoundError:
exit(1)
if processname_re_list == []:
regex_matching = False
else:
@ -1959,10 +2028,6 @@ else:
re_match_cgroup_v2 = True
# print(processname_re_list)
# print(cmdline_re_list)
# print(uid_re_list)
@ -1972,6 +2037,15 @@ else:
# print(cgroup_v2_re_list)
print(soft_actions_list)
if soft_actions_list == []:
soft_actions = False
else:
soft_actions = True
print('soft_actions:', soft_actions)
##########################################################################
@ -1989,7 +2063,6 @@ print_sleep_periods = conf_parse_bool('print_sleep_periods')
gui_low_memory_warnings = conf_parse_bool('gui_low_memory_warnings')
gui_notifications = conf_parse_bool('gui_notifications')
decrease_oom_score_adj = conf_parse_bool('decrease_oom_score_adj')
execute_the_command = conf_parse_bool('execute_the_command')
ignore_psi = conf_parse_bool('ignore_psi')
@ -2365,6 +2438,11 @@ if max_sleep_time < min_sleep_time:
if print_proc_table_flag:
if not root:
log('WARNING: effective UID != 0; euid={}; processes with other e'
'uids will be invisible for nohang'.format(self_uid))
func_print_proc_table()
@ -2494,11 +2572,6 @@ if print_config:
print('\n5. The execution of a specific command instead of sen'
'ding the\nSIGTERM signal\n')
print('execute_the_command: {}'.format(execute_the_command))
if execute_the_command:
print('\nPROCESS NAME COMMAND TO EXECUTE')
for key in etc_dict:
print('{} {}'.format(key.ljust(15), etc_dict[key]))
print('\n6. GUI notifications:\n- OOM prevention results and\n- low m'
'emory warnings\n')
@ -2553,6 +2626,9 @@ mlockall()
##########################################################################
if not root:
log('WARNING: effective UID != 0; euid={}; processes with other e'
'uids will be invisible for nohang'.format(self_uid))
# if print_proc_table:
# find_victim(print_proc_table)
@ -2561,10 +2637,8 @@ log('Monitoring started!')
stdout.flush()
##########################################################################
psi_avg_string = '' # will be overwritten if PSI monitoring enabled

View File

@ -171,6 +171,7 @@ oom_score_adj_max = 20
Use script `oom-sort` from nohang package to view
names, cmdlines and UIDs of processes.
5.1 Matching process names with RE patterns
Syntax:
@ -196,13 +197,13 @@ oom_score_adj_max = 20
@CMDLINE_RE -200 /// ^/usr/lib/virtualbox
5.3 Matching effective UIDs with RE patterns
5.3 Matching UIDs with RE patterns
The slowest option
@UID_RE -100 /// ^0$
5.4 Matching CGroup_v1-line with RE patterns
5.4 Matching CGroup-line with RE patterns
@CGROUP_V1_RE -50 /// ^/system.slice
@ -210,15 +211,13 @@ oom_score_adj_max = 20
@CGROUP_V1_RE -50 /// ^/user.slice
5.5 Matching CGroup_v2-line with RE patterns
@CGROUP_V2_RE 100 /// ^/workload
5.6 Matching realpath with RE patterns
5.5 Matching realpath with RE patterns
@REALPATH_RE 20 /// ^/usr/bin/foo
5.7 Matching environ with RE patterns
5.6 Matching environ with RE patterns
@ENVIRON_RE 100 /// USER=user
@ -227,55 +226,22 @@ oom_score_adj_max = 20
#####################################################################
6. The execution of a specific command instead of sending the
SIGTERM signal.
6. Customize corrective actions.
[this section should be rewritten]
TODO: docs
For processes with a specific name you can specify a command to
run instead of sending the SIGTERM signal.
Syntax:
KEY REGEXP SEPARATOR COMMAND
For example, if the process is running as a daemon, you can run
the restart command instead of sending SIGTERM.
@SOFT_ACTION_RE_NAME ^foo$ /// kill -SEGV $PID
@SOFT_ACTION_RE_NAME ^bash$ /// kill -9 $PID
Valid values are True and False.
execute_the_command = False
The length of the process name can't exceed 15 characters.
The syntax is as follows: lines starting with keyword $ETC are
considered as the lines containing names of processes and
corresponding commands. After a name of process the triple slash
(///) follows. And then follows the command that will be
executed if the specified process is selected as a victim. The
ampersand (&) at the end of the command will allow nohang to
continue running without waiting for the end of the command
execution.
For example:
$ETC mysqld /// systemctl restart mariadb.service &
$ETC php-fpm7.0 /// systemctl restart php7.0-fpm.service
If the command contains the $PID pattern, this template ($PID) will
be replaced by the PID of the process whose name matches the RE pattern.
Example:
$ETC bash /// kill -KILL $PID
It is a way to send any signal instead of SIGTERM.
(run `kill -L` to see list of all signals)
Also $NAME will be replaced by process name.
$ETC bash /// kill -9 $PID
$ETC firefox-esr /// kill -SEGV $PID
$ETC tail /// kill -9 $PID
$ETC apache2 /// systemctl restart apache2
@SOFT_ACTION_RE_CGROUP_V1 ^/system.slice/ /// systemctl restart $SERVICE
@SOFT_ACTION_RE_CGROUP_V1 foo.service$ /// systemctl restart $SERVICE
$PID will be replaced by process PID.
$NAME will be replaced by process name.
$SERVICE will be replaced by the last component of the process's cgroup_v1 line if it ends with .service (otherwise it will be replaced by an empty string).
#####################################################################
@ -283,7 +249,7 @@ $ETC apache2 /// systemctl restart apache2
- OOM prevention results and
- low memory warnings
gui_notifications = False
gui_notifications = True
Enable GUI notifications about the low level of available memory.
Valid values are True and False.
@ -324,9 +290,9 @@ print_config = False
Print memory check results.
Valid values are True and False.
print_mem_check_results = False
print_mem_check_results = True
min_mem_report_interval = 60
min_mem_report_interval = 30
Print sleep periods between memory checks.
Valid values are True and False.

4
trash/memleak/install.sh Executable file
View File

@ -0,0 +1,4 @@
#!/bin/sh
# Install the memleak test daemon and its systemd unit, then ask systemd
# to reload its unit files. Run from the directory that contains
# ./memleak and ./memleak.service; needs write access to the target paths.

# Stop at the first failing command, so that daemon-reload is not run (and
# the script does not exit 0) after a failed copy.
set -e

cp ./memleak /usr/sbin/memleak
cp ./memleak.service /lib/systemd/system/memleak.service
systemctl daemon-reload

12
trash/memleak/memleak Executable file
View File

@ -0,0 +1,12 @@
#!/usr/bin/env python3
# Test helper that keeps allocating memory and starts background `sleep`
# jobs; the `while True` loop below has no exit condition.
from os import system
from time import sleep
# Every allocated string is stored here, so the references are never dropped.
x = []
while True:
# Append a new string of 9999999 '#' characters.
x.append('#' * 9999999)
# Pause for 0.1 s.
sleep(0.1)
# Run `sleep 99 &` through the shell.
system('sleep 99 &')

View File

@ -0,0 +1,9 @@
# systemd unit that runs the memleak test program as a service.
[Unit]
Description=Memory leak daemon
# Order this unit after sysinit.target.
After=sysinit.target
[Service]
# Program started for this service.
ExecStart=/usr/sbin/memleak
[Install]
# Enabling the unit attaches it to multi-user.target.
WantedBy=multi-user.target