Remove nohang_notify_helper, use threading instead

This commit is contained in:
Alexey Avramov 2019-09-18 01:11:45 +09:00
parent 28f02c4cab
commit 9b8cd499e2
8 changed files with 4026 additions and 130 deletions

View File

@ -12,7 +12,6 @@ install:
install -d $(DESTDIR)$(BINDIR)
install -m0755 nohang $(DESTDIR)$(BINDIR)/nohang
install -m0755 nohang_notify_helper $(DESTDIR)$(BINDIR)/nohang_notify_helper
install -m0755 oom-sort $(DESTDIR)$(BINDIR)/oom-sort
install -m0755 psi-top $(DESTDIR)$(BINDIR)/psi-top
install -m0755 psi-monitor $(DESTDIR)$(BINDIR)/psi-monitor
@ -43,7 +42,6 @@ install-desktop:
install -d $(DESTDIR)$(BINDIR)
install -m0755 nohang $(DESTDIR)$(BINDIR)/nohang
install -m0755 nohang_notify_helper $(DESTDIR)$(BINDIR)/nohang_notify_helper
install -m0755 oom-sort $(DESTDIR)$(BINDIR)/oom-sort
install -m0755 psi-top $(DESTDIR)$(BINDIR)/psi-top
install -m0755 psi-monitor $(DESTDIR)$(BINDIR)/psi-monitor
@ -76,7 +74,6 @@ uninstall:
-systemctl disable nohang.service || true
-systemctl daemon-reload
rm -fv $(DESTDIR)$(BINDIR)/nohang
rm -fv $(DESTDIR)$(BINDIR)/nohang_notify_helper
rm -fv $(DESTDIR)$(BINDIR)/oom-sort
rm -fv $(DESTDIR)$(BINDIR)/psi-top
rm -fv $(DESTDIR)$(BINDIR)/psi-monitor
@ -95,7 +92,6 @@ systemd:
pylint:
-pylint3 -E nohang
-pylint3 -E nohang_notify_helper
-pylint3 -E oom-sort
-pylint3 -E psi-top
-pylint3 -E psi-monitor

419
nohang
View File

@ -17,6 +17,265 @@ from threading import Thread
# define functions
def exe(cmd):
    """Execute a shell command and return its exit status.

    cmd: str, command line passed to os.system()

    write_self_oom_score_adj() is called with self_oom_score_adj_max
    before the command is run and with self_oom_score_adj_min afterwards
    (presumably so that the spawned command does not inherit the low
    oom_score_adj of the daemon -- TODO confirm).

    Returns the value returned by os.system(cmd).
    """
    log('Execute the command: {}'.format(cmd))
    t0 = time()
    write_self_oom_score_adj(self_oom_score_adj_max)
    try:
        err = os.system(cmd)
    finally:
        # Restore the value even if os.system() raises (for example
        # ValueError for a command containing a null byte); otherwise
        # this process would keep running with the raised value.
        write_self_oom_score_adj(self_oom_score_adj_min)
    dt = time() - t0
    log('Exit status: {}; exe duration: {} sec'.format(err, round(dt, 3)))
    return err
def go(func, *a):
    """Run func(*a) in a new thread.

    func: callable to run
    *a: positional arguments passed to func

    Nothing is returned. When debug_threading is set, the start of the
    thread, the time it took to start and a failure to start are logged.
    """
    t1 = time()
    th = Thread(target=func, args=a)
    # Thread.getName() is deprecated since Python 3.10; the `name`
    # attribute holds the same value.
    th_name = th.name
    if debug_threading:
        log('Starting {}'.format(th_name))
    try:
        th.start()
    except RuntimeError:
        if debug_threading:
            log('RuntimeError: cannot start {}'.format(th_name))
        return
    t2 = time()
    if debug_threading:
        log('{} has started in {} ms'.format(
            th_name, round((t2 - t1) * 1000, 1)))
def re_pid_environ(pid):
    """Read the environ of one process and extract the notification vars.

    pid: str, name of the process directory in /proc

    Returns a tuple (user, display, dbus) like the following:
        ('user', 'DISPLAY=:0',
         'DBUS_SESSION_BUS_ADDRESS=unix:path=/run/user/1000/bus')

    Returns None if:
    - /proc/[pid]/environ cannot be read;
    - any of the USER, DISPLAY, DBUS_SESSION_BUS_ADDRESS variables is
      missing from the environ;
    - a variable starting with 'HOME=/var' is met before the end of the
      list (Display Manager's user);
    - the user is root.
    """
    try:
        # errors='replace': a single undecodable byte anywhere in the
        # environ must not raise UnicodeDecodeError in the caller.
        with open('/proc/' + pid + '/environ', errors='replace') as f:
            env = f.read()
    except FileNotFoundError:
        log('notify helper: FileNotFoundError')
        return None
    except ProcessLookupError:
        log('notify helper: ProcessLookupError')
        return None

    # cheap substring pre-check before splitting the whole environ
    if not (display_env in env and dbus_env in env and user_env in env):
        return None

    user = display = dbus = None

    # iterating over a list of process environment variables
    for i in env.split('\x00'):

        # exclude Display Manager's user
        if i.startswith('HOME=/var'):
            return None

        if i.startswith(user_env):
            if i == 'USER=root':
                return None
            user = i
        elif i.startswith(display_env):
            # NOTE(review): only the first 10 characters are kept, so
            # 'DISPLAY=:0.0' becomes 'DISPLAY=:0', but 'DISPLAY=:10' is
            # cut to 'DISPLAY=:1' -- confirm that this is intended.
            display = i[:10]
        elif i.startswith(dbus_env):
            dbus = i

    # The substring pre-check can pass without a variable that really
    # starts with the prefix (e.g. 'WAYLAND_DISPLAY=' contains 'DISPLAY=').
    # The log text is kept as it was when this case was detected by
    # catching UnboundLocalError, so that existing log output is unchanged.
    if user is None or display is None or dbus is None:
        log('notify helper: UnboundLocalError')
        return None

    return user.partition('USER=')[2], display, dbus
def root_notify_env():
    """Collect notification environments of all processes found in /proc.

    Every entry of /proc that has an 'exe' item is passed to
    re_pid_environ(). The results are deduplicated, None is dropped, and
    only one tuple is kept for each user + display combination.

    Returns a list of (user, display, dbus) tuples.
    """
    # one result per process with suitable /proc/[pid]/exe
    candidates = [
        re_pid_environ(entry)
        for entry in os.listdir('/proc')
        if os.path.exists('/proc/' + entry + '/exe')
    ]

    unique_envs = set(candidates)
    unique_envs.discard(None)

    # keep a single dbus address per user + display
    seen_keys = []
    result = []
    for item in unique_envs:
        user_and_display = item[0] + item[1]
        if user_and_display in seen_keys:
            continue
        seen_keys.append(user_and_display)
        result.append(item)

    return result
def pop(cmd, username):
    """Run cmd as a child process and wait for it with a timeout.

    cmd: list of str, argument vector passed to Popen
    username: str, used only in the debug log message

    The child is waited for 2 seconds if swap_total is 0, otherwise for
    20 seconds; a child that has not exited by then is killed.
    Messages are logged only if debug_gui_notifications is set.
    """
    wait_time = 2 if swap_total == 0 else 20

    started = time()

    with Popen(cmd) as child:
        try:
            child.wait(timeout=wait_time)
        except TimeoutExpired:
            child.kill()
            if debug_gui_notifications:
                log('TimeoutExpired: notify user: {}'.format(username))

    finished = time()

    if debug_gui_notifications:
        elapsed = round(finished - started, 3)
        log('Popen time: {} sec; cmd: {}'.format(elapsed, cmd))
def send_notification(title, body):
    """Send a GUI notification with the given title and body.

    title: str, notification title
    body: str, notification body

    If self_uid is not 0, notify-send is run directly via pop() in the
    current thread. Otherwise the list of (user, display, dbus) tuples is
    taken from root_notify_env(); the list is cached in envd and refreshed
    when it is older than env_cache_time seconds. For every tuple a
    `sudo -u USER env ... notify-send` command is run via pop() in a
    new thread started with go().
    """
    if self_uid != 0:
        pop(['notify-send', '--icon=dialog-warning', title, body],
            '(UID={})'.format(self_uid))
        return None

    search_started = time()

    # refresh the cache if it is empty or too old
    if envd['t'] is None or time() - envd['t'] > env_cache_time:
        list_with_envs = root_notify_env()
        envd['list_with_envs'] = list_with_envs
        envd['t'] = time()
    else:
        list_with_envs = envd['list_with_envs']

    nobody_logged_in = len(list_with_envs) == 0

    search_finished = time()
    if debug_gui_notifications:
        log('Find env time: {} ms'.format(
            round((search_finished - search_started) * 1000)))

    # nothing to do if nobody logged in with GUI
    if nobody_logged_in:
        return None

    # all debug messages are logged before the first thread is started
    for session in list_with_envs:
        if debug_gui_notifications:
            log('Send a GUI notification:\n ',
                'title: ', [title],
                '\n body: ', [body],
                '\n user/env:', session
                )

    # iterating over logged-in users
    for session in list_with_envs:
        username, display_var, dbus_var = session
        display_value = display_var.partition('=')[2]
        dbus_value = dbus_var.partition('=')[2]

        go(pop,
           ['sudo', '-u', username,
            'env',
            'DISPLAY=' + display_value,
            'DBUS_SESSION_BUS_ADDRESS=' + dbus_value,
            'notify-send',
            '--icon=dialog-warning',
            title,
            body],
           username)
def send_notify_warn():
    """Implement low memory warnings.

    If check_warning_exe is set, warning_exe is executed via exe() in a
    new thread. Otherwise a 'Low memory' notification with the
    percentages of available memory and free swap is sent via
    send_notification() in a new thread.
    """
    log('Warning threshold exceeded')

    if check_warning_exe:
        go(exe, warning_exe)
        return

    mem_percent = round(mem_available / mem_total * 100)
    # + 0.1 avoids division by zero when there is no swap
    swap_percent = round(swap_free / (swap_total + 0.1) * 100)

    go(send_notification,
       'Low memory',
       'MemAvail: {}%\nSwapFree: {}%'.format(mem_percent, swap_percent))
def send_notify(threshold, name, pid):
    """Notify about OOM prevention.

    threshold: key for notify_sig_dict
    name: str, process name
    pid: str, process pid

    The notification is sent via send_notification() in a new thread.
    """
    action_text = notify_sig_dict[threshold]
    # symbol '&' can break notifications in some themes,
    # therefore it is replaced by '*'
    safe_name = name.replace('&', '*')

    go(send_notification,
       'Freeze prevention',
       '<b>{}</b> [{}] <b>{}</b>'.format(action_text, pid, safe_name))
def send_notify_etc(pid, name, command):
    """Notify about OOM prevention done by executing a command.

    pid: str, process pid
    name: str, process name
    command: str, command that will be executed

    Ampersands in name and command are replaced by asterisks.
    The notification is sent via send_notification() in a new thread.
    """
    template = ('<b>Victim is</b> [{}] <b>{}</b>\n'
                'Execute the command:\n<b>{}</b>')
    safe_name = name.replace('&', '*')
    safe_command = command.replace('&', '*')

    go(send_notification,
       'Freeze prevention',
       template.format(pid, safe_name, safe_command))
def check_config():
"""
"""
@ -181,15 +440,6 @@ def check_config():
exit()
def encoder(string):
    """Encode a string as decimal code points separated by colons.

    string: str to encode

    Returns a str, for example 'ab' -> '97:98'.
    An empty string is encoded as an empty string.
    """
    # join() builds the result in one pass and needs no trailing
    # separator to be cut off afterwards
    return ':'.join(str(ord(char)) for char in string)
def get_swap_threshold_tuple(string):
# re (Num %, True) or (Num KiB, False)
"""Returns KiB value if abs val was set in config, or tuple with %"""
@ -292,36 +542,6 @@ def signal_handler_inner(signum, frame):
sig_dict[signum]))
def exe(cmd):
    """Execute a shell command and return its exit status.

    cmd: str, command line passed to os.system()

    write_self_oom_score_adj() is called with self_oom_score_adj_max
    before the command is run and with self_oom_score_adj_min afterwards.

    Returns the value returned by os.system(cmd).
    """
    log('Execute the command: {}'.format(cmd))
    t0 = time()
    write_self_oom_score_adj(self_oom_score_adj_max)
    try:
        err = os.system(cmd)
    finally:
        # Restore the value even if os.system() raises (for example
        # ValueError for a command containing a null byte).
        write_self_oom_score_adj(self_oom_score_adj_min)
    dt = time() - t0
    log('Exit status: {}; exe duration: {} sec'.format(err, round(dt, 3)))
    return err
def go(func, *a):
    """Run func(*a) in a new thread.

    func: callable to run
    *a: positional arguments passed to func

    Returns 0 if the thread has been started, 1 if RuntimeError was
    raised while spawning it.
    """
    started = time()

    try:
        worker = Thread(target=func, args=a)
        worker.start()
    except RuntimeError:
        print('RuntimeError: cannot spawn a new thread')
        return 1

    spawn_ms = round((time() - started) * 1000, 1)
    log('New thread spawned in {} ms'.format(spawn_ms))
    return 0
def write(path, string):
"""
"""
@ -357,15 +577,9 @@ def func_print_proc_table():
def log(*msg):
    """Print a message and, if separate_log is set, also log it.

    *msg: objects to output; they are printed with print(*msg), i.e.
          separated by spaces

    Each output call is retried once after 10 ms if it raises OSError.
    """
    try:
        print(*msg)
    except OSError:
        sleep(0.01)
        print(*msg)

    if separate_log:
        # logging.info(msg, *args) treats additional positional arguments
        # as %-format arguments of the first one, so print-style calls
        # with several arguments would end in a logging error. Build the
        # same text that print() outputs and pass it as a single message.
        text = ' '.join(str(part) for part in msg)
        try:
            logging.info(text)
        except OSError:
            sleep(0.01)
            logging.info(text)
def print_version():
@ -1009,80 +1223,6 @@ def zram_stat(zram_id):
return disksize, mem_used_total # BYTES, str
def send_notify_warn():
    """Implement low memory warnings.

    If check_warning_exe is set, warning_exe is executed via exe().
    Otherwise a 'Low memory' notification with the percentages of
    available memory and free swap is sent via send_notification().
    """
    log('Warning threshold exceeded')

    if check_warning_exe:
        exe(warning_exe)
        return

    mem_percent = round(mem_available / mem_total * 100)
    # + 0.1 avoids division by zero when there is no swap
    swap_percent = round(swap_free / (swap_total + 0.1) * 100)

    send_notification(
        'Low memory',
        'MemAvail: {}%\nSwapFree: {}%'.format(mem_percent, swap_percent))
def send_notify(threshold, name, pid):
    """Notify about OOM prevention.

    threshold: key for notify_sig_dict
    name: str, process name
    pid: str, process pid
    """
    action_text = notify_sig_dict[threshold]
    # symbol '&' can break notifications in some themes,
    # therefore it is replaced by '*'
    safe_name = name.replace('&', '*')

    send_notification(
        'Freeze prevention',
        '<b>{}</b> [{}] <b>{}</b>'.format(action_text, pid, safe_name))
def send_notify_etc(pid, name, command):
    """Notify about OOM prevention done by executing a command.

    pid: str, process pid
    name: str, process name
    command: str, command that will be executed

    Ampersands in name and command are replaced by asterisks.
    """
    template = ('<b>Victim is</b> [{}] <b>{}</b>\n'
                'Execute the command:\n<b>{}</b>')
    safe_name = name.replace('&', '*')
    safe_command = command.replace('&', '*')

    send_notification(
        'Freeze prevention',
        template.format(pid, safe_name, safe_command))
def send_notification(title, body):
    """Send a GUI notification by running the notify helper.

    title: str, notification title (passed as is)
    body: str, notification body (passed through encoder())

    The helper command line is built from notify_helper_path, self_uid
    and debug_gui_notifications and is executed via exe() in a new
    thread started with go().
    """
    template = '{} "--euid={}" "--debug={}" "--title={}" "--body={}" &'
    helper_cmd = template.format(
        notify_helper_path,
        self_uid,
        debug_gui_notifications,
        title,
        encoder(body))

    go(exe, helper_cmd)
def get_pid_list():
"""
Find pid list expect kthreads and zombies
@ -2163,11 +2303,7 @@ def sleep_after_check_mem():
log('Sleep {} sec (t_mem={}, t_swap={}{})'.format(round(t, 2), round(
t_mem, 2), round(t_swap, 2), z))
try:
stdout.flush()
except OSError:
pass
stdout.flush()
sleep(t)
@ -2631,10 +2767,19 @@ print_config_at_startup = conf_parse_bool('print_config_at_startup')
print_mem_check_results = conf_parse_bool('print_mem_check_results')
debug_sleep = conf_parse_bool('debug_sleep')
low_memory_warnings_enabled = conf_parse_bool('low_memory_warnings_enabled')
# This value must be parsed before the condition below reads it: with
# low_memory_warnings_enabled set to False the `or` evaluates its second
# operand, which would raise NameError if the name was not assigned yet.
post_action_gui_notifications = conf_parse_bool(
    'post_action_gui_notifications')
# Popen and TimeoutExpired are used by pop() to run notify-send, so they
# are imported only if some kind of GUI notifications is enabled.
if low_memory_warnings_enabled or post_action_gui_notifications:
    from subprocess import Popen, TimeoutExpired
debug_threading = conf_parse_bool('debug_threading')
psi_checking_enabled = conf_parse_bool('psi_checking_enabled')
ignore_psi = not psi_checking_enabled
@ -2694,6 +2839,20 @@ else:
exit(1)
# env_cache_time: required config parameter, non-negative float.
# send_notification() compares the age of the cached list of notification
# environments with this value (in seconds, as it is compared with a
# difference of time() values).
if 'env_cache_time' in config_dict:
    env_cache_time = string_to_float_convert_test(
        config_dict['env_cache_time'])
    if env_cache_time is None:
        errprint('Invalid env_cache_time value, not float\nExit')
        exit(1)
    if env_cache_time < 0:
        # the message must name the parameter that is actually checked
        errprint('env_cache_time MUST be >= 0\nExit')
        exit(1)
else:
    errprint('env_cache_time not in config\nExit')
    exit(1)
if 'fill_rate_mem' in config_dict:
fill_rate_mem = string_to_float_convert_test(config_dict['fill_rate_mem'])
if fill_rate_mem is None:
@ -3201,6 +3360,14 @@ log('Monitoring has started!')
stdout.flush()
# Prefixes of the environment variables that re_pid_environ() looks for
# in /proc/[pid]/environ.
user_env = 'USER='
display_env = 'DISPLAY='
dbus_env = 'DBUS_SESSION_BUS_ADDRESS='

# Cache used by send_notification():
# 'list_with_envs' - the last result of root_notify_env(),
# 't' - the time when it was obtained (None: nothing is cached yet).
envd = {'list_with_envs': None, 't': None}
##########################################################################

View File

@ -297,6 +297,9 @@ warning_threshold_max_psi = 100
min_post_warning_delay = 30
env_cache_time = 300
Ampersands (&) will be replaced with asterisks (*) in process
names and in commands.
@ -347,6 +350,8 @@ debug_sleep = False
separate_log = False
debug_threading = False
###############################################################################
9. Misc

View File

@ -293,6 +293,8 @@ warning_threshold_max_psi = 100
min_post_warning_delay = 20
env_cache_time = 300
Ampersands (&) will be replaced with asterisks (*) in process
names and in commands.
@ -343,6 +345,8 @@ debug_sleep = False
separate_log = False
debug_threading = False
###############################################################################
9. Misc

3360
old/nohang Executable file

File diff suppressed because it is too large Load Diff

359
old/nohang.conf Normal file
View File

@ -0,0 +1,359 @@
This is nohang config file.
Lines starting with #, tabs and spaces are comments.
Lines starting with @ contain optional parameters.
All values are case sensitive.
Be careful: nohang doesn't forbid you to shoot yourself in the foot.
The configuration includes the following sections:
0. Common zram settings
1. Memory levels to respond to as an OOM threat
2. Response on PSI memory metrics
3. The frequency of checking the level of available memory
(and CPU usage)
4. The prevention of killing innocent victims
5. Impact on the badness of processes via matching their names, cgroups and
cmdlines with specified regular expressions
6. Customize corrective actions: the execution of a specific command
instead of sending the SIGTERM signal
7. GUI notifications:
- low memory warnings
- OOM prevention results
8. Output verbosity
9. Misc
Just read the description of the parameters and edit the values.
Please restart the program after editing the config.
More docs will be written later.
###############################################################################
0. Common zram settings
See https://www.kernel.org/doc/Documentation/blockdev/zram.txt
You may need to set `zram_checking_enabled = True` if you have a large zram disksize.
zram_checking_enabled = False
###############################################################################
1. Thresholds below which a signal should be sent to the victim
Sets the available memory levels at or below which SIGTERM or SIGKILL
signals are sent. A signal is sent when MemAvailable and
SwapFree (in /proc/meminfo) both drop below the
corresponding values. Can be specified in % (percent) and M (MiB).
Valid values are floating-point numbers from the range [0; 100] %.
MemAvailable levels.
soft_threshold_min_mem = 8 %
hard_threshold_min_mem = 4 %
SwapFree levels.
soft_threshold_min_swap = 10 %
hard_threshold_min_swap = 5 %
Sets the total share of zram in memory; if it is exceeded, the
corresponding signals are sent. As the share of zram in memory
increases, the responsiveness of the system may drop. 90 % is a
usual hang level; setting very high values is not recommended.
Can be specified in % and M. Valid values are floating-point
numbers from the range [0; 90] %.
soft_threshold_max_zram = 60 %
hard_threshold_max_zram = 65 %
###############################################################################
2. Response on PSI memory metrics (it needs Linux 4.20 and up)
About PSI:
https://facebookmicrosites.github.io/psi/
Disabled by default (psi_checking_enabled = False).
psi_checking_enabled = False
Choose a path to PSI file.
By default it monitors system-wide file: /proc/pressure/memory
You also can set file to monitor one cgroup slice.
For example:
psi_path = /sys/fs/cgroup/unified/user.slice/memory.pressure
psi_path = /sys/fs/cgroup/unified/system.slice/memory.pressure
psi_path = /sys/fs/cgroup/unified/system.slice/foo.service/memory.pressure
Execute the command
find /sys/fs/cgroup -name memory.pressure
to find available memory.pressure files (except /proc/pressure/memory).
(actual for cgroup2)
psi_path = /proc/pressure/memory
Valid psi_metrics are:
some_avg10
some_avg60
some_avg300
full_avg10
full_avg60
full_avg300
some_avg10 is most sensitive.
psi_metrics = some_avg10
soft_threshold_max_psi = 60
hard_threshold_max_psi = 90
>= 0, float
psi_excess_duration = 60
psi_post_action_delay = 60
###############################################################################
3. The frequency of checking the amount of available memory
(and CPU usage)
Coefficients that affect the intensity of monitoring. Reducing
the coefficients can reduce CPU usage and increase the periods
between memory checks.
Why three coefficients instead of one? Because the swap fill rate
is usually lower than the RAM fill rate.
It is possible to set a lower intensity of monitoring for swap
without compromising to prevent OOM and thus reduce the CPU load.
Default values work well for desktops. On servers without rapid
fluctuations in memory levels the values can be reduced.
Valid values are positive floating-point numbers.
fill_rate_mem = 4000
fill_rate_swap = 1500
fill_rate_zram = 6000
See also https://github.com/rfjakob/earlyoom/issues/61
max_sleep = 3
min_sleep = 0.1
Sleep time if soft threshold exceeded.
over_sleep = 0.05
###############################################################################
4. The prevention of killing innocent victims
Valid values are integers from the range [0; 1000].
min_badness = 10
Valid values are non-negative floating-point numbers.
Min delay if a victim doesn't respond to SIGTERM in 10 ms.
post_soft_action_delay = 3
post_zombie_delay = 0.1
victim_cache_time = 10
Valid values are True and False.
ignore_positive_oom_score_adj = False
###############################################################################
5. Impact on the badness of processes via matching their names,
cmdlines or UIDs with regular expressions using re.search().
See https://en.wikipedia.org/wiki/Regular_expression and
https://en.wikipedia.org/wiki/Perl_Compatible_Regular_Expressions
Enabling these options slows down the search for the victim
because the names, cmdlines or UIDs of all processes
(except init and kthreads) are compared with the
specified regex patterns (in fact slowing down is caused by
reading all /proc/*/cmdline and /proc/*/status files).
Use script `oom-sort` from nohang package to view
names, cmdlines and UIDs of processes.
5.1. Matching process names with RE patterns
Syntax:
@BADNESS_ADJ_RE_NAME badness_adj /// RE_pattern
New badness value will be += badness_adj
It is possible to compare multiple patterns
with different badness_adj values.
Example:
@BADNESS_ADJ_RE_NAME -500 /// ^sshd$
5.2. Matching CGroup_v1-line with RE patterns
@BADNESS_ADJ_RE_CGROUP_V1 -100 /// ^/system\.slice/
@BADNESS_ADJ_RE_CGROUP_V1 50 /// /foo\.service$
@BADNESS_ADJ_RE_CGROUP_V1 -50 /// ^/user\.slice/
5.3. Matching CGroup_v2-line with RE patterns
@BADNESS_ADJ_RE_CGROUP_V2 100 /// ^/workload
5.4. Matching eUIDs with RE patterns
@BADNESS_ADJ_RE_UID -100 /// ^0$
5.5. Matching realpath with RE patterns
@BADNESS_ADJ_RE_REALPATH 20 /// ^/usr/bin/foo
5.6. Matching cmdlines with RE patterns
A good option that allows fine adjustment.
Prefer chromium tabs and electron-based apps
@BADNESS_ADJ_RE_CMDLINE 200 /// --type=renderer
Prefer firefox tabs (Web Content and WebExtensions)
@BADNESS_ADJ_RE_CMDLINE 300 /// -appomni
@BADNESS_ADJ_RE_CMDLINE -200 /// ^/usr/lib/virtualbox
5.7. Matching environ with RE patterns
@BADNESS_ADJ_RE_ENVIRON 100 /// USER=user
Note that you can control badness also via systemd units via
OOMScoreAdjust, see
www.freedesktop.org/software/systemd/man/systemd.exec.html#OOMScoreAdjust=
###############################################################################
6. Customize corrective actions.
TODO: docs
Syntax:
KEY REGEXP SEPARATOR COMMAND
@SOFT_ACTION_RE_NAME ^foo$ /// kill -SEGV $PID
@SOFT_ACTION_RE_NAME ^bash$ /// kill -9 $PID
@SOFT_ACTION_RE_CGROUP_V1 ^/system\.slice/ /// systemctl restart $SERVICE
@SOFT_ACTION_RE_CGROUP_V1 /foo\.service$ /// systemctl restart $SERVICE
$PID will be replaced by process PID.
$NAME will be replaced by process name.
$SERVICE will be replaced by .service if it exists (otherwise it will be
replaced by an empty line)
###############################################################################
7. GUI notifications & low memory warnings
post_action_gui_notifications = False
Enable GUI notifications about the low level of available memory.
Valid values are True and False.
low_memory_warnings_enabled = False
Execute the command instead of sending GUI notifications if the value is
not an empty line. For example:
warning_exe = cat /proc/meminfo &
warning_exe =
Can be specified in % (percent) and M (MiB).
Valid values are floating-point numbers from the range [0; 100] %.
warning_threshold_min_mem = 20 %
warning_threshold_min_swap = 25 %
warning_threshold_max_zram = 50 %
warning_threshold_max_psi = 100
Valid values are floating-point numbers from the range [1; 300].
min_post_warning_delay = 20
Ampersands (&) will be replaced with asterisks (*) in process
names and in commands.
###############################################################################
8. Verbosity
Display the configuration when the program starts.
Valid values are True and False.
print_config_at_startup = False
Print memory check results.
Valid values are True and False.
print_mem_check_results = False
min_mem_report_interval = 60
print_proc_table = False
Valid values:
None
cgroup_v1
cgroup_v2
realpath
cmdline
environ
extra_table_info = None
print_victim_status = True
max_victim_ancestry_depth = 3
print_victim_cmdline = False
print_statistics = True
Print sleep periods between memory checks.
Valid values are True and False.
debug_psi = False
debug_gui_notifications = False
debug_sleep = False
separate_log = False
###############################################################################
9. Misc
max_soft_exit_time = 10
post_kill_exe =
forbid_negative_badness = True
###############################################################################
Use cases, feature requests and any questions are welcome:
https://github.com/hakavlad/nohang/issues

View File

@ -291,6 +291,8 @@ warning_threshold_max_psi = 100
min_post_warning_delay = 20
env_cache_time = 300
Ampersands (&) will be replaced with asterisks (*) in process
names and in commands.
@ -341,6 +343,9 @@ debug_sleep = True
separate_log = True
debug_threading = True
###############################################################################
9. Misc