fix GUI warns, fix poll rate alg

This commit is contained in:
Alexey Avramov 2019-01-12 17:24:31 +09:00
parent 5867606c4a
commit afd848c22f
4 changed files with 92 additions and 265 deletions

View File

@ -176,7 +176,7 @@ See also `man journalctl`.
## Known problems
- Awful documentation
- Slowly starting, slowly looking for a victim, especially when using swapspace
- Slowly starting, slowly looking for a victim, especially when using swapspace (although this should be enough for more than 95% of all cases, IMHO)
- It is written in an interpreted language and is actually a prototype
## Contribution
@ -194,8 +194,11 @@ Please create [issues](https://github.com/hakavlad/nohang/issues). Use cases, fe
- Display `UID`, `oom_score`, `oom_score_adj`, `VmSize`, `RssAnon`, `RssFile`, `RssShmem` and `cmdline` of the victim in corrective action reports
- Print in terminal with colors
- Print statistics on corrective actions after each corrective action
- Optimize limiting `oom_score_adj`: now it can work without UID=0
- Optimize GUI warnings: find env without running `ps` and `env`
- Improve poll rate algorithm
- Improve limiting `oom_score_adj`: now it can work without UID=0
- Improve GUI warnings:
- Find env without running `ps` and `env`
- Handle all timeouts when notify-send starts
- Fix conf parsing: use of `line.partition('=')` instead of `line.split('=')`
- Add `PSI` support (using `/proc/pressure/memory`, need Linux 4.20+)
- Add `oom-sort`

249
nohang
View File

@ -21,10 +21,15 @@ self_pid = str(os.getpid())
self_uid = os.geteuid()
if self_uid == 0:
root = True
else:
root = False
wait_time = 2
cache_time = 30
cache_path = '/dev/shm/nohang_env_cache'
wait_time = 14
max_sleep_time = 2
min_sleep_time = 0.1
notify_helper_path = '/usr/bin/nohang_notify_helper'
psi_path = '/proc/pressure/memory'
psi_support = os.path.exists(psi_path)
@ -119,79 +124,6 @@ def format_time(t):
return '{} h {} min {} sec'.format(h, m, s)
def re_pid_environ(pid):
    """
    Read the environ of one process and pick out USER, DISPLAY and DBUS.

    pid: process ID as a string (it is concatenated into the /proc path).

    Returns a tuple with USER, DISPLAY, DBUS like the following:
    ('user', 'DISPLAY=:0',
    'DBUS_SESSION_BUS_ADDRESS=unix:path=/run/user/1000/bus')

    Returns None if any of these variables is not set in
    /proc/[pid]/environ, if a HOME variable starts with /var (Display
    Manager's user), or if reading the file raises FileNotFoundError or
    ProcessLookupError.
    """
    display_env = 'DISPLAY='
    dbus_env = 'DBUS_SESSION_BUS_ADDRESS='
    user_env = 'USER='

    try:
        env = str(rline1('/proc/' + pid + '/environ'))
    except FileNotFoundError:
        return None
    except ProcessLookupError:
        return None

    # Cheap pre-filter: skip processes that cannot have all three variables.
    if not (display_env in env and dbus_env in env and user_env in env):
        return None

    user = display = dbus = None

    # iterating over a list of process environment variables
    for i in env.split('\x00'):
        if i.startswith(user_env):
            user = i
            continue

        if i.startswith(display_env):
            # keep only the first 10 characters, e.g. 'DISPLAY=:0'
            display = i[:10]
            continue

        if i.startswith(dbus_env):
            dbus = i
            continue

        # exclude Display Manager's user
        if i.startswith('HOME=/var'):
            return None

    # The substring pre-filter also matches names such as SUDO_USER= or
    # WAYLAND_DISPLAY=, so a variable may still be missing here. Report
    # "not suitable" instead of failing on an unbound name.
    if user is None or display is None or dbus is None:
        return None

    return user.partition('USER=')[2], display, dbus
def root_notify_env():
    """
    Collect notification environments from all processes in /proc.

    Returns a list of the values produced by re_pid_environ(), keeping
    only one entry per user + display combination.
    """
    # every numeric entry in /proc is treated as a process ID
    found = [
        re_pid_environ(entry)
        for entry in os.listdir('/proc')
        if entry[0].isdecimal()
    ]

    unique_envs = set(found)
    unique_envs.discard(None)

    # drop entries that differ only in the dbus address
    seen_keys = []
    result = []
    for candidate in unique_envs:
        user_display = candidate[0] + candidate[1]
        if user_display in seen_keys:
            continue
        seen_keys.append(user_display)
        result.append(candidate)

    return result
def string_to_float_convert_test(string):
"""Try to interprete string values as floats."""
try:
@ -351,9 +283,28 @@ def pid_to_uid(pid):
return line.split('\t')[1]
def notify_send_wait(title, body):
    """
    Run notify-send with a warning icon and wait for it to exit.

    The child process gets at most wait_time seconds; if it has not
    exited by then it is killed and the timeout is printed.
    """
    command = ['notify-send', '--icon=dialog-warning', title, body]
    with Popen(command) as child:
        try:
            child.wait(timeout=wait_time)
        except TimeoutExpired:
            child.kill()
            report = 'TimeoutExpired: notify-send {} {}'
            print(report.format(title, body))
def notify_helper(title, body):
    """
    Run the helper at notify_helper_path with title and body, then wait.

    The child process gets at most wait_time seconds; if it has not
    exited by then it is killed and the timeout is printed.
    """
    command = [notify_helper_path, title, body]
    with Popen(command) as child:
        try:
            child.wait(timeout=wait_time)
        except TimeoutExpired:
            child.kill()
            report = 'TimeoutExpired: nohang_notify_helper {} {}'
            print(report.format(title, body))
def send_notify_warn():
"""
Look for process with maximum 'badness' and warn user with notification.
(implement Low memory warnings)
"""
# find process with max badness
fat_tuple = fattest()
@ -378,23 +329,10 @@ def send_notify_warn():
if root: # If nohang was started by root
# send notification to all active users with special script
# теперь можно напрямую уведомлять из кэша если он не устарел
Popen([
'/usr/bin/nohang_notify_low_mem',
'--mem', low_mem_percent,
'--pid', pid,
'--name', name
])
notify_helper(title, body)
else: # Or by regular user
# send notification to user that runs this nohang
try:
Popen(['notify-send', '--icon=dialog-warning',
'{}'.format(title), '{}'.format(body)]).wait(wait_time)
except TimeoutExpired:
print('TimeoutExpired: ' + 'notify low mem')
notify_send_wait(title, body)
def send_notify(signal, name, pid):
@ -413,26 +351,10 @@ def send_notify(signal, name, pid):
'&', '*'))
if root:
# send notification to all active users with notify-send
b = root_notify_env()
if len(b) > 0:
for i in b:
username, display_env, dbus_env = i[0], i[1], i[2]
#if '1000' in dbus_env:
# continue
#print(username, display_env, dbus_env)
try:
Popen(['sudo', '-u', username, 'env', display_env,
dbus_env, 'notify-send', '--icon=dialog-warning',
'{}'.format(title), '{}'.format(body)]).wait(wait_time)
except TimeoutExpired:
print('TimeoutExpired: ' + 'notify send signal')
notify_helper(title, body)
else:
# send notification to user that runs this nohang
try:
Popen(['notify-send', '--icon=dialog-warning',
'{}'.format(title), '{}'.format(body)]).wait(wait_time)
except TimeoutExpired:
print('TimeoutExpired: ' + 'notify send signal')
notify_send_wait(title, body)
def send_notify_etc(pid, name, command):
@ -448,20 +370,10 @@ def send_notify_etc(pid, name, command):
pid, name.replace('&', '*'), command.replace('&', '*'))
if root:
# send notification to all active users with notify-send
b = root_notify_env()
if len(b) > 0:
for i in b:
username, display_env, dbus_env = i[0], i[1], i[2]
try:
Popen(['sudo', '-u', username, 'env', display_env,
dbus_env, 'notify-send', '--icon=dialog-warning',
'{}'.format(title), '{}'.format(body)]).wait(wait_time)
except TimeoutExpired:
print('TimeoutExpired: notify run command')
notify_send_wait(title, body)
else:
# send notification to user that runs this nohang
Popen(['notify-send', '--icon=dialog-warning', '{}'.format(title), '{}'
.format(body)])
notify_send_wait(title, body)
def sleep_after_send_signal(signal):
@ -789,9 +701,20 @@ def find_victim_and_send_signal(signal):
def sleep_after_check_mem():
"""Specify sleep times depends on rates and avialable memory."""
t_mem = mem_available / rate_mem
t_swap = swap_free / rate_swap
t_zram = (mem_total - mem_used_zram) / rate_zram
if mem_min_sigkill_kb < mem_min_sigterm_kb:
mem_point = mem_available - mem_min_sigterm_kb
else:
mem_point = mem_available - mem_min_sigkill_kb
if swap_min_sigkill_kb < swap_min_sigterm_kb:
swap_point = swap_free - swap_min_sigterm_kb
else:
swap_point = swap_free - swap_min_sigkill_kb
t_mem = mem_point / rate_mem
t_swap = swap_point / rate_swap
t_zram = (mem_total * 0.9 - mem_used_zram) / rate_zram
t_mem_swap = t_mem + t_swap
t_mem_zram = t_mem + t_zram
@ -801,17 +724,20 @@ def sleep_after_check_mem():
else:
t = t_mem_zram
max_sleep_time = 1
if t > max_sleep_time:
t = 1
t = max_sleep_time
elif t < min_sleep_time:
t = min_sleep_time
else:
pass
try:
if print_sleep_periods:
print('sleep', round(t, 2),
' (t_mem={}, t_swap={}, t_zram={})'.format(
round(t_mem, 2),
round(t_swap, 2),
round(t_zram, 2)))
print('sleep', round(t, 2))
# ' (t_mem={}, t_swap={}, t_zram={})'.format(
#round(t_mem, 2),
#round(t_swap, 2),
#round(t_zram, 2)))
stdout.flush()
sleep(t)
except KeyboardInterrupt:
@ -1425,68 +1351,10 @@ print('Startup time:',
print('Monitoring started!')
def save_env_cache():
    """
    Scan for notification environments and store them at cache_path.

    The file starts with the current Unix time (whole seconds) on its own
    line, followed by one line per environment with the three fields
    separated by NUL characters. The file mode is then set to 0.

    Returns the list obtained from root_notify_env().
    """
    # the timestamp is taken before the scan starts
    records = ['{}\n'.format(int(time()))]
    envs = root_notify_env()
    records.extend(
        '{}\x00{}\x00{}\n'.format(entry[0], entry[1], entry[2])
        for entry in envs
    )
    write(cache_path, ''.join(records))
    os.chmod(cache_path, 0o000)
    return envs
def read_env_cache():
    """
    Read the environment cache stored at cache_path.

    Returns a two-element list [timestamp, envs], where timestamp is the
    first line of the file as a string and envs is a list with one entry
    per remaining line, each entry being the line split on NUL characters.
    Returns None if the cache file does not exist.

    Note: for an empty file the result is [[]] (no timestamp element).
    """
    result, envs = [], []
    try:
        with open(cache_path) as f:
            for n, line in enumerate(f):
                # drop the trailing newline character
                record = line[:-1]
                # compare by value: identity ('is') on an int literal is
                # implementation-dependent and warns on newer Pythons
                if n == 0:
                    result.append(record)
                else:
                    envs.append(record.split('\x00'))
    except FileNotFoundError:
        return None
    result.append(envs)
    return result
def root_env_cache():
    """
    Return notification environments, using the cache while it is fresh.

    If the cache is missing, or older than cache_time seconds, the
    environments are collected again, written to the cache and returned.
    Otherwise the cached entries are returned; these are lists of strings
    as produced by read_env_cache().
    """
    cache = read_env_cache()
    if cache is None:
        print('cache not found, get new env and cache it')
        return save_env_cache()

    delta_t = time() - int(cache[0])

    if delta_t > cache_time:
        print('cache time: {}, delta: {}, '
              'get new env and cache it'.format(
                  cache_time, round(delta_t)))
        # save_env_cache() already scans /proc and returns the result,
        # so there is no need to scan a second time
        return save_env_cache()

    print('cache time: {}, delta: {}, '
          'get cached env'.format(
              cache_time, round(delta_t)))
    return cache[1]
t1 = time()
# root_env_cache()
t2 = time()
# print(t2 - t1)
stdout.flush()
# exit()
sigterm_psi = sigterm_psi_avg10
sigkill_psi = sigkill_psi_avg10
# avg_min_time = 4
psi_min_sleep_time_after_action = psi_avg10_sleep_time
@ -1494,12 +1362,9 @@ psi_min_sleep_time_after_action = psi_avg10_sleep_time
if psi_support and not ignore_psi:
# ta0 = time()
# a0 = psi_mem_some_avg_total()
kill_psi_t0 = time() + psi_avg10_sleep_time
term_psi_t0 = time() + psi_avg10_sleep_time
avg_value = ''
while True:

View File

@ -56,7 +56,7 @@ swap_min_sigkill = 5 %
usual hang level, not recommended to set very high.
Can be specified in % and M. Valid values are floating-point
numbers from the range [0; 100] %.
numbers from the range [0; 90] %.
zram_max_sigterm = 50 %
zram_max_sigkill = 55 %
@ -93,8 +93,8 @@ psi_avg10_sleep_time = 60
Valid values are positive floating-point numbers.
rate_mem = 6
rate_swap = 3
rate_mem = 4
rate_swap = 2
rate_zram = 1
See also https://github.com/rfjakob/earlyoom/issues/61
@ -261,7 +261,7 @@ gui_notifications = True
Enable GUI notifications about the low level of available memory.
Valid values are True and False.
gui_low_memory_warnings = True
gui_low_memory_warnings = False
Minimum time between sending notifications, in seconds.
Valid values are floating-point numbers from the range [1; 300].
@ -304,5 +304,5 @@ print_mem_check_results = True
Print sleep periods between memory checks.
Valid values are True and False.
print_sleep_periods = False
print_sleep_periods = True

View File

@ -1,53 +1,17 @@
#!/usr/bin/env python3
#
# Usage:
# ./nohang_notify_helper "title" "body"
# nohang_notify_low_mem --mem '14% 12%' --name 'stress' --pid '6666'
# need UID=0
# output:
# Low memory: 14% 12%
# Fattest process: 6666, stress
# need to remove this slow and fat parser
from argparse import ArgumentParser
from sys import argv
from os import listdir
from subprocess import Popen, TimeoutExpired
wait_time = 10
if len(argv) < 2 or argv[1] == "-h" or argv[1] == "--help":
print('Usage: ./nohang_notify_helper "title" "body"')
exit(1)
parser = ArgumentParser()
parser.add_argument(
'--mem',
help="""available memory percent (15%, for example)""",
default=None,
type=str
)
parser.add_argument(
'--pid',
help="""pid""",
default=None,
type=str
)
parser.add_argument(
'--name',
help="""process name""",
default=None,
type=str
)
args = parser.parse_args()
pid = args.pid
name = args.name
mem = args.mem
title = 'Low memory: {}'.format(mem)
body = 'Fattest process: <b>{}</b>, <b>{}</b>'.format(pid, name)
wait_time = 12
display_env = 'DISPLAY='
dbus_env = 'DBUS_SESSION_BUS_ADDRESS='
@ -69,9 +33,6 @@ def re_pid_environ(pid):
'DBUS_SESSION_BUS_ADDRESS=unix:path=/run/user/1000/bus')
returns None if these vars is not in /proc/[pid]/environ
"""
display_env = 'DISPLAY='
dbus_env = 'DBUS_SESSION_BUS_ADDRESS='
user_env = 'USER='
try:
env = str(rline1('/proc/' + pid + '/environ'))
if display_env in env and dbus_env in env and user_env in env:
@ -88,8 +49,6 @@ def re_pid_environ(pid):
continue
if i.startswith(dbus_env):
#if ',guid=' in i:
# return None
dbus = i
continue
@ -119,27 +78,27 @@ def root_notify_env():
env = set(unsorted_envs_list)
env.discard(None)
# deduplicate dbus
new_env = []
end = []
for i in env:
#print(i)
key = i[0] + i[1]
#print(key)
if key not in end:
end.append(key)
new_env.append(i)
else:
continue
#print(new_env)
return new_env
b = root_notify_env()
list_with_envs = root_notify_env()
# if somebody logged in with GUI
if len(b) > 0:
if len(list_with_envs) > 0:
# iterating over logged-in users
for i in b:
for i in list_with_envs:
username, display_env, dbus_env = i[0], i[1], i[2]
display_tuple = display_env.partition('=')
dbus_tuple = dbus_env.partition('=')
@ -147,16 +106,16 @@ if len(b) > 0:
dbus_key, dbus_value = dbus_tuple[0], dbus_tuple[2]
with Popen(['sudo', '-u', username,
'notify-send', '--icon=dialog-warning',
'{}'.format(title), '{}'.format(body)
], env={
'notify-send', '--icon=dialog-warning',
argv[1], argv[2]
], env={
display_key: display_value,
dbus_key: dbus_value
}) as proc:
try:
proc.wait(timeout=wait_time)
except TimeoutExpired:
proc.kill()
print('TimeoutExpired: notify' + username)
}) as proc:
try:
proc.wait(timeout=wait_time)
except TimeoutExpired:
proc.kill()
print('TimeoutExpired: notify' + username)
else:
print('Low memory warnings: nobody logged in with GUI. Nothing to do.')
print('Nobody logged-in with GUI. Nothing to do.')