fix GUI warns, fix poll rate alg

This commit is contained in:
Alexey Avramov 2019-01-12 17:24:31 +09:00
parent 5867606c4a
commit afd848c22f
4 changed files with 92 additions and 265 deletions

View File

@ -176,7 +176,7 @@ See also `man journalctl`.
## Known problems ## Known problems
- Awful documentation - Awful documentation
- Slowly starting, slowly looking for a victim, especially when using swapspace - Slowly starting, slowly looking for a victim, especially when using swapspace (although this should be enough for more than 95% of all cases, IMHO)
- It is written in an interpreted language and is actually a prototype - It is written in an interpreted language and is actually a prototype
## Contribution ## Contribution
@ -194,8 +194,11 @@ Please create [issues](https://github.com/hakavlad/nohang/issues). Use cases, fe
- Display `UID`, `oom_score`, `oom_score_adj`, `VmSize`, `RssAnon`, `RssFile`, `RssShmem` and `cmdline` of the victim in corrective action reports - Display `UID`, `oom_score`, `oom_score_adj`, `VmSize`, `RssAnon`, `RssFile`, `RssShmem` and `cmdline` of the victim in corrective action reports
- Print in terminal with colors - Print in terminal with colors
- Print statistics on corrective actions after each corrective action - Print statistics on corrective actions after each corrective action
- Optimize limiting `oom_score_adj`: now it can work without UID=0 - Improve poll rate algorithm
- Optimize GUI warnings: find env without running `ps` and `env` - Improve limiting `oom_score_adj`: now it can work without UID=0
- Improve GUI warnings:
- Find env without running `ps` and `env`
- Handle all timeouts when notify-send starts
- Fix conf parsing: use of `line.partition('=')` instead of `line.split('=')` - Fix conf parsing: use of `line.partition('=')` instead of `line.split('=')`
- Add `PSI` support (using `/proc/pressure/memory`, need Linux 4.20+) - Add `PSI` support (using `/proc/pressure/memory`, need Linux 4.20+)
- Add `oom-sort` - Add `oom-sort`

249
nohang
View File

@ -21,10 +21,15 @@ self_pid = str(os.getpid())
self_uid = os.geteuid() self_uid = os.geteuid()
if self_uid == 0: if self_uid == 0:
root = True root = True
else:
root = False
wait_time = 2 wait_time = 14
cache_time = 30
cache_path = '/dev/shm/nohang_env_cache' max_sleep_time = 2
min_sleep_time = 0.1
notify_helper_path = '/usr/bin/nohang_notify_helper'
psi_path = '/proc/pressure/memory' psi_path = '/proc/pressure/memory'
psi_support = os.path.exists(psi_path) psi_support = os.path.exists(psi_path)
@ -119,79 +124,6 @@ def format_time(t):
return '{} h {} min {} sec'.format(h, m, s) return '{} h {} min {} sec'.format(h, m, s)
def re_pid_environ(pid):
    """
    Read the environment of one process.

    pid is the name of a /proc entry (a string).

    Returns a tuple with USER, DISPLAY and DBUS like follows:
    ('user', 'DISPLAY=:0',
     'DBUS_SESSION_BUS_ADDRESS=unix:path=/run/user/1000/bus')

    Returns None if any of these variables is missing from
    /proc/[pid]/environ, if HOME of the process starts with /var,
    or if the environ file cannot be found.
    """
    display_env = 'DISPLAY='
    dbus_env = 'DBUS_SESSION_BUS_ADDRESS='
    user_env = 'USER='

    try:
        env = str(rline1('/proc/' + pid + '/environ'))
    except (FileNotFoundError, ProcessLookupError):
        return None

    # cheap substring pre-filter before splitting the environment
    if not (display_env in env and dbus_env in env and user_env in env):
        return None

    # The pre-filter also matches names that merely contain the prefix
    # (WAYLAND_DISPLAY=, SUDO_USER=), so each value starts as None and
    # is verified after the loop instead of being assumed to be bound.
    user = display = dbus = None

    # iterating over a list of process environment variables
    for i in env.split('\x00'):
        if i.startswith(user_env):
            user = i
        elif i.startswith(display_env):
            # keep only the first 10 characters, e.g. 'DISPLAY=:0'
            display = i[:10]
        elif i.startswith(dbus_env):
            dbus = i
        elif i.startswith('HOME=/var'):
            # exclude Display Manager's user
            return None

    if user is None or display is None or dbus is None:
        return None

    return user.partition('USER=')[2], display, dbus
def root_notify_env():
    """
    Collect notification environments of running processes.

    Iterates over the entries of /proc whose name starts with a digit,
    reads each of them with re_pid_environ() and returns a list of
    unique (user, display, dbus) tuples. Only one tuple is kept for
    every (user, display) pair; the order of the list is not defined.
    """
    envs = set()

    # iterates over processes, find processes with suitable env
    for pid in os.listdir('/proc'):
        if not pid[0].isdecimal():
            continue
        envs.add(re_pid_environ(pid))

    # processes without a suitable environment produce None
    envs.discard(None)

    # deduplicate by user and display, ignoring the dbus string
    unique_envs = []
    seen = set()
    for env in envs:
        key = (env[0], env[1])
        if key not in seen:
            seen.add(key)
            unique_envs.append(env)

    return unique_envs
def string_to_float_convert_test(string): def string_to_float_convert_test(string):
"""Try to interprete string values as floats.""" """Try to interprete string values as floats."""
try: try:
@ -351,9 +283,28 @@ def pid_to_uid(pid):
return line.split('\t')[1] return line.split('\t')[1]
def notify_send_wait(title, body):
    """
    Run notify-send with a warning icon and wait for it to finish.

    The process is given wait_time seconds; if it is still running
    after that, it is killed and a message is printed.
    """
    command = ['notify-send', '--icon=dialog-warning', title, body]
    with Popen(command) as proc:
        try:
            proc.wait(timeout=wait_time)
        except TimeoutExpired:
            # notify-send did not exit in time: stop it and report
            proc.kill()
            print('TimeoutExpired: notify-send {} {}'.format(title, body))
def notify_helper(title, body):
    """
    Run the helper at notify_helper_path with title and body.

    The process is given wait_time seconds; if it is still running
    after that, it is killed and a message is printed.
    """
    command = [notify_helper_path, title, body]
    with Popen(command) as proc:
        try:
            proc.wait(timeout=wait_time)
        except TimeoutExpired:
            # the helper did not exit in time: stop it and report
            proc.kill()
            print('TimeoutExpired: nohang_notify_helper {} {}'.format(title, body))
def send_notify_warn(): def send_notify_warn():
""" """
Look for process with maximum 'badness' and warn user with notification. Look for process with maximum 'badness' and warn user with notification.
(implement Low memory warnings)
""" """
# find process with max badness # find process with max badness
fat_tuple = fattest() fat_tuple = fattest()
@ -378,23 +329,10 @@ def send_notify_warn():
if root: # If nohang was started by root if root: # If nohang was started by root
# send notification to all active users with special script # send notification to all active users with special script
notify_helper(title, body)
# теперь можно напрямую уведомлять из кэша если он не устарел
Popen([
'/usr/bin/nohang_notify_low_mem',
'--mem', low_mem_percent,
'--pid', pid,
'--name', name
])
else: # Or by regular user else: # Or by regular user
# send notification to user that runs this nohang # send notification to user that runs this nohang
try: notify_send_wait(title, body)
Popen(['notify-send', '--icon=dialog-warning',
'{}'.format(title), '{}'.format(body)]).wait(wait_time)
except TimeoutExpired:
print('TimeoutExpired: ' + 'notify low mem')
def send_notify(signal, name, pid): def send_notify(signal, name, pid):
@ -413,26 +351,10 @@ def send_notify(signal, name, pid):
'&', '*')) '&', '*'))
if root: if root:
# send notification to all active users with notify-send # send notification to all active users with notify-send
b = root_notify_env() notify_helper(title, body)
if len(b) > 0:
for i in b:
username, display_env, dbus_env = i[0], i[1], i[2]
#if '1000' in dbus_env:
# continue
#print(username, display_env, dbus_env)
try:
Popen(['sudo', '-u', username, 'env', display_env,
dbus_env, 'notify-send', '--icon=dialog-warning',
'{}'.format(title), '{}'.format(body)]).wait(wait_time)
except TimeoutExpired:
print('TimeoutExpired: ' + 'notify send signal')
else: else:
# send notification to user that runs this nohang # send notification to user that runs this nohang
try: notify_send_wait(title, body)
Popen(['notify-send', '--icon=dialog-warning',
'{}'.format(title), '{}'.format(body)]).wait(wait_time)
except TimeoutExpired:
print('TimeoutExpired: ' + 'notify send signal')
def send_notify_etc(pid, name, command): def send_notify_etc(pid, name, command):
@ -448,20 +370,10 @@ def send_notify_etc(pid, name, command):
pid, name.replace('&', '*'), command.replace('&', '*')) pid, name.replace('&', '*'), command.replace('&', '*'))
if root: if root:
# send notification to all active users with notify-send # send notification to all active users with notify-send
b = root_notify_env() notify_send_wait(title, body)
if len(b) > 0:
for i in b:
username, display_env, dbus_env = i[0], i[1], i[2]
try:
Popen(['sudo', '-u', username, 'env', display_env,
dbus_env, 'notify-send', '--icon=dialog-warning',
'{}'.format(title), '{}'.format(body)]).wait(wait_time)
except TimeoutExpired:
print('TimeoutExpired: notify run command')
else: else:
# send notification to user that runs this nohang # send notification to user that runs this nohang
Popen(['notify-send', '--icon=dialog-warning', '{}'.format(title), '{}' notify_send_wait(title, body)
.format(body)])
def sleep_after_send_signal(signal): def sleep_after_send_signal(signal):
@ -789,9 +701,20 @@ def find_victim_and_send_signal(signal):
def sleep_after_check_mem(): def sleep_after_check_mem():
"""Specify sleep times depends on rates and avialable memory.""" """Specify sleep times depends on rates and avialable memory."""
t_mem = mem_available / rate_mem
t_swap = swap_free / rate_swap if mem_min_sigkill_kb < mem_min_sigterm_kb:
t_zram = (mem_total - mem_used_zram) / rate_zram mem_point = mem_available - mem_min_sigterm_kb
else:
mem_point = mem_available - mem_min_sigkill_kb
if swap_min_sigkill_kb < swap_min_sigterm_kb:
swap_point = swap_free - swap_min_sigterm_kb
else:
swap_point = swap_free - swap_min_sigkill_kb
t_mem = mem_point / rate_mem
t_swap = swap_point / rate_swap
t_zram = (mem_total * 0.9 - mem_used_zram) / rate_zram
t_mem_swap = t_mem + t_swap t_mem_swap = t_mem + t_swap
t_mem_zram = t_mem + t_zram t_mem_zram = t_mem + t_zram
@ -801,17 +724,20 @@ def sleep_after_check_mem():
else: else:
t = t_mem_zram t = t_mem_zram
max_sleep_time = 1
if t > max_sleep_time: if t > max_sleep_time:
t = 1 t = max_sleep_time
elif t < min_sleep_time:
t = min_sleep_time
else:
pass
try: try:
if print_sleep_periods: if print_sleep_periods:
print('sleep', round(t, 2), print('sleep', round(t, 2))
' (t_mem={}, t_swap={}, t_zram={})'.format( # ' (t_mem={}, t_swap={}, t_zram={})'.format(
round(t_mem, 2), #round(t_mem, 2),
round(t_swap, 2), #round(t_swap, 2),
round(t_zram, 2))) #round(t_zram, 2)))
stdout.flush() stdout.flush()
sleep(t) sleep(t)
except KeyboardInterrupt: except KeyboardInterrupt:
@ -1425,68 +1351,10 @@ print('Startup time:',
print('Monitoring started!') print('Monitoring started!')
def save_env_cache():
    """
    Write the current notification environments to the cache file.

    The first line of the file is the current time as an integer
    number of seconds. Every following line holds one environment as
    three fields separated by NUL characters. After writing, all
    permission bits of the cache file are cleared.

    Returns the list obtained from root_notify_env().
    """
    # the timestamp is taken before /proc is scanned
    header = '{}\n'.format(int(time()))
    envs = root_notify_env()

    records = [
        '{}\x00{}\x00{}\n'.format(item[0], item[1], item[2])
        for item in envs
    ]
    write(cache_path, header + ''.join(records))

    # NOTE(review): the mode is only changed after the data is written;
    # confirm that write() does not leave the file readable meanwhile
    os.chmod(cache_path, 0o000)
    return envs
def read_env_cache():
    """
    Read the cache file at cache_path.

    Returns a two-element list [timestamp, envs]:
    - timestamp is the first line of the file as a string;
    - envs is a list with one [field, field, field] list per remaining
      line, obtained by splitting the line on NUL characters.

    Returns None if the file does not exist or is empty.
    """
    try:
        with open(cache_path) as f:
            lines = f.readlines()
    except FileNotFoundError:
        return None

    # an empty file has no timestamp line; treat it like a missing cache
    if not lines:
        return None

    # [:-1] drops the last character of each line (the newline that
    # save_env_cache() puts at the end of every record)
    timestamp = lines[0][:-1]
    envs = [line[:-1].split('\x00') for line in lines[1:]]
    return [timestamp, envs]
def root_env_cache():
    """
    Return notification environments, using the cache file when fresh.

    If the cache file cannot be read, or its timestamp is more than
    cache_time seconds old, the environments are collected again, saved
    with save_env_cache() and returned. Otherwise the environments
    stored in the cache are returned.
    """
    cache = read_env_cache()
    if cache is None:
        print('cache not found, get new env and cache it')
        return save_env_cache()

    delta_t = time() - int(cache[0])

    if delta_t > cache_time:
        print('cache time: {}, delta: {}, '
              'get new env and cache it'.format(
                  cache_time, round(delta_t)))
        # save_env_cache() already returns the freshly collected list,
        # so there is no need to scan the processes a second time
        return save_env_cache()

    print('cache time: {}, delta: {}, '
          'get cached env'.format(
              cache_time, round(delta_t)))
    return cache[1]
t1 = time()
# root_env_cache()
t2 = time()
# print(t2 - t1)
stdout.flush() stdout.flush()
# exit()
sigterm_psi = sigterm_psi_avg10 sigterm_psi = sigterm_psi_avg10
sigkill_psi = sigkill_psi_avg10 sigkill_psi = sigkill_psi_avg10
# avg_min_time = 4
psi_min_sleep_time_after_action = psi_avg10_sleep_time psi_min_sleep_time_after_action = psi_avg10_sleep_time
@ -1494,12 +1362,9 @@ psi_min_sleep_time_after_action = psi_avg10_sleep_time
if psi_support and not ignore_psi: if psi_support and not ignore_psi:
# ta0 = time()
# a0 = psi_mem_some_avg_total()
kill_psi_t0 = time() + psi_avg10_sleep_time kill_psi_t0 = time() + psi_avg10_sleep_time
term_psi_t0 = time() + psi_avg10_sleep_time term_psi_t0 = time() + psi_avg10_sleep_time
avg_value = '' avg_value = ''
while True: while True:

View File

@ -56,7 +56,7 @@ swap_min_sigkill = 5 %
usual hang level, not recommended to set very high. usual hang level, not recommended to set very high.
Can be specified in % and M. Valid values are floating-point Can be specified in % and M. Valid values are floating-point
numbers from the range [0; 100] %. numbers from the range [0; 90] %.
zram_max_sigterm = 50 % zram_max_sigterm = 50 %
zram_max_sigkill = 55 % zram_max_sigkill = 55 %
@ -93,8 +93,8 @@ psi_avg10_sleep_time = 60
Valid values are positive floating-point numbers. Valid values are positive floating-point numbers.
rate_mem = 6 rate_mem = 4
rate_swap = 3 rate_swap = 2
rate_zram = 1 rate_zram = 1
See also https://github.com/rfjakob/earlyoom/issues/61 See also https://github.com/rfjakob/earlyoom/issues/61
@ -261,7 +261,7 @@ gui_notifications = True
Enable GUI notifications about the low level of available memory. Enable GUI notifications about the low level of available memory.
Valid values are True and False. Valid values are True and False.
gui_low_memory_warnings = True gui_low_memory_warnings = False
Minimum time between sending notifications, in seconds. Minimum time between sending notifications, in seconds.
Valid values are floating-point numbers from the range [1; 300]. Valid values are floating-point numbers from the range [1; 300].
@ -304,5 +304,5 @@ print_mem_check_results = True
Print sleep periods between memory checks. Print sleep periods between memory checks.
Valid values are True and False. Valid values are True and False.
print_sleep_periods = False print_sleep_periods = True

View File

@ -1,53 +1,17 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
#
# Usage:
# ./nohang_notify_helper "title" "body"
# nohang_notify_low_mem --mem '14% 12%' --name 'stress' --pid '6666' from sys import argv
# need UID=0
# output:
# Low memory: 14% 12%
# Fattest process: 6666, stress
# need to remove this slow and fat parser
from argparse import ArgumentParser
from os import listdir from os import listdir
from subprocess import Popen, TimeoutExpired from subprocess import Popen, TimeoutExpired
wait_time = 10 if len(argv) < 2 or argv[1] == "-h" or argv[1] == "--help":
print('Usage: ./nohang_notify_helper "title" "body"')
exit(1)
parser = ArgumentParser() wait_time = 12
parser.add_argument(
'--mem',
help="""available memory percent (15%, for example)""",
default=None,
type=str
)
parser.add_argument(
'--pid',
help="""pid""",
default=None,
type=str
)
parser.add_argument(
'--name',
help="""process name""",
default=None,
type=str
)
args = parser.parse_args()
pid = args.pid
name = args.name
mem = args.mem
title = 'Low memory: {}'.format(mem)
body = 'Fattest process: <b>{}</b>, <b>{}</b>'.format(pid, name)
display_env = 'DISPLAY=' display_env = 'DISPLAY='
dbus_env = 'DBUS_SESSION_BUS_ADDRESS=' dbus_env = 'DBUS_SESSION_BUS_ADDRESS='
@ -69,9 +33,6 @@ def re_pid_environ(pid):
'DBUS_SESSION_BUS_ADDRESS=unix:path=/run/user/1000/bus') 'DBUS_SESSION_BUS_ADDRESS=unix:path=/run/user/1000/bus')
returns None if these vars is not in /proc/[pid]/environ returns None if these vars is not in /proc/[pid]/environ
""" """
display_env = 'DISPLAY='
dbus_env = 'DBUS_SESSION_BUS_ADDRESS='
user_env = 'USER='
try: try:
env = str(rline1('/proc/' + pid + '/environ')) env = str(rline1('/proc/' + pid + '/environ'))
if display_env in env and dbus_env in env and user_env in env: if display_env in env and dbus_env in env and user_env in env:
@ -88,8 +49,6 @@ def re_pid_environ(pid):
continue continue
if i.startswith(dbus_env): if i.startswith(dbus_env):
#if ',guid=' in i:
# return None
dbus = i dbus = i
continue continue
@ -119,27 +78,27 @@ def root_notify_env():
env = set(unsorted_envs_list) env = set(unsorted_envs_list)
env.discard(None) env.discard(None)
# deduplicate dbus
new_env = [] new_env = []
end = [] end = []
for i in env: for i in env:
#print(i)
key = i[0] + i[1] key = i[0] + i[1]
#print(key)
if key not in end: if key not in end:
end.append(key) end.append(key)
new_env.append(i) new_env.append(i)
else: else:
continue continue
#print(new_env)
return new_env return new_env
b = root_notify_env() list_with_envs = root_notify_env()
# if somebody logged in with GUI # if somebody logged in with GUI
if len(b) > 0: if len(list_with_envs) > 0:
# iterating over logged-in users # iterating over logged-in users
for i in b: for i in list_with_envs:
username, display_env, dbus_env = i[0], i[1], i[2] username, display_env, dbus_env = i[0], i[1], i[2]
display_tuple = display_env.partition('=') display_tuple = display_env.partition('=')
dbus_tuple = dbus_env.partition('=') dbus_tuple = dbus_env.partition('=')
@ -148,7 +107,7 @@ if len(b) > 0:
with Popen(['sudo', '-u', username, with Popen(['sudo', '-u', username,
'notify-send', '--icon=dialog-warning', 'notify-send', '--icon=dialog-warning',
'{}'.format(title), '{}'.format(body) argv[1], argv[2]
], env={ ], env={
display_key: display_value, display_key: display_value,
dbus_key: dbus_value dbus_key: dbus_value
@ -159,4 +118,4 @@ if len(b) > 0:
proc.kill() proc.kill()
print('TimeoutExpired: notify' + username) print('TimeoutExpired: notify' + username)
else: else:
print('Low memory warnings: nobody logged in with GUI. Nothing to do.') print('Nobody logged-in with GUI. Nothing to do.')