fix alerts

This commit is contained in:
Alexey Avramov 2019-07-18 22:57:41 +09:00
parent 9f438726b4
commit 75f05959fc
5 changed files with 66 additions and 86 deletions

51
nohang
View File

@ -344,7 +344,7 @@ def log(*msg):
sleep(0.01) sleep(0.01)
if separate_log: if separate_log:
try: try:
info(*msg) logging.info(*msg)
except OSError: except OSError:
sleep(0.01) sleep(0.01)
@ -835,16 +835,22 @@ def check_zram():
return (mem_used_total_sum + disksize_sum * ZRAM_DISKSIZE_FACTOR) / 1024.0 return (mem_used_total_sum + disksize_sum * ZRAM_DISKSIZE_FACTOR) / 1024.0
def format_time(t): def format_time(t):
""" """
""" """
t = int(t) t = int(t)
if t < 60: if t < 60:
return '{} sec'.format(t) return '{} sec'.format(t)
elif t >= 60 and t < 3600:
elif (t < 3600 and t >= 60):
m = t // 60 m = t // 60
s = t % 60 s = t % 60
return '{} min {} sec'.format(m, s) return '{} min {} sec'.format(m, s)
else: else:
h = t // 3600 h = t // 3600
s0 = t - h * 3600 s0 = t - h * 3600
@ -853,6 +859,8 @@ def format_time(t):
return '{} h {} min {} sec'.format(h, m, s) return '{} h {} min {} sec'.format(h, m, s)
def string_to_float_convert_test(string): def string_to_float_convert_test(string):
"""Try to interprete string values as floats.""" """Try to interprete string values as floats."""
try: try:
@ -1002,11 +1010,6 @@ def send_notify(threshold, name, pid):
pid: str process pid pid: str process pid
""" """
# wait for memory release after corrective action
# may be useful if free memory was about 0 immediately after
# corrective action
sleep(0.05)
title = 'Freeze prevention' title = 'Freeze prevention'
body = '<b>{}</b> [{}] <b>{}</b>'.format( body = '<b>{}</b> [{}] <b>{}</b>'.format(
notify_sig_dict[threshold], notify_sig_dict[threshold],
@ -1237,9 +1240,14 @@ def find_victim_info(pid, victim_badness, name):
state = line.split('\t')[1].rstrip() state = line.split('\t')[1].rstrip()
continue continue
"""
if n is ppid_index: if n is ppid_index:
ppid = line.split('\t')[1] # ppid = line.split('\t')[1]
continue continue
"""
if n is uid_index: if n is uid_index:
uid = line.split('\t')[2] uid = line.split('\t')[2]
@ -1299,8 +1307,13 @@ def find_victim_info(pid, victim_badness, name):
if i is state_index: if i is state_index:
state = f_list[i].split('\t')[1].rstrip() state = f_list[i].split('\t')[1].rstrip()
"""
if i is ppid_index: if i is ppid_index:
ppid = f_list[i].split('\t')[1] pass
# ppid = f_list[i].split('\t')[1]
"""
if i is uid_index: if i is uid_index:
uid = f_list[i].split('\t')[2] uid = f_list[i].split('\t')[2]
@ -1901,10 +1914,14 @@ def implement_corrective_action(
pid)).replace('$SERVICE', service) pid)).replace('$SERVICE', service)
exit_status = exe(cmd) exit_status = exe(cmd)
"""
if exit_status == 0: if exit_status == 0:
success = True success = True
else: else:
success = False success = False
"""
response_time = time() - time0 response_time = time() - time0
@ -1937,18 +1954,16 @@ def implement_corrective_action(
except FileNotFoundError: except FileNotFoundError:
vwd = True vwd = True
success = False # success = False
response_time = time() - time0 response_time = time() - time0
send_result = 'no such process; response time: {} ms'.format( # send_result = 'no such process; response time: {} ms'.format(round(response_time * 1000))
round(response_time * 1000))
key = 'The victim died in the search process: ' \ key = 'The victim died in the search process: ' \
'FileNotFoundError' 'FileNotFoundError'
except ProcessLookupError: except ProcessLookupError:
vwd = True vwd = True
success = False # success = False
response_time = time() - time0 response_time = time() - time0
send_result = 'no such process; response time: {} ms'.format( # send_result = 'no such process; response time: {} ms'.format(round(response_time * 1000))
round(response_time * 1000))
key = 'The victim died in the search process: ' \ key = 'The victim died in the search process: ' \
'ProcessLookupError' 'ProcessLookupError'
@ -1974,7 +1989,7 @@ def implement_corrective_action(
# print(v_dict) # print(v_dict)
response_time = time() - time0 # response_time = time() - time0
# log('success: ' + str(success)) # log('success: ' + str(success))
# log('victim will die: ' + str(vwd)) # log('victim will die: ' + str(vwd))
@ -2934,8 +2949,6 @@ separate_log = conf_parse_bool('separate_log')
if separate_log: if separate_log:
import logging import logging
from logging import basicConfig
from logging import info
log_dir = '/var/log/nohang' log_dir = '/var/log/nohang'
@ -2957,7 +2970,7 @@ if separate_log:
print('ERROR: log PermissionError') print('ERROR: log PermissionError')
try: try:
basicConfig( logging.basicConfig(
filename=logfile, filename=logfile,
level=logging.INFO, level=logging.INFO,
format="%(asctime)s: %(message)s") format="%(asctime)s: %(message)s")

View File

@ -119,7 +119,7 @@ except Exception:
try: try:
from os import listdir, path, remove from os import listdir, path
from subprocess import Popen, TimeoutExpired from subprocess import Popen, TimeoutExpired
from sys import argv from sys import argv
except OSError: except OSError:

View File

@ -45,13 +45,13 @@ def pid_to_status_units(pid):
if i is 1: if i is 1:
name = f_list[0].split('\t')[1] name = f_list[0].split('\t')[1]
if i is uid_index: if i == uid_index:
uid = f_list[i].split('\t')[2] uid = f_list[i].split('\t')[2]
if i is vm_rss_index: if i == vm_rss_index:
vm_rss = f_list[i].split('\t')[1][:-3] vm_rss = f_list[i].split('\t')[1][:-3]
if i is vm_swap_index: if i == vm_swap_index:
vm_swap = f_list[i].split('\t')[1][:-3] vm_swap = f_list[i].split('\t')[1][:-3]
return name, uid, vm_rss, vm_swap return name, uid, vm_rss, vm_swap

View File

@ -3,19 +3,7 @@
from ctypes import CDLL from ctypes import CDLL
from time import sleep from time import sleep
from sys import argv from sys import argv
import os
"""
Execute the command
find /sys/fs/cgroup -name memory.pressure
to find available memory.pressue files (except /proc/pressure/memory).
(actual for cgroup2)
"""
if len(argv) > 1:
psi_path = argv[1]
else:
psi_path = '/proc/pressure/memory'
def mlockall(): def mlockall():
@ -33,16 +21,13 @@ def mlockall():
MCL_CURRENT | MCL_FUTURE MCL_CURRENT | MCL_FUTURE
) )
if result != 0: if result != 0:
print('WARNING: cannot lock all memory') pass
else: else:
pass pass
else: else:
pass pass
mlockall()
def psi_path_to_metrics(psi_path): def psi_path_to_metrics(psi_path):
with open(psi_path) as f: with open(psi_path) as f:
@ -62,8 +47,18 @@ def psi_path_to_metrics(psi_path):
full_avg10, full_avg60, full_avg300) full_avg10, full_avg60, full_avg300)
print('Path to PSI file: {}\n'.format(psi_path)) if len(argv) > 1:
psi_path = argv[1]
else:
psi_path = '/proc/pressure/memory'
if not os.path.exists(psi_path):
print('PSI path does not exist. Exit.')
exit()
mlockall()
print('Path to PSI file: {}\n'.format(psi_path))
print(' avg10 avg60 avg300 avg10 avg60 avg300') print(' avg10 avg60 avg300 avg10 avg60 avg300')

62
psi-top
View File

@ -1,45 +1,9 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
from ctypes import CDLL
from time import sleep, time
import os import os
"""
Execute the command
find /sys/fs/cgroup -name memory.pressure
to find available memory.pressue files (except /proc/pressure/memory).
(actual for cgroup2)
"""
psi_path = '/proc/pressure/memory' psi_path = '/proc/pressure/memory'
def mlockall():
MCL_CURRENT = 1
MCL_FUTURE = 2
MCL_ONFAULT = 4
libc = CDLL('libc.so.6', use_errno=True)
result = libc.mlockall(
MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT
)
if result != 0:
result = libc.mlockall(
MCL_CURRENT | MCL_FUTURE
)
if result != 0:
print('WARNING: cannot lock all memory')
else:
pass
else:
pass
mlockall()
t0 = time()
def psi_path_to_metrics(psi_path): def psi_path_to_metrics(psi_path):
with open(psi_path) as f: with open(psi_path) as f:
@ -59,7 +23,6 @@ def psi_path_to_metrics(psi_path):
full_avg10, full_avg60, full_avg300) full_avg10, full_avg60, full_avg300)
def cgroup2_root(): def cgroup2_root():
""" """
""" """
@ -93,7 +56,19 @@ def psi_path_to_cgroup2(path):
i = cgroup2_root() i = cgroup2_root()
print('cgroup2 root dir:', i) if i is None:
print('cgroup2 not mounted')
else:
print('cgroup2 root dir:', i)
psi_support = os.path.exists(psi_path)
if not psi_support:
print('PSI is not supported, /proc/pressure/memory does not exist. Exit.')
exit(1)
if i is not None: if i is not None:
y = get_psi_mem_files(i) y = get_psi_mem_files(i)
for path in y: for path in y:
@ -105,14 +80,16 @@ print(' avg10 avg60 avg300 avg10 avg60 avg300 cgroup2')
print(' ----- ----- ------ ----- ----- ------ ---------') print(' ----- ----- ------ ----- ----- ------ ---------')
(some_avg10, some_avg60, some_avg300, full_avg10, full_avg60, full_avg300) = psi_path_to_metrics('/proc/pressure/memory') (some_avg10, some_avg60, some_avg300, full_avg10, full_avg60, full_avg300
) = psi_path_to_metrics('/proc/pressure/memory')
print('some {} {} {} | full {} {} {} {}'.format( print('some {} {} {} | full {} {} {} {}'.format(
some_avg10.rjust(6), some_avg10.rjust(6),
some_avg60.rjust(6), some_avg60.rjust(6),
some_avg300.rjust(6), some_avg300.rjust(6),
full_avg10.rjust(6), full_avg10.rjust(6),
full_avg60.rjust(6), full_avg60.rjust(6),
full_avg300.rjust(6), '[SYSTEM]')) full_avg300.rjust(6), '[SYSTEM_WIDE]'))
for psi_path in path_list: for psi_path in path_list:
@ -126,8 +103,3 @@ for psi_path in path_list:
full_avg10.rjust(6), full_avg10.rjust(6),
full_avg60.rjust(6), full_avg60.rjust(6),
full_avg300.rjust(6), psi_path_to_cgroup2(psi_path))) full_avg300.rjust(6), psi_path_to_cgroup2(psi_path)))
print(time() - t0)