fix alerts
This commit is contained in:
parent
9f438726b4
commit
75f05959fc
51
nohang
51
nohang
@ -344,7 +344,7 @@ def log(*msg):
|
|||||||
sleep(0.01)
|
sleep(0.01)
|
||||||
if separate_log:
|
if separate_log:
|
||||||
try:
|
try:
|
||||||
info(*msg)
|
logging.info(*msg)
|
||||||
except OSError:
|
except OSError:
|
||||||
sleep(0.01)
|
sleep(0.01)
|
||||||
|
|
||||||
@ -835,16 +835,22 @@ def check_zram():
|
|||||||
return (mem_used_total_sum + disksize_sum * ZRAM_DISKSIZE_FACTOR) / 1024.0
|
return (mem_used_total_sum + disksize_sum * ZRAM_DISKSIZE_FACTOR) / 1024.0
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def format_time(t):
|
def format_time(t):
|
||||||
"""
|
"""
|
||||||
"""
|
"""
|
||||||
|
|
||||||
t = int(t)
|
t = int(t)
|
||||||
|
|
||||||
if t < 60:
|
if t < 60:
|
||||||
return '{} sec'.format(t)
|
return '{} sec'.format(t)
|
||||||
elif t >= 60 and t < 3600:
|
|
||||||
|
elif (t < 3600 and t >= 60):
|
||||||
m = t // 60
|
m = t // 60
|
||||||
s = t % 60
|
s = t % 60
|
||||||
return '{} min {} sec'.format(m, s)
|
return '{} min {} sec'.format(m, s)
|
||||||
|
|
||||||
else:
|
else:
|
||||||
h = t // 3600
|
h = t // 3600
|
||||||
s0 = t - h * 3600
|
s0 = t - h * 3600
|
||||||
@ -853,6 +859,8 @@ def format_time(t):
|
|||||||
return '{} h {} min {} sec'.format(h, m, s)
|
return '{} h {} min {} sec'.format(h, m, s)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def string_to_float_convert_test(string):
|
def string_to_float_convert_test(string):
|
||||||
"""Try to interprete string values as floats."""
|
"""Try to interprete string values as floats."""
|
||||||
try:
|
try:
|
||||||
@ -1002,11 +1010,6 @@ def send_notify(threshold, name, pid):
|
|||||||
pid: str process pid
|
pid: str process pid
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# wait for memory release after corrective action
|
|
||||||
# may be useful if free memory was about 0 immediately after
|
|
||||||
# corrective action
|
|
||||||
sleep(0.05)
|
|
||||||
|
|
||||||
title = 'Freeze prevention'
|
title = 'Freeze prevention'
|
||||||
body = '<b>{}</b> [{}] <b>{}</b>'.format(
|
body = '<b>{}</b> [{}] <b>{}</b>'.format(
|
||||||
notify_sig_dict[threshold],
|
notify_sig_dict[threshold],
|
||||||
@ -1237,9 +1240,14 @@ def find_victim_info(pid, victim_badness, name):
|
|||||||
state = line.split('\t')[1].rstrip()
|
state = line.split('\t')[1].rstrip()
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
|
||||||
|
"""
|
||||||
if n is ppid_index:
|
if n is ppid_index:
|
||||||
ppid = line.split('\t')[1]
|
# ppid = line.split('\t')[1]
|
||||||
continue
|
continue
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
if n is uid_index:
|
if n is uid_index:
|
||||||
uid = line.split('\t')[2]
|
uid = line.split('\t')[2]
|
||||||
@ -1299,8 +1307,13 @@ def find_victim_info(pid, victim_badness, name):
|
|||||||
if i is state_index:
|
if i is state_index:
|
||||||
state = f_list[i].split('\t')[1].rstrip()
|
state = f_list[i].split('\t')[1].rstrip()
|
||||||
|
|
||||||
|
|
||||||
|
"""
|
||||||
if i is ppid_index:
|
if i is ppid_index:
|
||||||
ppid = f_list[i].split('\t')[1]
|
pass
|
||||||
|
# ppid = f_list[i].split('\t')[1]
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
if i is uid_index:
|
if i is uid_index:
|
||||||
uid = f_list[i].split('\t')[2]
|
uid = f_list[i].split('\t')[2]
|
||||||
@ -1901,10 +1914,14 @@ def implement_corrective_action(
|
|||||||
pid)).replace('$SERVICE', service)
|
pid)).replace('$SERVICE', service)
|
||||||
exit_status = exe(cmd)
|
exit_status = exe(cmd)
|
||||||
|
|
||||||
|
"""
|
||||||
if exit_status == 0:
|
if exit_status == 0:
|
||||||
success = True
|
success = True
|
||||||
else:
|
else:
|
||||||
success = False
|
success = False
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
response_time = time() - time0
|
response_time = time() - time0
|
||||||
|
|
||||||
@ -1937,18 +1954,16 @@ def implement_corrective_action(
|
|||||||
|
|
||||||
except FileNotFoundError:
|
except FileNotFoundError:
|
||||||
vwd = True
|
vwd = True
|
||||||
success = False
|
# success = False
|
||||||
response_time = time() - time0
|
response_time = time() - time0
|
||||||
send_result = 'no such process; response time: {} ms'.format(
|
# send_result = 'no such process; response time: {} ms'.format(round(response_time * 1000))
|
||||||
round(response_time * 1000))
|
|
||||||
key = 'The victim died in the search process: ' \
|
key = 'The victim died in the search process: ' \
|
||||||
'FileNotFoundError'
|
'FileNotFoundError'
|
||||||
except ProcessLookupError:
|
except ProcessLookupError:
|
||||||
vwd = True
|
vwd = True
|
||||||
success = False
|
# success = False
|
||||||
response_time = time() - time0
|
response_time = time() - time0
|
||||||
send_result = 'no such process; response time: {} ms'.format(
|
# send_result = 'no such process; response time: {} ms'.format(round(response_time * 1000))
|
||||||
round(response_time * 1000))
|
|
||||||
key = 'The victim died in the search process: ' \
|
key = 'The victim died in the search process: ' \
|
||||||
'ProcessLookupError'
|
'ProcessLookupError'
|
||||||
|
|
||||||
@ -1974,7 +1989,7 @@ def implement_corrective_action(
|
|||||||
|
|
||||||
# print(v_dict)
|
# print(v_dict)
|
||||||
|
|
||||||
response_time = time() - time0
|
# response_time = time() - time0
|
||||||
|
|
||||||
# log('success: ' + str(success))
|
# log('success: ' + str(success))
|
||||||
# log('victim will die: ' + str(vwd))
|
# log('victim will die: ' + str(vwd))
|
||||||
@ -2934,8 +2949,6 @@ separate_log = conf_parse_bool('separate_log')
|
|||||||
if separate_log:
|
if separate_log:
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
from logging import basicConfig
|
|
||||||
from logging import info
|
|
||||||
|
|
||||||
log_dir = '/var/log/nohang'
|
log_dir = '/var/log/nohang'
|
||||||
|
|
||||||
@ -2957,7 +2970,7 @@ if separate_log:
|
|||||||
print('ERROR: log PermissionError')
|
print('ERROR: log PermissionError')
|
||||||
|
|
||||||
try:
|
try:
|
||||||
basicConfig(
|
logging.basicConfig(
|
||||||
filename=logfile,
|
filename=logfile,
|
||||||
level=logging.INFO,
|
level=logging.INFO,
|
||||||
format="%(asctime)s: %(message)s")
|
format="%(asctime)s: %(message)s")
|
||||||
|
@ -119,7 +119,7 @@ except Exception:
|
|||||||
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
from os import listdir, path, remove
|
from os import listdir, path
|
||||||
from subprocess import Popen, TimeoutExpired
|
from subprocess import Popen, TimeoutExpired
|
||||||
from sys import argv
|
from sys import argv
|
||||||
except OSError:
|
except OSError:
|
||||||
|
6
oom-sort
6
oom-sort
@ -45,13 +45,13 @@ def pid_to_status_units(pid):
|
|||||||
if i is 1:
|
if i is 1:
|
||||||
name = f_list[0].split('\t')[1]
|
name = f_list[0].split('\t')[1]
|
||||||
|
|
||||||
if i is uid_index:
|
if i == uid_index:
|
||||||
uid = f_list[i].split('\t')[2]
|
uid = f_list[i].split('\t')[2]
|
||||||
|
|
||||||
if i is vm_rss_index:
|
if i == vm_rss_index:
|
||||||
vm_rss = f_list[i].split('\t')[1][:-3]
|
vm_rss = f_list[i].split('\t')[1][:-3]
|
||||||
|
|
||||||
if i is vm_swap_index:
|
if i == vm_swap_index:
|
||||||
vm_swap = f_list[i].split('\t')[1][:-3]
|
vm_swap = f_list[i].split('\t')[1][:-3]
|
||||||
|
|
||||||
return name, uid, vm_rss, vm_swap
|
return name, uid, vm_rss, vm_swap
|
||||||
|
31
psi-monitor
31
psi-monitor
@ -3,19 +3,7 @@
|
|||||||
from ctypes import CDLL
|
from ctypes import CDLL
|
||||||
from time import sleep
|
from time import sleep
|
||||||
from sys import argv
|
from sys import argv
|
||||||
|
import os
|
||||||
"""
|
|
||||||
Execute the command
|
|
||||||
find /sys/fs/cgroup -name memory.pressure
|
|
||||||
to find available memory.pressue files (except /proc/pressure/memory).
|
|
||||||
(actual for cgroup2)
|
|
||||||
"""
|
|
||||||
|
|
||||||
if len(argv) > 1:
|
|
||||||
psi_path = argv[1]
|
|
||||||
else:
|
|
||||||
psi_path = '/proc/pressure/memory'
|
|
||||||
|
|
||||||
|
|
||||||
def mlockall():
|
def mlockall():
|
||||||
|
|
||||||
@ -33,16 +21,13 @@ def mlockall():
|
|||||||
MCL_CURRENT | MCL_FUTURE
|
MCL_CURRENT | MCL_FUTURE
|
||||||
)
|
)
|
||||||
if result != 0:
|
if result != 0:
|
||||||
print('WARNING: cannot lock all memory')
|
pass
|
||||||
else:
|
else:
|
||||||
pass
|
pass
|
||||||
else:
|
else:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
mlockall()
|
|
||||||
|
|
||||||
|
|
||||||
def psi_path_to_metrics(psi_path):
|
def psi_path_to_metrics(psi_path):
|
||||||
|
|
||||||
with open(psi_path) as f:
|
with open(psi_path) as f:
|
||||||
@ -62,8 +47,18 @@ def psi_path_to_metrics(psi_path):
|
|||||||
full_avg10, full_avg60, full_avg300)
|
full_avg10, full_avg60, full_avg300)
|
||||||
|
|
||||||
|
|
||||||
print('Path to PSI file: {}\n'.format(psi_path))
|
if len(argv) > 1:
|
||||||
|
psi_path = argv[1]
|
||||||
|
else:
|
||||||
|
psi_path = '/proc/pressure/memory'
|
||||||
|
|
||||||
|
if not os.path.exists(psi_path):
|
||||||
|
print('PSI path does not exist. Exit.')
|
||||||
|
exit()
|
||||||
|
|
||||||
|
mlockall()
|
||||||
|
|
||||||
|
print('Path to PSI file: {}\n'.format(psi_path))
|
||||||
|
|
||||||
print(' avg10 avg60 avg300 avg10 avg60 avg300')
|
print(' avg10 avg60 avg300 avg10 avg60 avg300')
|
||||||
|
|
||||||
|
62
psi-top
62
psi-top
@ -1,45 +1,9 @@
|
|||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
from ctypes import CDLL
|
|
||||||
from time import sleep, time
|
|
||||||
import os
|
import os
|
||||||
|
|
||||||
"""
|
|
||||||
Execute the command
|
|
||||||
find /sys/fs/cgroup -name memory.pressure
|
|
||||||
to find available memory.pressue files (except /proc/pressure/memory).
|
|
||||||
(actual for cgroup2)
|
|
||||||
"""
|
|
||||||
|
|
||||||
psi_path = '/proc/pressure/memory'
|
psi_path = '/proc/pressure/memory'
|
||||||
|
|
||||||
def mlockall():
|
|
||||||
|
|
||||||
MCL_CURRENT = 1
|
|
||||||
MCL_FUTURE = 2
|
|
||||||
MCL_ONFAULT = 4
|
|
||||||
|
|
||||||
libc = CDLL('libc.so.6', use_errno=True)
|
|
||||||
|
|
||||||
result = libc.mlockall(
|
|
||||||
MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT
|
|
||||||
)
|
|
||||||
if result != 0:
|
|
||||||
result = libc.mlockall(
|
|
||||||
MCL_CURRENT | MCL_FUTURE
|
|
||||||
)
|
|
||||||
if result != 0:
|
|
||||||
print('WARNING: cannot lock all memory')
|
|
||||||
else:
|
|
||||||
pass
|
|
||||||
else:
|
|
||||||
pass
|
|
||||||
|
|
||||||
|
|
||||||
mlockall()
|
|
||||||
|
|
||||||
t0 = time()
|
|
||||||
|
|
||||||
def psi_path_to_metrics(psi_path):
|
def psi_path_to_metrics(psi_path):
|
||||||
|
|
||||||
with open(psi_path) as f:
|
with open(psi_path) as f:
|
||||||
@ -59,7 +23,6 @@ def psi_path_to_metrics(psi_path):
|
|||||||
full_avg10, full_avg60, full_avg300)
|
full_avg10, full_avg60, full_avg300)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def cgroup2_root():
|
def cgroup2_root():
|
||||||
"""
|
"""
|
||||||
"""
|
"""
|
||||||
@ -93,7 +56,19 @@ def psi_path_to_cgroup2(path):
|
|||||||
|
|
||||||
i = cgroup2_root()
|
i = cgroup2_root()
|
||||||
|
|
||||||
print('cgroup2 root dir:', i)
|
if i is None:
|
||||||
|
print('cgroup2 not mounted')
|
||||||
|
else:
|
||||||
|
print('cgroup2 root dir:', i)
|
||||||
|
|
||||||
|
|
||||||
|
psi_support = os.path.exists(psi_path)
|
||||||
|
|
||||||
|
if not psi_support:
|
||||||
|
print('PSI is not supported, /proc/pressure/memory does not exist. Exit.')
|
||||||
|
exit(1)
|
||||||
|
|
||||||
|
|
||||||
if i is not None:
|
if i is not None:
|
||||||
y = get_psi_mem_files(i)
|
y = get_psi_mem_files(i)
|
||||||
for path in y:
|
for path in y:
|
||||||
@ -105,14 +80,16 @@ print(' avg10 avg60 avg300 avg10 avg60 avg300 cgroup2')
|
|||||||
|
|
||||||
print(' ----- ----- ------ ----- ----- ------ ---------')
|
print(' ----- ----- ------ ----- ----- ------ ---------')
|
||||||
|
|
||||||
(some_avg10, some_avg60, some_avg300, full_avg10, full_avg60, full_avg300) = psi_path_to_metrics('/proc/pressure/memory')
|
(some_avg10, some_avg60, some_avg300, full_avg10, full_avg60, full_avg300
|
||||||
|
) = psi_path_to_metrics('/proc/pressure/memory')
|
||||||
|
|
||||||
print('some {} {} {} | full {} {} {} {}'.format(
|
print('some {} {} {} | full {} {} {} {}'.format(
|
||||||
some_avg10.rjust(6),
|
some_avg10.rjust(6),
|
||||||
some_avg60.rjust(6),
|
some_avg60.rjust(6),
|
||||||
some_avg300.rjust(6),
|
some_avg300.rjust(6),
|
||||||
full_avg10.rjust(6),
|
full_avg10.rjust(6),
|
||||||
full_avg60.rjust(6),
|
full_avg60.rjust(6),
|
||||||
full_avg300.rjust(6), '[SYSTEM]'))
|
full_avg300.rjust(6), '[SYSTEM_WIDE]'))
|
||||||
|
|
||||||
|
|
||||||
for psi_path in path_list:
|
for psi_path in path_list:
|
||||||
@ -126,8 +103,3 @@ for psi_path in path_list:
|
|||||||
full_avg10.rjust(6),
|
full_avg10.rjust(6),
|
||||||
full_avg60.rjust(6),
|
full_avg60.rjust(6),
|
||||||
full_avg300.rjust(6), psi_path_to_cgroup2(psi_path)))
|
full_avg300.rjust(6), psi_path_to_cgroup2(psi_path)))
|
||||||
|
|
||||||
|
|
||||||
print(time() - t0)
|
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user