Improve PSI debug and fix #91

This commit is contained in:
Alexey Avramov 2020-05-04 01:34:10 +09:00
parent 16f7db180a
commit 01f17c4690

View File

@@ -1306,7 +1306,7 @@ def print_stat_dict():
def find_psi_metrics_value(psi_path, psi_metrics):
"""
"""
if psi_support:
try:
if psi_metrics == 'some_avg10':
return float(rline1(psi_path).split(' ')[1].split('=')[1])
@@ -1328,7 +1328,13 @@ def find_psi_metrics_value(psi_path, psi_metrics):
psi_list = f.readlines()
return float(psi_list[1].split(' ')[3].split('=')[1])
except Exception as e:
if debug_psi:
log('Invalid psi_path: {}'.format(e))
return None
''''
def check_mem_and_swap0():
"""
"""
@@ -1337,6 +1343,7 @@ def check_mem_and_swap0():
return (int(m_list[mem_available_index].split(':')[1]),
int(m_list[swap_total_index].split(':')[1]),
int(m_list[swap_free_index].split(':')[1]))
'''
def check_mem_and_swap():
@@ -2133,6 +2140,11 @@ def check_psi_ex(psi_t0, psi_kill_exceeded_timer, psi_term_exceeded_timer, x0,
if not (ma_warning_threshold_exceded or ma_soft_threshold_exceded or
ma_hard_threshold_exceded) or swap_total == 0:
if debug_psi:
log('Do not measure the value of PSI, since none of the thresho'
'lds of available memory is exceeded')
return (None, None,
psi_t0, psi_kill_exceeded_timer,
psi_term_exceeded_timer, x0)
@@ -2142,6 +2154,17 @@ def check_psi_ex(psi_t0, psi_kill_exceeded_timer, psi_term_exceeded_timer, x0,
psi_avg_value = find_psi_metrics_value(psi_path, psi_metrics)
if debug_psi:
log('-------------------------------------------------------------'
'-----------')
log('PSI {} value in {}: {}'.format(
psi_metrics, psi_path, psi_avg_value))
if psi_avg_value is None:
return (None, None, psi_t0, -0.0001, -0.0001, x0)
psi_post_action_delay_timer = monotonic() - last_action_dict['t'] # psi_t0
if psi_post_action_delay_timer >= psi_post_action_delay:
@@ -2164,9 +2187,6 @@ def check_psi_ex(psi_t0, psi_kill_exceeded_timer, psi_term_exceeded_timer, x0,
if debug_psi:
log('-------------------------------------------------------------'
'-----------')
log('psi_post_action_delay_timer: {}, psi_post_action_delay_exceed'
'ed: {}'.format(
round(psi_post_action_delay_timer, 1),
@@ -2379,7 +2399,7 @@ def implement_corrective_action(
(masf_threshold, masf_info, mem_available, hard_threshold_min_swap_kb,
soft_threshold_min_swap_kb, swap_free, swap_total) = check_mem_swap_ex()
if CHECK_ZRAM:
if zram_checking_enabled:
zram_threshold, zram_info, mem_used_zram = check_zram_ex()
if CHECK_PSI:
@@ -2490,7 +2510,7 @@ def implement_corrective_action(
round(mid['swap_used'] / 1024),
round(mid['swap_free'] / 1024)
))
if psi_support:
if PSI_KERNEL_OK:
mp = memory_pressure()
log('Memory pressure (system-wide):')
log(' some avg10={} avg60={} avg300={}'.format(
@@ -2756,7 +2776,7 @@ def sleep_after_check_mem():
t_mem = mem_point / fill_rate_mem
t_swap = swap_point / fill_rate_swap
if CHECK_ZRAM:
if zram_checking_enabled:
t_zram = (mem_total * 0.8 - mem_used_zram) / fill_rate_zram
if t_zram < 0:
t_zram = 0
@@ -2767,7 +2787,7 @@ def sleep_after_check_mem():
t_mem_swap = t_mem + t_swap
if CHECK_ZRAM:
if zram_checking_enabled:
if t_mem_swap <= t_mem_zram:
t = t_mem_swap
@@ -3278,10 +3298,43 @@ else:
###############################################################################
# extracting parameters from the dictionary
# check for all necessary parameters
# validation of all parameters
separate_log = conf_parse_bool('separate_log')
if separate_log:
import logging
log_dir = '/var/log/nohang'
logfile = log_dir + '/nohang.log'
try:
os.mkdir(log_dir)
except FileExistsError:
pass
except PermissionError:
errprint('ERROR: cannot create {}'.format(log_dir))
try:
os.chmod(log_dir, mode=0o750)
except FileNotFoundError:
errprint('ERROR: file not found: {}'.format(log_dir))
except PermissionError:
errprint('ERROR: permission denied: {}'.format(log_dir))
try:
logging.basicConfig(
filename=logfile,
level=logging.INFO,
format="%(asctime)s: %(message)s")
except FileNotFoundError:
errprint('ERROR: file not found: {}'.format(logfile))
except PermissionError:
errprint('ERROR: permission denied: {}'.format(logfile))
debug_psi = conf_parse_bool('debug_psi')
print_statistics = conf_parse_bool('print_statistics')
print_proc_table = conf_parse_bool('print_proc_table')
@@ -3300,20 +3353,21 @@ debug_threading = conf_parse_bool('debug_threading')
psi_checking_enabled = conf_parse_bool('psi_checking_enabled')
ignore_psi = not psi_checking_enabled
if psi_checking_enabled:
try:
psi_file_mem_to_metrics('/proc/pressure/memory')
except Exception as e:
print('WARNING: PSI metrics are not provided by the kernel: {}'.format(
try:
psi_file_mem_to_metrics('/proc/pressure/memory')
PSI_KERNEL_OK = True
except Exception as e:
PSI_KERNEL_OK = False
if psi_checking_enabled:
log('WARNING: PSI metrics are not provided by the kernel: {}'.format(
e))
ignore_psi = True
if PSI_KERNEL_OK and psi_checking_enabled:
CHECK_PSI = True
else:
CHECK_PSI = False
zram_checking_enabled = conf_parse_bool('zram_checking_enabled')
ignore_zram = not zram_checking_enabled
debug_gui_notifications = conf_parse_bool('debug_gui_notifications')
ignore_positive_oom_score_adj = conf_parse_bool(
@@ -3515,7 +3569,7 @@ else:
if 'psi_path' in config_dict:
psi_path = config_dict['psi_path']
if not ignore_psi:
if CHECK_PSI:
try:
psi_file_mem_to_metrics(psi_path)
except Exception as e:
@@ -3556,40 +3610,6 @@ else:
missing_config_key('extra_table_info')
separate_log = conf_parse_bool('separate_log')
if separate_log:
import logging
log_dir = '/var/log/nohang'
logfile = log_dir + '/nohang.log'
try:
os.mkdir(log_dir)
except FileExistsError:
pass
except PermissionError:
errprint('ERROR: cannot create {}'.format(log_dir))
try:
os.chmod(log_dir, mode=0o750)
except FileNotFoundError:
errprint('ERROR: file not found: {}'.format(log_dir))
except PermissionError:
errprint('ERROR: permission denied: {}'.format(log_dir))
try:
logging.basicConfig(
filename=logfile,
level=logging.INFO,
format="%(asctime)s: %(message)s")
except FileNotFoundError:
errprint('ERROR: file not found: {}'.format(logfile))
except PermissionError:
errprint('ERROR: permission denied: {}'.format(logfile))
if 'min_mem_report_interval' in config_dict:
min_mem_report_interval = string_to_float_convert_test(
config_dict['min_mem_report_interval'])
@@ -3652,9 +3672,6 @@ if (low_memory_warnings_enabled or
from subprocess import Popen, TimeoutExpired
psi_support = os.path.exists(psi_path)
# Get KiB levels if it's possible.
soft_threshold_min_swap_tuple = get_swap_threshold_tuple(
@@ -3747,17 +3764,11 @@ threshold = None
mem_info = None
CHECK_PSI = False
if psi_support and not ignore_psi:
CHECK_PSI = True
psi_kill_exceeded_timer = psi_term_exceeded_timer = -0.0001
psi_t0 = monotonic()
psi_threshold = zram_threshold = zram_info = psi_info = None
CHECK_ZRAM = not ignore_zram
log('Monitoring has started!')
stdout.flush()
@@ -3819,7 +3830,7 @@ while True:
(masf_threshold, masf_info, mem_available, hard_threshold_min_swap_kb,
soft_threshold_min_swap_kb, swap_free, swap_total) = check_mem_swap_ex()
if CHECK_ZRAM:
if zram_checking_enabled:
zram_threshold, zram_info, mem_used_zram = check_zram_ex()
if CHECK_PSI:
@@ -3832,7 +3843,6 @@ while True:
if CHECK_PSI:
psi_avg_value = find_psi_metrics_value(psi_path, psi_metrics)
# print(psi_avg_value)
if monotonic() - psi_t0 >= psi_post_action_delay:
psi_post_action_delay_exceeded = True
else: