From 75f05959fc7b2e801b90d8cba0eeafa041fd7fd0 Mon Sep 17 00:00:00 2001 From: Alexey Avramov Date: Thu, 18 Jul 2019 22:57:41 +0900 Subject: [PATCH] fix alerts --- nohang | 51 ++++++++++++++++++++++-------------- nohang_notify_helper | 2 +- oom-sort | 6 ++--- psi-monitor | 31 ++++++++++------------ psi-top | 62 ++++++++++++-------------------------------- 5 files changed, 66 insertions(+), 86 deletions(-) diff --git a/nohang b/nohang index 97351e2..23ab5bc 100755 --- a/nohang +++ b/nohang @@ -344,7 +344,7 @@ def log(*msg): sleep(0.01) if separate_log: try: - info(*msg) + logging.info(*msg) except OSError: sleep(0.01) @@ -835,16 +835,22 @@ def check_zram(): return (mem_used_total_sum + disksize_sum * ZRAM_DISKSIZE_FACTOR) / 1024.0 + + def format_time(t): """ """ + t = int(t) + if t < 60: return '{} sec'.format(t) - elif t >= 60 and t < 3600: + + elif (t < 3600 and t >= 60): m = t // 60 s = t % 60 return '{} min {} sec'.format(m, s) + else: h = t // 3600 s0 = t - h * 3600 @@ -853,6 +859,8 @@ def format_time(t): return '{} h {} min {} sec'.format(h, m, s) + + def string_to_float_convert_test(string): """Try to interprete string values as floats.""" try: @@ -1002,11 +1010,6 @@ def send_notify(threshold, name, pid): pid: str process pid """ - # wait for memory release after corrective action - # may be useful if free memory was about 0 immediately after - # corrective action - sleep(0.05) - title = 'Freeze prevention' body = '{} [{}] {}'.format( notify_sig_dict[threshold], @@ -1237,9 +1240,14 @@ def find_victim_info(pid, victim_badness, name): state = line.split('\t')[1].rstrip() continue + + """ if n is ppid_index: - ppid = line.split('\t')[1] + # ppid = line.split('\t')[1] continue + """ + + if n is uid_index: uid = line.split('\t')[2] @@ -1299,8 +1307,13 @@ def find_victim_info(pid, victim_badness, name): if i is state_index: state = f_list[i].split('\t')[1].rstrip() + + """ if i is ppid_index: - ppid = f_list[i].split('\t')[1] + pass + # ppid = f_list[i].split('\t')[1] + """ + if i is uid_index: uid = f_list[i].split('\t')[2] @@ -1901,10 +1914,14 @@ def implement_corrective_action( pid)).replace('$SERVICE', service) exit_status = exe(cmd) + """ if exit_status == 0: success = True else: success = False + """ + + response_time = time() - time0 @@ -1937,18 +1954,16 @@ def implement_corrective_action( except FileNotFoundError: vwd = True - success = False + # success = False response_time = time() - time0 - send_result = 'no such process; response time: {} ms'.format( - round(response_time * 1000)) + # send_result = 'no such process; response time: {} ms'.format(round(response_time * 1000)) key = 'The victim died in the search process: ' \ 'FileNotFoundError' except ProcessLookupError: vwd = True - success = False + # success = False response_time = time() - time0 - send_result = 'no such process; response time: {} ms'.format( - round(response_time * 1000)) + # send_result = 'no such process; response time: {} ms'.format(round(response_time * 1000)) key = 'The victim died in the search process: ' \ 'ProcessLookupError' @@ -1974,7 +1989,7 @@ def implement_corrective_action( # print(v_dict) - response_time = time() - time0 + # response_time = time() - time0 # log('success: ' + str(success)) # log('victim will die: ' + str(vwd)) @@ -2934,8 +2949,6 @@ separate_log = conf_parse_bool('separate_log') if separate_log: import logging - from logging import basicConfig - from logging import info log_dir = '/var/log/nohang' @@ -2957,7 +2970,7 @@ if separate_log: print('ERROR: log PermissionError') try: - basicConfig( + logging.basicConfig( filename=logfile, level=logging.INFO, format="%(asctime)s: %(message)s") diff --git a/nohang_notify_helper b/nohang_notify_helper index 46a6796..492d841 100755 --- a/nohang_notify_helper +++ b/nohang_notify_helper @@ -119,7 +119,7 @@ except Exception: try: - from os import listdir, path, remove + from os import listdir, path from subprocess import Popen, TimeoutExpired from sys import argv except OSError: diff --git a/oom-sort b/oom-sort index 4703fbd..3e9ffb1 100755 --- a/oom-sort +++ b/oom-sort @@ -45,13 +45,13 @@ def pid_to_status_units(pid): if i is 1: name = f_list[0].split('\t')[1] - if i is uid_index: + if i == uid_index: uid = f_list[i].split('\t')[2] - if i is vm_rss_index: + if i == vm_rss_index: vm_rss = f_list[i].split('\t')[1][:-3] - if i is vm_swap_index: + if i == vm_swap_index: vm_swap = f_list[i].split('\t')[1][:-3] return name, uid, vm_rss, vm_swap diff --git a/psi-monitor b/psi-monitor index 70b4c42..8491215 100755 --- a/psi-monitor +++ b/psi-monitor @@ -3,19 +3,7 @@ from ctypes import CDLL from time import sleep from sys import argv - -""" - Execute the command - find /sys/fs/cgroup -name memory.pressure - to find available memory.pressue files (except /proc/pressure/memory). - (actual for cgroup2) -""" - -if len(argv) > 1: - psi_path = argv[1] -else: - psi_path = '/proc/pressure/memory' - +import os def mlockall(): @@ -33,16 +21,13 @@ def mlockall(): MCL_CURRENT | MCL_FUTURE ) if result != 0: - print('WARNING: cannot lock all memory') + pass else: pass else: pass -mlockall() - - def psi_path_to_metrics(psi_path): with open(psi_path) as f: @@ -62,8 +47,18 @@ def psi_path_to_metrics(psi_path): full_avg10, full_avg60, full_avg300) -print('Path to PSI file: {}\n'.format(psi_path)) +if len(argv) > 1: + psi_path = argv[1] +else: + psi_path = '/proc/pressure/memory' +if not os.path.exists(psi_path): + print('PSI path does not exist. Exit.') + exit() + +mlockall() + +print('Path to PSI file: {}\n'.format(psi_path)) print(' avg10 avg60 avg300 avg10 avg60 avg300') diff --git a/psi-top b/psi-top index d4b6f52..5f6119a 100755 --- a/psi-top +++ b/psi-top @@ -1,45 +1,9 @@ #!/usr/bin/env python3 -from ctypes import CDLL -from time import sleep, time import os -""" - Execute the command - find /sys/fs/cgroup -name memory.pressure - to find available memory.pressue files (except /proc/pressure/memory). - (actual for cgroup2) -""" - psi_path = '/proc/pressure/memory' -def mlockall(): - - MCL_CURRENT = 1 - MCL_FUTURE = 2 - MCL_ONFAULT = 4 - - libc = CDLL('libc.so.6', use_errno=True) - - result = libc.mlockall( - MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT - ) - if result != 0: - result = libc.mlockall( - MCL_CURRENT | MCL_FUTURE - ) - if result != 0: - print('WARNING: cannot lock all memory') - else: - pass - else: - pass - - -mlockall() - -t0 = time() - def psi_path_to_metrics(psi_path): with open(psi_path) as f: @@ -59,7 +23,6 @@ def psi_path_to_metrics(psi_path): full_avg10, full_avg60, full_avg300) - def cgroup2_root(): """ """ @@ -93,7 +56,19 @@ def psi_path_to_cgroup2(path): i = cgroup2_root() -print('cgroup2 root dir:', i) +if i is None: + print('cgroup2 not mounted') +else: + print('cgroup2 root dir:', i) + + +psi_support = os.path.exists(psi_path) + +if not psi_support: + print('PSI is not supported, /proc/pressure/memory does not exist. Exit.') + exit(1) + + if i is not None: y = get_psi_mem_files(i) for path in y: @@ -105,14 +80,16 @@ print(' avg10 avg60 avg300 avg10 avg60 avg300 cgroup2') print(' ----- ----- ------ ----- ----- ------ ---------') -(some_avg10, some_avg60, some_avg300, full_avg10, full_avg60, full_avg300) = psi_path_to_metrics('/proc/pressure/memory') +(some_avg10, some_avg60, some_avg300, full_avg10, full_avg60, full_avg300 + ) = psi_path_to_metrics('/proc/pressure/memory') + print('some {} {} {} | full {} {} {} {}'.format( some_avg10.rjust(6), some_avg60.rjust(6), some_avg300.rjust(6), full_avg10.rjust(6), full_avg60.rjust(6), - full_avg300.rjust(6), '[SYSTEM]')) + full_avg300.rjust(6), '[SYSTEM_WIDE]')) for psi_path in path_list: @@ -126,8 +103,3 @@ for psi_path in path_list: full_avg10.rjust(6), full_avg60.rjust(6), full_avg300.rjust(6), psi_path_to_cgroup2(psi_path))) - - -print(time() - t0) - -