Added the ability to log into a separate file

2019-03-18 13:21:10 +09:00 · 2019-03-18 13:21:10 +09:00 · 71dcb30cfb
commit 71dcb30cfb
parent 6e7eea2da7
3 changed files with 66 additions and 49 deletions
--- a/README.md
+++ b/README.md
@ -51,8 +51,8 @@ The tools listed above may work at the same time on one computer.
    - Notification of corrective actions taken and displaying the name and PID of the victim
    - Low memory warnings (displays available memory)
 - `zram` support (`mem_used_total` as a trigger)
- Initial [PSI](https://lwn.net/Articles/759658/) support (since Linux 4.20+, using `/proc/pressure/memory` and `some avg10` as a trigger)
+- Initial [PSI](https://lwn.net/Articles/759658/) support
- Convenient configuration with a ~~well~~ commented [config file](https://github.com/hakavlad/nohang/blob/master/nohang.conf)
+- Easy configuration with a ~~well~~ commented [config file](https://github.com/hakavlad/nohang/blob/master/nohang.conf)
 ## Requirements
@ -186,15 +186,7 @@ See also `man journalctl`.
 ## Known problems
- Awful documentation (the problem will be solved gradually in the next releases)
+- Awful documentation.
 - It is written in Python and is actually a prototype (although the algorithm may be good)
 - No tests (by itself this does not make the algorithm bad)
 ## Todo
 - Make installer for non-systemd users
 - Deb/rpm packaging
 - Rewrite all code in Golang with tests and good documentation.
 ## Nohang don't help you
@ -218,6 +210,7 @@ Please create [issues](https://github.com/hakavlad/nohang/issues). Use cases, fe
        - [x] Add delta memory info (the rate of change of available memory)
        - [x] Print statistics on corrective actions after each corrective action
        - [x] Added ability to print a process table before each corrective action
    -     [x] Added the ability to log into a separate file
    - [x] Improve poll rate algorithm
    - [x] Add `max_post_sigterm_victim_lifetime` option: send SIGKILL to the victim if it doesn't respond to SIGTERM for a certain time
    - [x] Improve victim search algorithm (do it ~30% faster) ([rfjakob/earlyoom#114](https://github.com/rfjakob/earlyoom/issues/114))
--- a/86
+++ b/86
@ -9,6 +9,36 @@ from sys import stdout, stderr, argv, exit
 from signal import SIGKILL, SIGTERM
 import sys
 import logging
 from logging import basicConfig
 from logging import info
 logfile = '/var/log/nohang/nohang.log'
 basicConfig(filename=logfile,
            level=logging.INFO,
            format="%(asctime)s: %(message)s")
 separate_log = True
 def log(msg):
    print(msg)
    if separate_log:
        info(msg)
 start_time = time()
@ -43,7 +73,6 @@ wait_time = 10
 notify_helper_path = '/usr/sbin/nohang_notify_helper'
 HR = ''
 victim_dict = dict()
@ -203,8 +232,8 @@ def update_stat_dict_and_print(key):
    if print_total_stat:
-        stats_msg = '{}\nTotal stat (what happened in the last {}):'.format(
+        stats_msg = 'Total stat (what happened in the last {}):'.format(
-            HR, format_time(time() - start_time))
+            format_time(time() - start_time))
        for i in stat_dict:
            stats_msg += '\n  {}: {}'.format(i, stat_dict[i])
@ -775,9 +804,9 @@ def find_victim():
    pid_badness_list = []
    if print_proc_table:
-        print('===============================================================================')
+        log('===============================================================================')
-        print('    PID  badness  Name                  eUID  cmdline')
+        log('    PID  badness  Name                  eUID  cmdline')
-        print('-------  -------  --------------- ----------  ---------------------------------')
+        log('-------  -------  --------------- ----------  ---------------------------------')
    for pid in pid_list:
@ -786,7 +815,7 @@ def find_victim():
            continue
        if print_proc_table:
-            print('{}  {}  {} {}  {}'.format(
+            log('{}  {}  {} {}  {}'.format(
                pid.rjust(7),
                str(badness).rjust(7),
                pid_to_name(pid).ljust(15),
@ -813,10 +842,10 @@ def find_victim():
    victim_name = pid_to_name(pid)
    if print_proc_table:
-        print('===============================================================================')
+        log('===============================================================================')
-    print(
+    log(
-        '\nProcess with highest badness (found in {} ms):\n  PID: {}, Name: {}, badness: {}'.format(
+        'Process with highest badness (found in {} ms):\n  PID: {}, Name: {}, badness: {}'.format(
            round((time() - ft1) * 1000),
            pid,
            victim_name,
@ -987,7 +1016,7 @@ def find_victim_info(pid, victim_badness, name):
    victim_lifetime = format_time(uptime() - pid_to_starttime(pid))
-    victim_info = '\nVictim information (found in {} ms):' \
+    victim_info = 'Victim information (found in {} ms):' \
        '\n  Name:     {}' \
        '\n  State:    {}' \
        '\n  PID:      {}' \
@ -1027,7 +1056,7 @@ def implement_corrective_action(signal):
    Find victim with highest badness and send SIGTERM/SIGKILL
    """
-    print(mem_info)
+    log(mem_info)
    pid, victim_badness, name = find_victim()
@ -1035,7 +1064,7 @@ def implement_corrective_action(signal):
        if print_victim_info:
            victim_info = find_victim_info(pid, victim_badness, name)
-            print(victim_info)
+            log(victim_info)
        # kill the victim if it doesn't respond to SIGTERM
        if signal is SIGTERM:
@ -1059,7 +1088,7 @@ def implement_corrective_action(signal):
            m = check_mem_and_swap()
            ma = round(int(m[0]) / 1024.0)
            sf = round(int(m[2]) / 1024.0)
-            print('\nMemory status before implementing a corrective action:\n  MemAvailable'
+            log('Memory status before implementing a corrective action:\n  MemAvailable'
                  ': {} MiB, SwapFree: {} MiB'.format(ma, sf))
            exit_status = os.system(etc_dict[name].replace(
@ -1072,7 +1101,7 @@ def implement_corrective_action(signal):
            response_time = time() - time0
-            etc_info = '\nImplement a corrective action:\n  Run the command: {}' \
+            etc_info = 'Implement a corrective action:\n  Run the command: {}' \
                '\n  Exit status: {}; total response time: {} ms'.format(
                    command.replace(
                        '$PID', pid).replace(
@ -1098,7 +1127,7 @@ def implement_corrective_action(signal):
                m = check_mem_and_swap()
                ma = round(int(m[0]) / 1024.0)
                sf = round(int(m[2]) / 1024.0)
-                print('\nMemory status before implementing a corrective action:\n  MemAvailable'
+                log('Memory status before implementing a corrective action:\n  MemAvailable'
                      ': {} MiB, SwapFree: {} MiB'.format(ma, sf))
                os.kill(int(pid), signal)
@ -1106,7 +1135,7 @@ def implement_corrective_action(signal):
                send_result = 'total response time: {} ms'.format(
                    round(response_time * 1000))
-                preventing_oom_message = '\nImplement a corrective action:' \
+                preventing_oom_message = 'Implement a corrective action:' \
                    '\n  Send {} to the victim; {}'.format(
                        sig_dict[signal], send_result)
@ -1135,7 +1164,7 @@ def implement_corrective_action(signal):
                    round(response_time * 1000))
                key = 'ProcessLookupError (the victim died in the search process): '
-            print(preventing_oom_message)
+            log(preventing_oom_message)
            update_stat_dict_and_print(key)
@ -1154,7 +1183,6 @@ def implement_corrective_action(signal):
        key = 'victim badness < min_badness'
        update_stat_dict_and_print(key)
    print('###############################################################################')
    sleep_after_send_signal(signal)
@ -1987,18 +2015,18 @@ while True:
        avg10 = find_psi_metrics_value(psi_path, psi_metrics)
        if print_mem_check_results:
-            avg_value = 'PSI value: {} | '.format(str(avg10).rjust(6))
+            avg_value = 'PSI avg value: {} | '.format(str(avg10).rjust(6))
        if avg10 >= sigkill_psi and time() - psi_t0 >= psi_avg10_sleep_time:
            time0 = time()
-            mem_info = 'PSI value ({}) > sigkill_psi ({})'.format(avg10, sigkill_psi)
+            mem_info = 'PSI avg value ({}) > sigkill_psi ({})'.format(avg10, sigkill_psi)
            implement_corrective_action(SIGKILL)
            psi_t0 = time()
            continue
        if avg10 >= sigterm_psi and time() - psi_t0 >= psi_avg10_sleep_time:
            time0 = time()
-            mem_info = 'PSI value ({}) > sigterm_psi ({})'.format(avg10, sigterm_psi)
+            mem_info = 'PSI avg value ({}) > sigterm_psi ({})'.format(avg10, sigterm_psi)
            implement_corrective_action(SIGTERM)
            psi_t0 = time()
            continue
@ -2058,7 +2086,7 @@ while True:
            # Output avialable mem sizes
            if swap_total == 0 and mem_used_zram == 0:
-                print('{}MemAvail: {} M, {} %{}'.format(
+                log('{}MemAvail: {} M, {} %{}'.format(
                    avg_value,
                    human(mem_available, mem_len),
                    just_percent_mem(mem_available / mem_total),
@ -2113,11 +2141,10 @@ while True:
            swap_free <= swap_min_sigkill_kb):
        time0 = time()
-        mem_info = '{}\nSIGKILL threshold exeeded\nMemory status that requires corrective actions:' \
+        mem_info = 'Hard threshold exeeded\nMemory status that requires corrective actions:' \
            '\n  MemAvailable [{} MiB, {} %] <= mem_min_sig' \
            'kill [{} MiB, {} %]\n  SwapFree [{} MiB, {} %] <= swa' \
            'p_min_sigkill [{} MiB, {} %]'.format(
                HR,
                kib_to_mib(mem_available),
                percent(mem_available / mem_total),
                kib_to_mib(mem_min_sigkill_kb),
@ -2135,10 +2162,9 @@ while True:
    if mem_used_zram >= zram_max_sigkill_kb:
        time0 = time()
-        mem_info = '{}\nSIGKILL threshold exeeded\nMemory status that requires corrective actions:' \
+        mem_info = 'Hard threshold exeeded\nMemory status that requires corrective actions:' \
            '\n  MemUsedZram [{} MiB, {} %] >= zram_max_sig' \
            'kill [{} MiB, {} %]'.format(
                HR,
                kib_to_mib(mem_used_zram),
                percent(mem_used_zram / mem_total),
                kib_to_mib(zram_max_sigkill_kb),
@ -2154,11 +2180,10 @@ while True:
        time0 = time()
-        mem_info = '{}\nSIGTERM threshold exeeded\nMemory status that requires corrective actions:' \
+        mem_info = 'Soft threshold exeeded\nMemory status that requires corrective actions:' \
            '\n  MemAvailable [{} MiB, {} %] <= mem_min_sig' \
            'term [{} MiB, {} %]\n  SwapFree [{} MiB, {} %] <= swa' \
            'p_min_sigterm [{} MiB, {} %]'.format(
                HR,
                kib_to_mib(mem_available),
                percent(mem_available / mem_total),
                kib_to_mib(mem_min_sigterm_kb),
@ -2178,10 +2203,9 @@ while True:
    if mem_used_zram >= zram_max_sigterm_kb:
        time0 = time()
-        mem_info = '{}\nSIGTERM threshold exeeded\nMemory status that requires corrective actions:' \
+        mem_info = 'Soft threshold exeeded\nMemory status that requires corrective actions:' \
            '\n  MemUsedZram [{} MiB, {} %] >= ' \
            'zram_max_sigterm [{} M, {} %]'.format(
                HR,
                kib_to_mib(mem_used_zram),
                percent(mem_used_zram / mem_total),
                kib_to_mib(zram_max_sigterm_kb),
--- a/nohang.conf
+++ b/nohang.conf
@ -65,16 +65,16 @@ zram_max_sigkill = 55 %
    Response on PSI memory some/full avg10/avg60/avg300 value
    (/proc/pressure/memory on systems with Linux 4.20+).
-ignore_psi = False
+ignore_psi = True
    Choose path to PSI file.
    psi_path = /sys/fs/cgroup/unified/user.slice/memory.pressure
    psi_path = /sys/fs/cgroup/unified/system.slice/memory.pressure
    psi_path = /sys/fs/cgroup/unified/system.slice/foo.service/memory.pressure
-    psi_path = /proc/pressure/memory
+    psi_path = ./psi_dummy
-psi_path = ./psi_dummy
+psi_path = /proc/pressure/memory
 psi_metrics = some_avg10
@ -323,7 +323,7 @@ zram_max_warnings = 40 %
 #####################################################################
-    7. Output verbosity
+    7. Verbosity
    Display the configuration when the program starts.
    Valid values are True and False.
@ -333,13 +333,13 @@ print_config = False
    Print memory check results.
    Valid values are True and False.
-print_mem_check_results = True
+print_mem_check_results = False
    Минимальная периодичность печати состояния памяти.
    0 - печатать все проверки памяти.
    Неотрицательное число.
-min_mem_report_interval = 1
+min_mem_report_interval = 10
    Print sleep periods between memory checks.
    Valid values are True and False.
@ -372,7 +372,7 @@ max_ancestry_depth = 3
    которого жертва получит SIGKILL.
    Неотрицательные числа.
-max_post_sigterm_victim_lifetime = 9
+max_post_sigterm_victim_lifetime = 10
    Выполнить произвольную команду после SIGKILL.
    Пустая строка - ничего не выполнять.