From 71dcb30cfb5a13447e8f0eae1a336c43d40eb482 Mon Sep 17 00:00:00 2001
From: Alexey Avramov <hakavlad@gmail.com>
Date: Mon, 18 Mar 2019 13:21:10 +0900
Subject: [PATCH] Added the ability to log into a separate file

---
 README.md   | 15 +++-------
 nohang      | 86 ++++++++++++++++++++++++++++++++++-------------------
 nohang.conf | 14 ++++-----
 3 files changed, 66 insertions(+), 49 deletions(-)

diff --git a/README.md b/README.md
index 6b56bb4..4c3d998 100644
--- a/README.md
+++ b/README.md
@@ -51,8 +51,8 @@ The tools listed above may work at the same time on one computer.
     - Notification of corrective actions taken and displaying the name and PID of the victim
     - Low memory warnings (displays available memory)
 - `zram` support (`mem_used_total` as a trigger)
-- Initial [PSI](https://lwn.net/Articles/759658/) support (since Linux 4.20+, using `/proc/pressure/memory` and `some avg10` as a trigger)
-- Convenient configuration with a ~~well~~ commented [config file](https://github.com/hakavlad/nohang/blob/master/nohang.conf)
+- Initial [PSI](https://lwn.net/Articles/759658/) support
+- Easy configuration with a ~~well~~ commented [config file](https://github.com/hakavlad/nohang/blob/master/nohang.conf)
 
 ## Requirements
 
@@ -186,15 +186,7 @@ See also `man journalctl`.
 
 ## Known problems
 
-- Awful documentation (the problem will be solved gradually in the next releases)
-- It is written in Python and is actually a prototype (although the algorithm may be good)
-- No tests (by itself this does not make the algorithm bad)
-
-## Todo
-
-- Make installer for non-systemd users
-- Deb/rpm packaging
-- Rewrite all code in Golang with tests and good documentation.
+- Awful documentation.
 
 ## Nohang don't help you
 
@@ -218,6 +210,7 @@ Please create [issues](https://github.com/hakavlad/nohang/issues). Use cases, fe
         - [x] Add delta memory info (the rate of change of available memory)
         - [x] Print statistics on corrective actions after each corrective action
         - [x] Added ability to print a process table before each corrective action
+    -     [x] Added the ability to log into a separate file
     - [x] Improve poll rate algorithm
     - [x] Add `max_post_sigterm_victim_lifetime` option: send SIGKILL to the victim if it doesn't respond to SIGTERM for a certain time
     - [x] Improve victim search algorithm (do it ~30% faster) ([rfjakob/earlyoom#114](https://github.com/rfjakob/earlyoom/issues/114))
diff --git a/nohang b/nohang
index 881bae3..6c4f6e9 100755
--- a/nohang
+++ b/nohang
@@ -9,6 +9,36 @@ from sys import stdout, stderr, argv, exit
 from signal import SIGKILL, SIGTERM
 import sys
 
+import logging
+from logging import basicConfig
+from logging import info
+
+
+logfile = '/var/log/nohang/nohang.log'
+
+
+basicConfig(filename=logfile,
+            level=logging.INFO,
+            format="%(asctime)s: %(message)s")
+
+
+separate_log = True
+
+
+
+def log(msg):
+    print(msg)
+    if separate_log:
+        info(msg)
+
+
+
+
+
+
+
+
+
 
 start_time = time()
 
@@ -43,7 +73,6 @@ wait_time = 10
 notify_helper_path = '/usr/sbin/nohang_notify_helper'
 
 
-HR = ''
 
 victim_dict = dict()
 
@@ -203,8 +232,8 @@ def update_stat_dict_and_print(key):
 
     if print_total_stat:
 
-        stats_msg = '{}\nTotal stat (what happened in the last {}):'.format(
-            HR, format_time(time() - start_time))
+        stats_msg = 'Total stat (what happened in the last {}):'.format(
+            format_time(time() - start_time))
 
         for i in stat_dict:
             stats_msg += '\n  {}: {}'.format(i, stat_dict[i])
@@ -775,9 +804,9 @@ def find_victim():
     pid_badness_list = []
 
     if print_proc_table:
-        print('===============================================================================')
-        print('    PID  badness  Name                  eUID  cmdline')
-        print('-------  -------  --------------- ----------  ---------------------------------')
+        log('===============================================================================')
+        log('    PID  badness  Name                  eUID  cmdline')
+        log('-------  -------  --------------- ----------  ---------------------------------')
 
     for pid in pid_list:
 
@@ -786,7 +815,7 @@ def find_victim():
             continue
 
         if print_proc_table:
-            print('{}  {}  {} {}  {}'.format(
+            log('{}  {}  {} {}  {}'.format(
                 pid.rjust(7),
                 str(badness).rjust(7),
                 pid_to_name(pid).ljust(15),
@@ -813,10 +842,10 @@ def find_victim():
     victim_name = pid_to_name(pid)
 
     if print_proc_table:
-        print('===============================================================================')
+        log('===============================================================================')
 
-    print(
-        '\nProcess with highest badness (found in {} ms):\n  PID: {}, Name: {}, badness: {}'.format(
+    log(
+        'Process with highest badness (found in {} ms):\n  PID: {}, Name: {}, badness: {}'.format(
             round((time() - ft1) * 1000),
             pid,
             victim_name,
@@ -987,7 +1016,7 @@ def find_victim_info(pid, victim_badness, name):
 
     victim_lifetime = format_time(uptime() - pid_to_starttime(pid))
 
-    victim_info = '\nVictim information (found in {} ms):' \
+    victim_info = 'Victim information (found in {} ms):' \
         '\n  Name:     {}' \
         '\n  State:    {}' \
         '\n  PID:      {}' \
@@ -1027,7 +1056,7 @@ def implement_corrective_action(signal):
     Find victim with highest badness and send SIGTERM/SIGKILL
     """
 
-    print(mem_info)
+    log(mem_info)
 
     pid, victim_badness, name = find_victim()
 
@@ -1035,7 +1064,7 @@ def implement_corrective_action(signal):
 
         if print_victim_info:
             victim_info = find_victim_info(pid, victim_badness, name)
-            print(victim_info)
+            log(victim_info)
 
         # kill the victim if it doesn't respond to SIGTERM
         if signal is SIGTERM:
@@ -1059,7 +1088,7 @@ def implement_corrective_action(signal):
             m = check_mem_and_swap()
             ma = round(int(m[0]) / 1024.0)
             sf = round(int(m[2]) / 1024.0)
-            print('\nMemory status before implementing a corrective action:\n  MemAvailable'
+            log('Memory status before implementing a corrective action:\n  MemAvailable'
                   ': {} MiB, SwapFree: {} MiB'.format(ma, sf))
 
             exit_status = os.system(etc_dict[name].replace(
@@ -1072,7 +1101,7 @@ def implement_corrective_action(signal):
 
             response_time = time() - time0
 
-            etc_info = '\nImplement a corrective action:\n  Run the command: {}' \
+            etc_info = 'Implement a corrective action:\n  Run the command: {}' \
                 '\n  Exit status: {}; total response time: {} ms'.format(
                     command.replace(
                         '$PID', pid).replace(
@@ -1098,7 +1127,7 @@ def implement_corrective_action(signal):
                 m = check_mem_and_swap()
                 ma = round(int(m[0]) / 1024.0)
                 sf = round(int(m[2]) / 1024.0)
-                print('\nMemory status before implementing a corrective action:\n  MemAvailable'
+                log('Memory status before implementing a corrective action:\n  MemAvailable'
                       ': {} MiB, SwapFree: {} MiB'.format(ma, sf))
 
                 os.kill(int(pid), signal)
@@ -1106,7 +1135,7 @@ def implement_corrective_action(signal):
                 send_result = 'total response time: {} ms'.format(
                     round(response_time * 1000))
 
-                preventing_oom_message = '\nImplement a corrective action:' \
+                preventing_oom_message = 'Implement a corrective action:' \
                     '\n  Send {} to the victim; {}'.format(
                         sig_dict[signal], send_result)
 
@@ -1135,7 +1164,7 @@ def implement_corrective_action(signal):
                     round(response_time * 1000))
                 key = 'ProcessLookupError (the victim died in the search process): '
 
-            print(preventing_oom_message)
+            log(preventing_oom_message)
 
             update_stat_dict_and_print(key)
 
@@ -1154,7 +1183,6 @@ def implement_corrective_action(signal):
         key = 'victim badness < min_badness'
         update_stat_dict_and_print(key)
 
-    print('###############################################################################')
 
     sleep_after_send_signal(signal)
 
@@ -1987,18 +2015,18 @@ while True:
         avg10 = find_psi_metrics_value(psi_path, psi_metrics)
 
         if print_mem_check_results:
-            avg_value = 'PSI value: {} | '.format(str(avg10).rjust(6))
+            avg_value = 'PSI avg value: {} | '.format(str(avg10).rjust(6))
 
         if avg10 >= sigkill_psi and time() - psi_t0 >= psi_avg10_sleep_time:
             time0 = time()
-            mem_info = 'PSI value ({}) > sigkill_psi ({})'.format(avg10, sigkill_psi)
+            mem_info = 'PSI avg value ({}) > sigkill_psi ({})'.format(avg10, sigkill_psi)
             implement_corrective_action(SIGKILL)
             psi_t0 = time()
             continue
 
         if avg10 >= sigterm_psi and time() - psi_t0 >= psi_avg10_sleep_time:
             time0 = time()
-            mem_info = 'PSI value ({}) > sigterm_psi ({})'.format(avg10, sigterm_psi)
+            mem_info = 'PSI avg value ({}) > sigterm_psi ({})'.format(avg10, sigterm_psi)
             implement_corrective_action(SIGTERM)
             psi_t0 = time()
             continue
@@ -2058,7 +2086,7 @@ while True:
 
             # Output avialable mem sizes
             if swap_total == 0 and mem_used_zram == 0:
-                print('{}MemAvail: {} M, {} %{}'.format(
+                log('{}MemAvail: {} M, {} %{}'.format(
                     avg_value,
                     human(mem_available, mem_len),
                     just_percent_mem(mem_available / mem_total),
@@ -2113,11 +2141,10 @@ while True:
             swap_free <= swap_min_sigkill_kb):
         time0 = time()
 
-        mem_info = '{}\nSIGKILL threshold exeeded\nMemory status that requires corrective actions:' \
+        mem_info = 'Hard threshold exeeded\nMemory status that requires corrective actions:' \
             '\n  MemAvailable [{} MiB, {} %] <= mem_min_sig' \
             'kill [{} MiB, {} %]\n  SwapFree [{} MiB, {} %] <= swa' \
             'p_min_sigkill [{} MiB, {} %]'.format(
-                HR,
                 kib_to_mib(mem_available),
                 percent(mem_available / mem_total),
                 kib_to_mib(mem_min_sigkill_kb),
@@ -2135,10 +2162,9 @@ while True:
     if mem_used_zram >= zram_max_sigkill_kb:
         time0 = time()
 
-        mem_info = '{}\nSIGKILL threshold exeeded\nMemory status that requires corrective actions:' \
+        mem_info = 'Hard threshold exeeded\nMemory status that requires corrective actions:' \
             '\n  MemUsedZram [{} MiB, {} %] >= zram_max_sig' \
             'kill [{} MiB, {} %]'.format(
-                HR,
                 kib_to_mib(mem_used_zram),
                 percent(mem_used_zram / mem_total),
                 kib_to_mib(zram_max_sigkill_kb),
@@ -2154,11 +2180,10 @@ while True:
 
         time0 = time()
 
-        mem_info = '{}\nSIGTERM threshold exeeded\nMemory status that requires corrective actions:' \
+        mem_info = 'Soft threshold exeeded\nMemory status that requires corrective actions:' \
             '\n  MemAvailable [{} MiB, {} %] <= mem_min_sig' \
             'term [{} MiB, {} %]\n  SwapFree [{} MiB, {} %] <= swa' \
             'p_min_sigterm [{} MiB, {} %]'.format(
-                HR,
                 kib_to_mib(mem_available),
                 percent(mem_available / mem_total),
                 kib_to_mib(mem_min_sigterm_kb),
@@ -2178,10 +2203,9 @@ while True:
     if mem_used_zram >= zram_max_sigterm_kb:
         time0 = time()
 
-        mem_info = '{}\nSIGTERM threshold exeeded\nMemory status that requires corrective actions:' \
+        mem_info = 'Soft threshold exeeded\nMemory status that requires corrective actions:' \
             '\n  MemUsedZram [{} MiB, {} %] >= ' \
             'zram_max_sigterm [{} M, {} %]'.format(
-                HR,
                 kib_to_mib(mem_used_zram),
                 percent(mem_used_zram / mem_total),
                 kib_to_mib(zram_max_sigterm_kb),
diff --git a/nohang.conf b/nohang.conf
index 90d6caa..556a7d9 100644
--- a/nohang.conf
+++ b/nohang.conf
@@ -65,16 +65,16 @@ zram_max_sigkill = 55 %
     Response on PSI memory some/full avg10/avg60/avg300 value
     (/proc/pressure/memory on systems with Linux 4.20+).
 
-ignore_psi = False
+ignore_psi = True
 
     Choose path to PSI file.
 
     psi_path = /sys/fs/cgroup/unified/user.slice/memory.pressure
     psi_path = /sys/fs/cgroup/unified/system.slice/memory.pressure
     psi_path = /sys/fs/cgroup/unified/system.slice/foo.service/memory.pressure
-    psi_path = /proc/pressure/memory
+    psi_path = ./psi_dummy
 
-psi_path = ./psi_dummy
+psi_path = /proc/pressure/memory
 
 psi_metrics = some_avg10
 
@@ -323,7 +323,7 @@ zram_max_warnings = 40 %
 
 #####################################################################
 
-    7. Output verbosity
+    7. Verbosity
 
     Display the configuration when the program starts.
     Valid values are True and False.
@@ -333,13 +333,13 @@ print_config = False
     Print memory check results.
     Valid values are True and False.
 
-print_mem_check_results = True
+print_mem_check_results = False
 
     Минимальная периодичность печати состояния памяти.
     0 - печатать все проверки памяти.
     Неотрицательное число.
 
-min_mem_report_interval = 1
+min_mem_report_interval = 10
 
     Print sleep periods between memory checks.
     Valid values are True and False.
@@ -372,7 +372,7 @@ max_ancestry_depth = 3
     которого жертва получит SIGKILL.
     Неотрицательные числа.
 
-max_post_sigterm_victim_lifetime = 9
+max_post_sigterm_victim_lifetime = 10
 
     Выполнить произвольную команду после SIGKILL.
     Пустая строка - ничего не выполнять.