Added the ability to log into a separate file

This commit is contained in:
Alexey Avramov 2019-03-18 13:21:10 +09:00
parent 6e7eea2da7
commit 71dcb30cfb
3 changed files with 66 additions and 49 deletions

View File

@ -51,8 +51,8 @@ The tools listed above may work at the same time on one computer.
- Notification of corrective actions taken and displaying the name and PID of the victim - Notification of corrective actions taken and displaying the name and PID of the victim
- Low memory warnings (displays available memory) - Low memory warnings (displays available memory)
- `zram` support (`mem_used_total` as a trigger) - `zram` support (`mem_used_total` as a trigger)
- Initial [PSI](https://lwn.net/Articles/759658/) support (since Linux 4.20+, using `/proc/pressure/memory` and `some avg10` as a trigger) - Initial [PSI](https://lwn.net/Articles/759658/) support
- Convenient configuration with a ~~well~~ commented [config file](https://github.com/hakavlad/nohang/blob/master/nohang.conf) - Easy configuration with a ~~well~~ commented [config file](https://github.com/hakavlad/nohang/blob/master/nohang.conf)
## Requirements ## Requirements
@ -186,15 +186,7 @@ See also `man journalctl`.
## Known problems ## Known problems
- Awful documentation (the problem will be solved gradually in the next releases) - Awful documentation.
- It is written in Python and is actually a prototype (although the algorithm may be good)
- No tests (by itself this does not make the algorithm bad)
## Todo
- Make installer for non-systemd users
- Deb/rpm packaging
- Rewrite all code in Golang with tests and good documentation.
## Nohang don't help you ## Nohang don't help you
@ -218,6 +210,7 @@ Please create [issues](https://github.com/hakavlad/nohang/issues). Use cases, fe
- [x] Add delta memory info (the rate of change of available memory) - [x] Add delta memory info (the rate of change of available memory)
- [x] Print statistics on corrective actions after each corrective action - [x] Print statistics on corrective actions after each corrective action
- [x] Added ability to print a process table before each corrective action - [x] Added ability to print a process table before each corrective action
- [x] Added the ability to log into a separate file
- [x] Improve poll rate algorithm - [x] Improve poll rate algorithm
- [x] Add `max_post_sigterm_victim_lifetime` option: send SIGKILL to the victim if it doesn't respond to SIGTERM for a certain time - [x] Add `max_post_sigterm_victim_lifetime` option: send SIGKILL to the victim if it doesn't respond to SIGTERM for a certain time
- [x] Improve victim search algorithm (do it ~30% faster) ([rfjakob/earlyoom#114](https://github.com/rfjakob/earlyoom/issues/114)) - [x] Improve victim search algorithm (do it ~30% faster) ([rfjakob/earlyoom#114](https://github.com/rfjakob/earlyoom/issues/114))

86
nohang
View File

@ -9,6 +9,36 @@ from sys import stdout, stderr, argv, exit
from signal import SIGKILL, SIGTERM from signal import SIGKILL, SIGTERM
import sys import sys
import logging
from logging import basicConfig
from logging import info
logfile = '/var/log/nohang/nohang.log'
basicConfig(filename=logfile,
level=logging.INFO,
format="%(asctime)s: %(message)s")
separate_log = True
def log(msg):
print(msg)
if separate_log:
info(msg)
start_time = time() start_time = time()
@ -43,7 +73,6 @@ wait_time = 10
notify_helper_path = '/usr/sbin/nohang_notify_helper' notify_helper_path = '/usr/sbin/nohang_notify_helper'
HR = ''
victim_dict = dict() victim_dict = dict()
@ -203,8 +232,8 @@ def update_stat_dict_and_print(key):
if print_total_stat: if print_total_stat:
stats_msg = '{}\nTotal stat (what happened in the last {}):'.format( stats_msg = 'Total stat (what happened in the last {}):'.format(
HR, format_time(time() - start_time)) format_time(time() - start_time))
for i in stat_dict: for i in stat_dict:
stats_msg += '\n {}: {}'.format(i, stat_dict[i]) stats_msg += '\n {}: {}'.format(i, stat_dict[i])
@ -775,9 +804,9 @@ def find_victim():
pid_badness_list = [] pid_badness_list = []
if print_proc_table: if print_proc_table:
print('===============================================================================') log('===============================================================================')
print(' PID badness Name eUID cmdline') log(' PID badness Name eUID cmdline')
print('------- ------- --------------- ---------- ---------------------------------') log('------- ------- --------------- ---------- ---------------------------------')
for pid in pid_list: for pid in pid_list:
@ -786,7 +815,7 @@ def find_victim():
continue continue
if print_proc_table: if print_proc_table:
print('{} {} {} {} {}'.format( log('{} {} {} {} {}'.format(
pid.rjust(7), pid.rjust(7),
str(badness).rjust(7), str(badness).rjust(7),
pid_to_name(pid).ljust(15), pid_to_name(pid).ljust(15),
@ -813,10 +842,10 @@ def find_victim():
victim_name = pid_to_name(pid) victim_name = pid_to_name(pid)
if print_proc_table: if print_proc_table:
print('===============================================================================') log('===============================================================================')
print( log(
'\nProcess with highest badness (found in {} ms):\n PID: {}, Name: {}, badness: {}'.format( 'Process with highest badness (found in {} ms):\n PID: {}, Name: {}, badness: {}'.format(
round((time() - ft1) * 1000), round((time() - ft1) * 1000),
pid, pid,
victim_name, victim_name,
@ -987,7 +1016,7 @@ def find_victim_info(pid, victim_badness, name):
victim_lifetime = format_time(uptime() - pid_to_starttime(pid)) victim_lifetime = format_time(uptime() - pid_to_starttime(pid))
victim_info = '\nVictim information (found in {} ms):' \ victim_info = 'Victim information (found in {} ms):' \
'\n Name: {}' \ '\n Name: {}' \
'\n State: {}' \ '\n State: {}' \
'\n PID: {}' \ '\n PID: {}' \
@ -1027,7 +1056,7 @@ def implement_corrective_action(signal):
Find victim with highest badness and send SIGTERM/SIGKILL Find victim with highest badness and send SIGTERM/SIGKILL
""" """
print(mem_info) log(mem_info)
pid, victim_badness, name = find_victim() pid, victim_badness, name = find_victim()
@ -1035,7 +1064,7 @@ def implement_corrective_action(signal):
if print_victim_info: if print_victim_info:
victim_info = find_victim_info(pid, victim_badness, name) victim_info = find_victim_info(pid, victim_badness, name)
print(victim_info) log(victim_info)
# kill the victim if it doesn't respond to SIGTERM # kill the victim if it doesn't respond to SIGTERM
if signal is SIGTERM: if signal is SIGTERM:
@ -1059,7 +1088,7 @@ def implement_corrective_action(signal):
m = check_mem_and_swap() m = check_mem_and_swap()
ma = round(int(m[0]) / 1024.0) ma = round(int(m[0]) / 1024.0)
sf = round(int(m[2]) / 1024.0) sf = round(int(m[2]) / 1024.0)
print('\nMemory status before implementing a corrective action:\n MemAvailable' log('Memory status before implementing a corrective action:\n MemAvailable'
': {} MiB, SwapFree: {} MiB'.format(ma, sf)) ': {} MiB, SwapFree: {} MiB'.format(ma, sf))
exit_status = os.system(etc_dict[name].replace( exit_status = os.system(etc_dict[name].replace(
@ -1072,7 +1101,7 @@ def implement_corrective_action(signal):
response_time = time() - time0 response_time = time() - time0
etc_info = '\nImplement a corrective action:\n Run the command: {}' \ etc_info = 'Implement a corrective action:\n Run the command: {}' \
'\n Exit status: {}; total response time: {} ms'.format( '\n Exit status: {}; total response time: {} ms'.format(
command.replace( command.replace(
'$PID', pid).replace( '$PID', pid).replace(
@ -1098,7 +1127,7 @@ def implement_corrective_action(signal):
m = check_mem_and_swap() m = check_mem_and_swap()
ma = round(int(m[0]) / 1024.0) ma = round(int(m[0]) / 1024.0)
sf = round(int(m[2]) / 1024.0) sf = round(int(m[2]) / 1024.0)
print('\nMemory status before implementing a corrective action:\n MemAvailable' log('Memory status before implementing a corrective action:\n MemAvailable'
': {} MiB, SwapFree: {} MiB'.format(ma, sf)) ': {} MiB, SwapFree: {} MiB'.format(ma, sf))
os.kill(int(pid), signal) os.kill(int(pid), signal)
@ -1106,7 +1135,7 @@ def implement_corrective_action(signal):
send_result = 'total response time: {} ms'.format( send_result = 'total response time: {} ms'.format(
round(response_time * 1000)) round(response_time * 1000))
preventing_oom_message = '\nImplement a corrective action:' \ preventing_oom_message = 'Implement a corrective action:' \
'\n Send {} to the victim; {}'.format( '\n Send {} to the victim; {}'.format(
sig_dict[signal], send_result) sig_dict[signal], send_result)
@ -1135,7 +1164,7 @@ def implement_corrective_action(signal):
round(response_time * 1000)) round(response_time * 1000))
key = 'ProcessLookupError (the victim died in the search process): ' key = 'ProcessLookupError (the victim died in the search process): '
print(preventing_oom_message) log(preventing_oom_message)
update_stat_dict_and_print(key) update_stat_dict_and_print(key)
@ -1154,7 +1183,6 @@ def implement_corrective_action(signal):
key = 'victim badness < min_badness' key = 'victim badness < min_badness'
update_stat_dict_and_print(key) update_stat_dict_and_print(key)
print('###############################################################################')
sleep_after_send_signal(signal) sleep_after_send_signal(signal)
@ -1987,18 +2015,18 @@ while True:
avg10 = find_psi_metrics_value(psi_path, psi_metrics) avg10 = find_psi_metrics_value(psi_path, psi_metrics)
if print_mem_check_results: if print_mem_check_results:
avg_value = 'PSI value: {} | '.format(str(avg10).rjust(6)) avg_value = 'PSI avg value: {} | '.format(str(avg10).rjust(6))
if avg10 >= sigkill_psi and time() - psi_t0 >= psi_avg10_sleep_time: if avg10 >= sigkill_psi and time() - psi_t0 >= psi_avg10_sleep_time:
time0 = time() time0 = time()
mem_info = 'PSI value ({}) > sigkill_psi ({})'.format(avg10, sigkill_psi) mem_info = 'PSI avg value ({}) > sigkill_psi ({})'.format(avg10, sigkill_psi)
implement_corrective_action(SIGKILL) implement_corrective_action(SIGKILL)
psi_t0 = time() psi_t0 = time()
continue continue
if avg10 >= sigterm_psi and time() - psi_t0 >= psi_avg10_sleep_time: if avg10 >= sigterm_psi and time() - psi_t0 >= psi_avg10_sleep_time:
time0 = time() time0 = time()
mem_info = 'PSI value ({}) > sigterm_psi ({})'.format(avg10, sigterm_psi) mem_info = 'PSI avg value ({}) > sigterm_psi ({})'.format(avg10, sigterm_psi)
implement_corrective_action(SIGTERM) implement_corrective_action(SIGTERM)
psi_t0 = time() psi_t0 = time()
continue continue
@ -2058,7 +2086,7 @@ while True:
# Output avialable mem sizes # Output avialable mem sizes
if swap_total == 0 and mem_used_zram == 0: if swap_total == 0 and mem_used_zram == 0:
print('{}MemAvail: {} M, {} %{}'.format( log('{}MemAvail: {} M, {} %{}'.format(
avg_value, avg_value,
human(mem_available, mem_len), human(mem_available, mem_len),
just_percent_mem(mem_available / mem_total), just_percent_mem(mem_available / mem_total),
@ -2113,11 +2141,10 @@ while True:
swap_free <= swap_min_sigkill_kb): swap_free <= swap_min_sigkill_kb):
time0 = time() time0 = time()
mem_info = '{}\nSIGKILL threshold exeeded\nMemory status that requires corrective actions:' \ mem_info = 'Hard threshold exeeded\nMemory status that requires corrective actions:' \
'\n MemAvailable [{} MiB, {} %] <= mem_min_sig' \ '\n MemAvailable [{} MiB, {} %] <= mem_min_sig' \
'kill [{} MiB, {} %]\n SwapFree [{} MiB, {} %] <= swa' \ 'kill [{} MiB, {} %]\n SwapFree [{} MiB, {} %] <= swa' \
'p_min_sigkill [{} MiB, {} %]'.format( 'p_min_sigkill [{} MiB, {} %]'.format(
HR,
kib_to_mib(mem_available), kib_to_mib(mem_available),
percent(mem_available / mem_total), percent(mem_available / mem_total),
kib_to_mib(mem_min_sigkill_kb), kib_to_mib(mem_min_sigkill_kb),
@ -2135,10 +2162,9 @@ while True:
if mem_used_zram >= zram_max_sigkill_kb: if mem_used_zram >= zram_max_sigkill_kb:
time0 = time() time0 = time()
mem_info = '{}\nSIGKILL threshold exeeded\nMemory status that requires corrective actions:' \ mem_info = 'Hard threshold exeeded\nMemory status that requires corrective actions:' \
'\n MemUsedZram [{} MiB, {} %] >= zram_max_sig' \ '\n MemUsedZram [{} MiB, {} %] >= zram_max_sig' \
'kill [{} MiB, {} %]'.format( 'kill [{} MiB, {} %]'.format(
HR,
kib_to_mib(mem_used_zram), kib_to_mib(mem_used_zram),
percent(mem_used_zram / mem_total), percent(mem_used_zram / mem_total),
kib_to_mib(zram_max_sigkill_kb), kib_to_mib(zram_max_sigkill_kb),
@ -2154,11 +2180,10 @@ while True:
time0 = time() time0 = time()
mem_info = '{}\nSIGTERM threshold exeeded\nMemory status that requires corrective actions:' \ mem_info = 'Soft threshold exeeded\nMemory status that requires corrective actions:' \
'\n MemAvailable [{} MiB, {} %] <= mem_min_sig' \ '\n MemAvailable [{} MiB, {} %] <= mem_min_sig' \
'term [{} MiB, {} %]\n SwapFree [{} MiB, {} %] <= swa' \ 'term [{} MiB, {} %]\n SwapFree [{} MiB, {} %] <= swa' \
'p_min_sigterm [{} MiB, {} %]'.format( 'p_min_sigterm [{} MiB, {} %]'.format(
HR,
kib_to_mib(mem_available), kib_to_mib(mem_available),
percent(mem_available / mem_total), percent(mem_available / mem_total),
kib_to_mib(mem_min_sigterm_kb), kib_to_mib(mem_min_sigterm_kb),
@ -2178,10 +2203,9 @@ while True:
if mem_used_zram >= zram_max_sigterm_kb: if mem_used_zram >= zram_max_sigterm_kb:
time0 = time() time0 = time()
mem_info = '{}\nSIGTERM threshold exeeded\nMemory status that requires corrective actions:' \ mem_info = 'Soft threshold exeeded\nMemory status that requires corrective actions:' \
'\n MemUsedZram [{} MiB, {} %] >= ' \ '\n MemUsedZram [{} MiB, {} %] >= ' \
'zram_max_sigterm [{} M, {} %]'.format( 'zram_max_sigterm [{} M, {} %]'.format(
HR,
kib_to_mib(mem_used_zram), kib_to_mib(mem_used_zram),
percent(mem_used_zram / mem_total), percent(mem_used_zram / mem_total),
kib_to_mib(zram_max_sigterm_kb), kib_to_mib(zram_max_sigterm_kb),

View File

@ -65,16 +65,16 @@ zram_max_sigkill = 55 %
Response on PSI memory some/full avg10/avg60/avg300 value Response on PSI memory some/full avg10/avg60/avg300 value
(/proc/pressure/memory on systems with Linux 4.20+). (/proc/pressure/memory on systems with Linux 4.20+).
ignore_psi = False ignore_psi = True
Choose path to PSI file. Choose path to PSI file.
psi_path = /sys/fs/cgroup/unified/user.slice/memory.pressure psi_path = /sys/fs/cgroup/unified/user.slice/memory.pressure
psi_path = /sys/fs/cgroup/unified/system.slice/memory.pressure psi_path = /sys/fs/cgroup/unified/system.slice/memory.pressure
psi_path = /sys/fs/cgroup/unified/system.slice/foo.service/memory.pressure psi_path = /sys/fs/cgroup/unified/system.slice/foo.service/memory.pressure
psi_path = /proc/pressure/memory psi_path = ./psi_dummy
psi_path = ./psi_dummy psi_path = /proc/pressure/memory
psi_metrics = some_avg10 psi_metrics = some_avg10
@ -323,7 +323,7 @@ zram_max_warnings = 40 %
##################################################################### #####################################################################
7. Output verbosity 7. Verbosity
Display the configuration when the program starts. Display the configuration when the program starts.
Valid values are True and False. Valid values are True and False.
@ -333,13 +333,13 @@ print_config = False
Print memory check results. Print memory check results.
Valid values are True and False. Valid values are True and False.
print_mem_check_results = True print_mem_check_results = False
Минимальная периодичность печати состояния памяти. Минимальная периодичность печати состояния памяти.
0 - печатать все проверки памяти. 0 - печатать все проверки памяти.
Неотрицательное число. Неотрицательное число.
min_mem_report_interval = 1 min_mem_report_interval = 10
Print sleep periods between memory checks. Print sleep periods between memory checks.
Valid values are True and False. Valid values are True and False.
@ -372,7 +372,7 @@ max_ancestry_depth = 3
которого жертва получит SIGKILL. которого жертва получит SIGKILL.
Неотрицательные числа. Неотрицательные числа.
max_post_sigterm_victim_lifetime = 9 max_post_sigterm_victim_lifetime = 10
Выполнить произвольную команду после SIGKILL. Выполнить произвольную команду после SIGKILL.
Пустая строка - ничего не выполнять. Пустая строка - ничего не выполнять.