Added the ability to log into a separate file
This commit is contained in:
parent
6e7eea2da7
commit
71dcb30cfb
15
README.md
15
README.md
@ -51,8 +51,8 @@ The tools listed above may work at the same time on one computer.
|
|||||||
- Notification of corrective actions taken and displaying the name and PID of the victim
|
- Notification of corrective actions taken and displaying the name and PID of the victim
|
||||||
- Low memory warnings (displays available memory)
|
- Low memory warnings (displays available memory)
|
||||||
- `zram` support (`mem_used_total` as a trigger)
|
- `zram` support (`mem_used_total` as a trigger)
|
||||||
- Initial [PSI](https://lwn.net/Articles/759658/) support (since Linux 4.20+, using `/proc/pressure/memory` and `some avg10` as a trigger)
|
- Initial [PSI](https://lwn.net/Articles/759658/) support
|
||||||
- Convenient configuration with a ~~well~~ commented [config file](https://github.com/hakavlad/nohang/blob/master/nohang.conf)
|
- Easy configuration with a ~~well~~ commented [config file](https://github.com/hakavlad/nohang/blob/master/nohang.conf)
|
||||||
|
|
||||||
## Requirements
|
## Requirements
|
||||||
|
|
||||||
@ -186,15 +186,7 @@ See also `man journalctl`.
|
|||||||
|
|
||||||
## Known problems
|
## Known problems
|
||||||
|
|
||||||
- Awful documentation (the problem will be solved gradually in the next releases)
|
- Awful documentation.
|
||||||
- It is written in Python and is actually a prototype (although the algorithm may be good)
|
|
||||||
- No tests (by itself this does not make the algorithm bad)
|
|
||||||
|
|
||||||
## Todo
|
|
||||||
|
|
||||||
- Make installer for non-systemd users
|
|
||||||
- Deb/rpm packaging
|
|
||||||
- Rewrite all code in Golang with tests and good documentation.
|
|
||||||
|
|
||||||
## Nohang don't help you
|
## Nohang don't help you
|
||||||
|
|
||||||
@ -218,6 +210,7 @@ Please create [issues](https://github.com/hakavlad/nohang/issues). Use cases, fe
|
|||||||
- [x] Add delta memory info (the rate of change of available memory)
|
- [x] Add delta memory info (the rate of change of available memory)
|
||||||
- [x] Print statistics on corrective actions after each corrective action
|
- [x] Print statistics on corrective actions after each corrective action
|
||||||
- [x] Added ability to print a process table before each corrective action
|
- [x] Added ability to print a process table before each corrective action
|
||||||
|
- [x] Added the ability to log into a separate file
|
||||||
- [x] Improve poll rate algorithm
|
- [x] Improve poll rate algorithm
|
||||||
- [x] Add `max_post_sigterm_victim_lifetime` option: send SIGKILL to the victim if it doesn't respond to SIGTERM for a certain time
|
- [x] Add `max_post_sigterm_victim_lifetime` option: send SIGKILL to the victim if it doesn't respond to SIGTERM for a certain time
|
||||||
- [x] Improve victim search algorithm (do it ~30% faster) ([rfjakob/earlyoom#114](https://github.com/rfjakob/earlyoom/issues/114))
|
- [x] Improve victim search algorithm (do it ~30% faster) ([rfjakob/earlyoom#114](https://github.com/rfjakob/earlyoom/issues/114))
|
||||||
|
86
nohang
86
nohang
@ -9,6 +9,36 @@ from sys import stdout, stderr, argv, exit
|
|||||||
from signal import SIGKILL, SIGTERM
|
from signal import SIGKILL, SIGTERM
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
|
import logging
|
||||||
|
from logging import basicConfig
|
||||||
|
from logging import info
|
||||||
|
|
||||||
|
|
||||||
|
logfile = '/var/log/nohang/nohang.log'
|
||||||
|
|
||||||
|
|
||||||
|
basicConfig(filename=logfile,
|
||||||
|
level=logging.INFO,
|
||||||
|
format="%(asctime)s: %(message)s")
|
||||||
|
|
||||||
|
|
||||||
|
separate_log = True
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def log(msg):
|
||||||
|
print(msg)
|
||||||
|
if separate_log:
|
||||||
|
info(msg)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
start_time = time()
|
start_time = time()
|
||||||
|
|
||||||
@ -43,7 +73,6 @@ wait_time = 10
|
|||||||
notify_helper_path = '/usr/sbin/nohang_notify_helper'
|
notify_helper_path = '/usr/sbin/nohang_notify_helper'
|
||||||
|
|
||||||
|
|
||||||
HR = ''
|
|
||||||
|
|
||||||
victim_dict = dict()
|
victim_dict = dict()
|
||||||
|
|
||||||
@ -203,8 +232,8 @@ def update_stat_dict_and_print(key):
|
|||||||
|
|
||||||
if print_total_stat:
|
if print_total_stat:
|
||||||
|
|
||||||
stats_msg = '{}\nTotal stat (what happened in the last {}):'.format(
|
stats_msg = 'Total stat (what happened in the last {}):'.format(
|
||||||
HR, format_time(time() - start_time))
|
format_time(time() - start_time))
|
||||||
|
|
||||||
for i in stat_dict:
|
for i in stat_dict:
|
||||||
stats_msg += '\n {}: {}'.format(i, stat_dict[i])
|
stats_msg += '\n {}: {}'.format(i, stat_dict[i])
|
||||||
@ -775,9 +804,9 @@ def find_victim():
|
|||||||
pid_badness_list = []
|
pid_badness_list = []
|
||||||
|
|
||||||
if print_proc_table:
|
if print_proc_table:
|
||||||
print('===============================================================================')
|
log('===============================================================================')
|
||||||
print(' PID badness Name eUID cmdline')
|
log(' PID badness Name eUID cmdline')
|
||||||
print('------- ------- --------------- ---------- ---------------------------------')
|
log('------- ------- --------------- ---------- ---------------------------------')
|
||||||
|
|
||||||
for pid in pid_list:
|
for pid in pid_list:
|
||||||
|
|
||||||
@ -786,7 +815,7 @@ def find_victim():
|
|||||||
continue
|
continue
|
||||||
|
|
||||||
if print_proc_table:
|
if print_proc_table:
|
||||||
print('{} {} {} {} {}'.format(
|
log('{} {} {} {} {}'.format(
|
||||||
pid.rjust(7),
|
pid.rjust(7),
|
||||||
str(badness).rjust(7),
|
str(badness).rjust(7),
|
||||||
pid_to_name(pid).ljust(15),
|
pid_to_name(pid).ljust(15),
|
||||||
@ -813,10 +842,10 @@ def find_victim():
|
|||||||
victim_name = pid_to_name(pid)
|
victim_name = pid_to_name(pid)
|
||||||
|
|
||||||
if print_proc_table:
|
if print_proc_table:
|
||||||
print('===============================================================================')
|
log('===============================================================================')
|
||||||
|
|
||||||
print(
|
log(
|
||||||
'\nProcess with highest badness (found in {} ms):\n PID: {}, Name: {}, badness: {}'.format(
|
'Process with highest badness (found in {} ms):\n PID: {}, Name: {}, badness: {}'.format(
|
||||||
round((time() - ft1) * 1000),
|
round((time() - ft1) * 1000),
|
||||||
pid,
|
pid,
|
||||||
victim_name,
|
victim_name,
|
||||||
@ -987,7 +1016,7 @@ def find_victim_info(pid, victim_badness, name):
|
|||||||
|
|
||||||
victim_lifetime = format_time(uptime() - pid_to_starttime(pid))
|
victim_lifetime = format_time(uptime() - pid_to_starttime(pid))
|
||||||
|
|
||||||
victim_info = '\nVictim information (found in {} ms):' \
|
victim_info = 'Victim information (found in {} ms):' \
|
||||||
'\n Name: {}' \
|
'\n Name: {}' \
|
||||||
'\n State: {}' \
|
'\n State: {}' \
|
||||||
'\n PID: {}' \
|
'\n PID: {}' \
|
||||||
@ -1027,7 +1056,7 @@ def implement_corrective_action(signal):
|
|||||||
Find victim with highest badness and send SIGTERM/SIGKILL
|
Find victim with highest badness and send SIGTERM/SIGKILL
|
||||||
"""
|
"""
|
||||||
|
|
||||||
print(mem_info)
|
log(mem_info)
|
||||||
|
|
||||||
pid, victim_badness, name = find_victim()
|
pid, victim_badness, name = find_victim()
|
||||||
|
|
||||||
@ -1035,7 +1064,7 @@ def implement_corrective_action(signal):
|
|||||||
|
|
||||||
if print_victim_info:
|
if print_victim_info:
|
||||||
victim_info = find_victim_info(pid, victim_badness, name)
|
victim_info = find_victim_info(pid, victim_badness, name)
|
||||||
print(victim_info)
|
log(victim_info)
|
||||||
|
|
||||||
# kill the victim if it doesn't respond to SIGTERM
|
# kill the victim if it doesn't respond to SIGTERM
|
||||||
if signal is SIGTERM:
|
if signal is SIGTERM:
|
||||||
@ -1059,7 +1088,7 @@ def implement_corrective_action(signal):
|
|||||||
m = check_mem_and_swap()
|
m = check_mem_and_swap()
|
||||||
ma = round(int(m[0]) / 1024.0)
|
ma = round(int(m[0]) / 1024.0)
|
||||||
sf = round(int(m[2]) / 1024.0)
|
sf = round(int(m[2]) / 1024.0)
|
||||||
print('\nMemory status before implementing a corrective action:\n MemAvailable'
|
log('Memory status before implementing a corrective action:\n MemAvailable'
|
||||||
': {} MiB, SwapFree: {} MiB'.format(ma, sf))
|
': {} MiB, SwapFree: {} MiB'.format(ma, sf))
|
||||||
|
|
||||||
exit_status = os.system(etc_dict[name].replace(
|
exit_status = os.system(etc_dict[name].replace(
|
||||||
@ -1072,7 +1101,7 @@ def implement_corrective_action(signal):
|
|||||||
|
|
||||||
response_time = time() - time0
|
response_time = time() - time0
|
||||||
|
|
||||||
etc_info = '\nImplement a corrective action:\n Run the command: {}' \
|
etc_info = 'Implement a corrective action:\n Run the command: {}' \
|
||||||
'\n Exit status: {}; total response time: {} ms'.format(
|
'\n Exit status: {}; total response time: {} ms'.format(
|
||||||
command.replace(
|
command.replace(
|
||||||
'$PID', pid).replace(
|
'$PID', pid).replace(
|
||||||
@ -1098,7 +1127,7 @@ def implement_corrective_action(signal):
|
|||||||
m = check_mem_and_swap()
|
m = check_mem_and_swap()
|
||||||
ma = round(int(m[0]) / 1024.0)
|
ma = round(int(m[0]) / 1024.0)
|
||||||
sf = round(int(m[2]) / 1024.0)
|
sf = round(int(m[2]) / 1024.0)
|
||||||
print('\nMemory status before implementing a corrective action:\n MemAvailable'
|
log('Memory status before implementing a corrective action:\n MemAvailable'
|
||||||
': {} MiB, SwapFree: {} MiB'.format(ma, sf))
|
': {} MiB, SwapFree: {} MiB'.format(ma, sf))
|
||||||
|
|
||||||
os.kill(int(pid), signal)
|
os.kill(int(pid), signal)
|
||||||
@ -1106,7 +1135,7 @@ def implement_corrective_action(signal):
|
|||||||
send_result = 'total response time: {} ms'.format(
|
send_result = 'total response time: {} ms'.format(
|
||||||
round(response_time * 1000))
|
round(response_time * 1000))
|
||||||
|
|
||||||
preventing_oom_message = '\nImplement a corrective action:' \
|
preventing_oom_message = 'Implement a corrective action:' \
|
||||||
'\n Send {} to the victim; {}'.format(
|
'\n Send {} to the victim; {}'.format(
|
||||||
sig_dict[signal], send_result)
|
sig_dict[signal], send_result)
|
||||||
|
|
||||||
@ -1135,7 +1164,7 @@ def implement_corrective_action(signal):
|
|||||||
round(response_time * 1000))
|
round(response_time * 1000))
|
||||||
key = 'ProcessLookupError (the victim died in the search process): '
|
key = 'ProcessLookupError (the victim died in the search process): '
|
||||||
|
|
||||||
print(preventing_oom_message)
|
log(preventing_oom_message)
|
||||||
|
|
||||||
update_stat_dict_and_print(key)
|
update_stat_dict_and_print(key)
|
||||||
|
|
||||||
@ -1154,7 +1183,6 @@ def implement_corrective_action(signal):
|
|||||||
key = 'victim badness < min_badness'
|
key = 'victim badness < min_badness'
|
||||||
update_stat_dict_and_print(key)
|
update_stat_dict_and_print(key)
|
||||||
|
|
||||||
print('###############################################################################')
|
|
||||||
|
|
||||||
sleep_after_send_signal(signal)
|
sleep_after_send_signal(signal)
|
||||||
|
|
||||||
@ -1987,18 +2015,18 @@ while True:
|
|||||||
avg10 = find_psi_metrics_value(psi_path, psi_metrics)
|
avg10 = find_psi_metrics_value(psi_path, psi_metrics)
|
||||||
|
|
||||||
if print_mem_check_results:
|
if print_mem_check_results:
|
||||||
avg_value = 'PSI value: {} | '.format(str(avg10).rjust(6))
|
avg_value = 'PSI avg value: {} | '.format(str(avg10).rjust(6))
|
||||||
|
|
||||||
if avg10 >= sigkill_psi and time() - psi_t0 >= psi_avg10_sleep_time:
|
if avg10 >= sigkill_psi and time() - psi_t0 >= psi_avg10_sleep_time:
|
||||||
time0 = time()
|
time0 = time()
|
||||||
mem_info = 'PSI value ({}) > sigkill_psi ({})'.format(avg10, sigkill_psi)
|
mem_info = 'PSI avg value ({}) > sigkill_psi ({})'.format(avg10, sigkill_psi)
|
||||||
implement_corrective_action(SIGKILL)
|
implement_corrective_action(SIGKILL)
|
||||||
psi_t0 = time()
|
psi_t0 = time()
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if avg10 >= sigterm_psi and time() - psi_t0 >= psi_avg10_sleep_time:
|
if avg10 >= sigterm_psi and time() - psi_t0 >= psi_avg10_sleep_time:
|
||||||
time0 = time()
|
time0 = time()
|
||||||
mem_info = 'PSI value ({}) > sigterm_psi ({})'.format(avg10, sigterm_psi)
|
mem_info = 'PSI avg value ({}) > sigterm_psi ({})'.format(avg10, sigterm_psi)
|
||||||
implement_corrective_action(SIGTERM)
|
implement_corrective_action(SIGTERM)
|
||||||
psi_t0 = time()
|
psi_t0 = time()
|
||||||
continue
|
continue
|
||||||
@ -2058,7 +2086,7 @@ while True:
|
|||||||
|
|
||||||
# Output avialable mem sizes
|
# Output avialable mem sizes
|
||||||
if swap_total == 0 and mem_used_zram == 0:
|
if swap_total == 0 and mem_used_zram == 0:
|
||||||
print('{}MemAvail: {} M, {} %{}'.format(
|
log('{}MemAvail: {} M, {} %{}'.format(
|
||||||
avg_value,
|
avg_value,
|
||||||
human(mem_available, mem_len),
|
human(mem_available, mem_len),
|
||||||
just_percent_mem(mem_available / mem_total),
|
just_percent_mem(mem_available / mem_total),
|
||||||
@ -2113,11 +2141,10 @@ while True:
|
|||||||
swap_free <= swap_min_sigkill_kb):
|
swap_free <= swap_min_sigkill_kb):
|
||||||
time0 = time()
|
time0 = time()
|
||||||
|
|
||||||
mem_info = '{}\nSIGKILL threshold exeeded\nMemory status that requires corrective actions:' \
|
mem_info = 'Hard threshold exeeded\nMemory status that requires corrective actions:' \
|
||||||
'\n MemAvailable [{} MiB, {} %] <= mem_min_sig' \
|
'\n MemAvailable [{} MiB, {} %] <= mem_min_sig' \
|
||||||
'kill [{} MiB, {} %]\n SwapFree [{} MiB, {} %] <= swa' \
|
'kill [{} MiB, {} %]\n SwapFree [{} MiB, {} %] <= swa' \
|
||||||
'p_min_sigkill [{} MiB, {} %]'.format(
|
'p_min_sigkill [{} MiB, {} %]'.format(
|
||||||
HR,
|
|
||||||
kib_to_mib(mem_available),
|
kib_to_mib(mem_available),
|
||||||
percent(mem_available / mem_total),
|
percent(mem_available / mem_total),
|
||||||
kib_to_mib(mem_min_sigkill_kb),
|
kib_to_mib(mem_min_sigkill_kb),
|
||||||
@ -2135,10 +2162,9 @@ while True:
|
|||||||
if mem_used_zram >= zram_max_sigkill_kb:
|
if mem_used_zram >= zram_max_sigkill_kb:
|
||||||
time0 = time()
|
time0 = time()
|
||||||
|
|
||||||
mem_info = '{}\nSIGKILL threshold exeeded\nMemory status that requires corrective actions:' \
|
mem_info = 'Hard threshold exeeded\nMemory status that requires corrective actions:' \
|
||||||
'\n MemUsedZram [{} MiB, {} %] >= zram_max_sig' \
|
'\n MemUsedZram [{} MiB, {} %] >= zram_max_sig' \
|
||||||
'kill [{} MiB, {} %]'.format(
|
'kill [{} MiB, {} %]'.format(
|
||||||
HR,
|
|
||||||
kib_to_mib(mem_used_zram),
|
kib_to_mib(mem_used_zram),
|
||||||
percent(mem_used_zram / mem_total),
|
percent(mem_used_zram / mem_total),
|
||||||
kib_to_mib(zram_max_sigkill_kb),
|
kib_to_mib(zram_max_sigkill_kb),
|
||||||
@ -2154,11 +2180,10 @@ while True:
|
|||||||
|
|
||||||
time0 = time()
|
time0 = time()
|
||||||
|
|
||||||
mem_info = '{}\nSIGTERM threshold exeeded\nMemory status that requires corrective actions:' \
|
mem_info = 'Soft threshold exeeded\nMemory status that requires corrective actions:' \
|
||||||
'\n MemAvailable [{} MiB, {} %] <= mem_min_sig' \
|
'\n MemAvailable [{} MiB, {} %] <= mem_min_sig' \
|
||||||
'term [{} MiB, {} %]\n SwapFree [{} MiB, {} %] <= swa' \
|
'term [{} MiB, {} %]\n SwapFree [{} MiB, {} %] <= swa' \
|
||||||
'p_min_sigterm [{} MiB, {} %]'.format(
|
'p_min_sigterm [{} MiB, {} %]'.format(
|
||||||
HR,
|
|
||||||
kib_to_mib(mem_available),
|
kib_to_mib(mem_available),
|
||||||
percent(mem_available / mem_total),
|
percent(mem_available / mem_total),
|
||||||
kib_to_mib(mem_min_sigterm_kb),
|
kib_to_mib(mem_min_sigterm_kb),
|
||||||
@ -2178,10 +2203,9 @@ while True:
|
|||||||
if mem_used_zram >= zram_max_sigterm_kb:
|
if mem_used_zram >= zram_max_sigterm_kb:
|
||||||
time0 = time()
|
time0 = time()
|
||||||
|
|
||||||
mem_info = '{}\nSIGTERM threshold exeeded\nMemory status that requires corrective actions:' \
|
mem_info = 'Soft threshold exeeded\nMemory status that requires corrective actions:' \
|
||||||
'\n MemUsedZram [{} MiB, {} %] >= ' \
|
'\n MemUsedZram [{} MiB, {} %] >= ' \
|
||||||
'zram_max_sigterm [{} M, {} %]'.format(
|
'zram_max_sigterm [{} M, {} %]'.format(
|
||||||
HR,
|
|
||||||
kib_to_mib(mem_used_zram),
|
kib_to_mib(mem_used_zram),
|
||||||
percent(mem_used_zram / mem_total),
|
percent(mem_used_zram / mem_total),
|
||||||
kib_to_mib(zram_max_sigterm_kb),
|
kib_to_mib(zram_max_sigterm_kb),
|
||||||
|
14
nohang.conf
14
nohang.conf
@ -65,17 +65,17 @@ zram_max_sigkill = 55 %
|
|||||||
Response on PSI memory some/full avg10/avg60/avg300 value
|
Response on PSI memory some/full avg10/avg60/avg300 value
|
||||||
(/proc/pressure/memory on systems with Linux 4.20+).
|
(/proc/pressure/memory on systems with Linux 4.20+).
|
||||||
|
|
||||||
ignore_psi = False
|
ignore_psi = True
|
||||||
|
|
||||||
Choose path to PSI file.
|
Choose path to PSI file.
|
||||||
|
|
||||||
psi_path = /sys/fs/cgroup/unified/user.slice/memory.pressure
|
psi_path = /sys/fs/cgroup/unified/user.slice/memory.pressure
|
||||||
psi_path = /sys/fs/cgroup/unified/system.slice/memory.pressure
|
psi_path = /sys/fs/cgroup/unified/system.slice/memory.pressure
|
||||||
psi_path = /sys/fs/cgroup/unified/system.slice/foo.service/memory.pressure
|
psi_path = /sys/fs/cgroup/unified/system.slice/foo.service/memory.pressure
|
||||||
psi_path = /proc/pressure/memory
|
|
||||||
|
|
||||||
psi_path = ./psi_dummy
|
psi_path = ./psi_dummy
|
||||||
|
|
||||||
|
psi_path = /proc/pressure/memory
|
||||||
|
|
||||||
psi_metrics = some_avg10
|
psi_metrics = some_avg10
|
||||||
|
|
||||||
sigterm_psi_avg10 = 60
|
sigterm_psi_avg10 = 60
|
||||||
@ -323,7 +323,7 @@ zram_max_warnings = 40 %
|
|||||||
|
|
||||||
#####################################################################
|
#####################################################################
|
||||||
|
|
||||||
7. Output verbosity
|
7. Verbosity
|
||||||
|
|
||||||
Display the configuration when the program starts.
|
Display the configuration when the program starts.
|
||||||
Valid values are True and False.
|
Valid values are True and False.
|
||||||
@ -333,13 +333,13 @@ print_config = False
|
|||||||
Print memory check results.
|
Print memory check results.
|
||||||
Valid values are True and False.
|
Valid values are True and False.
|
||||||
|
|
||||||
print_mem_check_results = True
|
print_mem_check_results = False
|
||||||
|
|
||||||
Минимальная периодичность печати состояния памяти.
|
Минимальная периодичность печати состояния памяти.
|
||||||
0 - печатать все проверки памяти.
|
0 - печатать все проверки памяти.
|
||||||
Неотрицательное число.
|
Неотрицательное число.
|
||||||
|
|
||||||
min_mem_report_interval = 1
|
min_mem_report_interval = 10
|
||||||
|
|
||||||
Print sleep periods between memory checks.
|
Print sleep periods between memory checks.
|
||||||
Valid values are True and False.
|
Valid values are True and False.
|
||||||
@ -372,7 +372,7 @@ max_ancestry_depth = 3
|
|||||||
которого жертва получит SIGKILL.
|
которого жертва получит SIGKILL.
|
||||||
Неотрицательные числа.
|
Неотрицательные числа.
|
||||||
|
|
||||||
max_post_sigterm_victim_lifetime = 9
|
max_post_sigterm_victim_lifetime = 10
|
||||||
|
|
||||||
Выполнить произвольную команду после SIGKILL.
|
Выполнить произвольную команду после SIGKILL.
|
||||||
Пустая строка - ничего не выполнять.
|
Пустая строка - ничего не выполнять.
|
||||||
|
Loading…
Reference in New Issue
Block a user