stop running processes in finding victim process
This commit is contained in:
parent
50f08bc894
commit
e2ce62114b
214
nohang
214
nohang
@ -11,7 +11,7 @@ from operator import itemgetter
|
||||
from argparse import ArgumentParser
|
||||
|
||||
from sys import stdout
|
||||
from signal import SIGKILL, SIGTERM
|
||||
from signal import SIGKILL, SIGTERM, SIGSTOP, SIGCONT
|
||||
|
||||
sig_dict = {SIGKILL: 'SIGKILL',
|
||||
SIGTERM: 'SIGTERM'}
|
||||
@ -26,20 +26,40 @@ else:
|
||||
|
||||
wait_time = 14
|
||||
|
||||
max_sleep_time = 2
|
||||
min_sleep_time = 0.1
|
||||
max_sleep_time = 1
|
||||
min_sleep_time = 0.05
|
||||
|
||||
notify_helper_path = '/usr/bin/nohang_notify_helper'
|
||||
|
||||
psi_path = '/proc/pressure/memory'
|
||||
psi_support = os.path.exists(psi_path)
|
||||
|
||||
debug = False
|
||||
|
||||
|
||||
stop_cont = True
|
||||
|
||||
SC_CLK_TCK = os.sysconf(os.sysconf_names['SC_CLK_TCK'])
|
||||
|
||||
|
||||
##########################################################################
|
||||
|
||||
|
||||
# function definition section
|
||||
|
||||
|
||||
def uptime():
    """Return the first space-separated field of /proc/uptime as a float.

    The text is obtained through rline1('/proc/uptime'); per proc(5) the
    first field of that file is the system uptime in seconds.
    """
    first_field = rline1('/proc/uptime').split(' ')[0]
    return float(first_field)
|
||||
|
||||
|
||||
def pid_to_starttime(pid):
    """Return a process start time, converted from clock ticks to seconds.

    pid is a string that is spliced into the /proc/<pid>/stat path.
    Everything up to the last ')' (the end of the comm field) is dropped so
    that a process name containing spaces or parentheses cannot shift the
    field positions.
    """
    after_comm = rline1('/proc/' + pid + '/stat').rpartition(')')[2]
    # after_comm starts with a space, so index 0 is '' and index 20 is the
    # 22nd field of the stat line (starttime according to proc(5)).
    start_ticks = after_comm.split(' ')[20]
    return float(start_ticks) / float(SC_CLK_TCK)
|
||||
|
||||
|
||||
def pid_to_state(pid):
    """Return the one-character state field from /proc/<pid>/stat.

    pid is a string that is spliced into the path. The text after the last
    ')' begins with a space followed by the state character, hence index 1.
    """
    after_comm = rline1('/proc/' + pid + '/stat').rpartition(')')[2]
    return after_comm[1]
|
||||
|
||||
|
||||
def update_stat_dict_and_print(key):
|
||||
if key not in stat_dict:
|
||||
stat_dict.update({key: 1})
|
||||
@ -261,6 +281,22 @@ def pid_to_name(pid):
|
||||
except ProcessLookupError:
|
||||
return ''
|
||||
|
||||
'''
|
||||
# return process name
|
||||
def pid_to_rss(pid):
|
||||
"""
|
||||
|
||||
"""
|
||||
try:
|
||||
with open('/proc/' + pid + '/statm') as f:
|
||||
for line in f:
|
||||
return line.split(' ')[1]
|
||||
except FileNotFoundError:
|
||||
return 0
|
||||
except ProcessLookupError:
|
||||
return 0
|
||||
'''
|
||||
|
||||
|
||||
def pid_to_cmdline(pid):
|
||||
"""
|
||||
@ -277,10 +313,11 @@ def pid_to_cmdline(pid):
|
||||
|
||||
|
||||
def pid_to_uid(pid):
|
||||
'''return euid'''
|
||||
with open('/proc/' + pid + '/status') as f:
|
||||
for n, line in enumerate(f):
|
||||
if n is uid_index:
|
||||
return line.split('\t')[1]
|
||||
return line.split('\t')[2]
|
||||
|
||||
|
||||
def notify_send_wait(title, body):
|
||||
@ -390,6 +427,109 @@ def sleep_after_send_signal(signal):
|
||||
if print_sleep_periods:
|
||||
print(' sleep', min_delay_after_sigterm)
|
||||
sleep(min_delay_after_sigterm)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
def stop():
    """Send SIGSTOP to running processes and return the PIDs that were signalled.

    Scans the entries of /proc in reverse listing order. An entry is
    skipped when its name does not start with a decimal digit, when it is
    PID 1, or when it equals self_pid (defined elsewhere in the file;
    presumably this process's own PID as a string -- confirm).

    A process is stopped only if its state is 'R', its cmdline is not empty
    and its name is not 'Xorg'.

    Returns:
        list of PID strings, in the order SIGSTOP was attempted. A PID is
        appended before the signal is sent, so the list may contain a
        process that vanished before os.kill() reached it.
    """
    print()
    print('Stop running processes...')
    t1 = time()
    t2 = time()
    stopped_list = []
    for pid in os.listdir('/proc')[::-1]:
        # Only entries whose names start with a digit, except /proc/1/.
        # Strings must be compared with ==, not `is`: identity of equal
        # strings is an interpreter implementation detail.
        if not pid[0].isdecimal() or pid == '1' or pid == self_pid:
            continue
        try:
            if pid_to_state(pid) != 'R':
                continue
            # Read cmdline and name once and reuse them for the message.
            cmdline = pid_to_cmdline(pid)
            if cmdline == '':
                continue
            name = pid_to_name(pid)
            if name == 'Xorg':
                continue
            stopped_list.append(pid)
            print('Send SIGSTOP to {}, {}, {}...'.format(
                pid, name, cmdline[:40]))
            os.kill(int(pid), SIGSTOP)
            # t2 records the moment of the last successfully sent SIGSTOP.
            t2 = time()
        except (FileNotFoundError, ProcessLookupError):
            # The process exited while it was being inspected.
            continue
    print('Stop time:', t2 - t1)
    return stopped_list
|
||||
|
||||
|
||||
|
||||
def cont(stopped_list):
    """Send SIGCONT to every PID in stopped_list and print the elapsed time.

    stopped_list holds PID strings (as returned by stop()). Processes that
    no longer exist are skipped silently.
    """
    print()
    print('Continue stopped processes...')
    started_at = time()
    for stopped_pid in stopped_list:
        print('Send SIGCONT to', [stopped_pid], pid_to_name(stopped_pid))
        try:
            os.kill(int(stopped_pid), SIGCONT)
        except (FileNotFoundError, ProcessLookupError):
            # The process is already gone; nothing to resume.
            continue
    finished_at = time()
    print('All cont time: ', finished_at - started_at)
|
||||
|
||||
|
||||
|
||||
def print_states():
    """Print every process whose state is not 'S', then the scan duration.

    Scans the entries of /proc. An entry is skipped when its name does not
    start with a decimal digit, when it is PID 1, or when it equals
    self_pid (defined elsewhere in the file; presumably this process's own
    PID as a string -- confirm). Processes that disappear during the scan
    are skipped.
    """
    print()
    t1 = time()
    print('non-S states:')
    for pid in os.listdir('/proc'):
        # Only entries whose names start with a digit, except /proc/1/.
        # Strings must be compared with ==, not `is`: identity of equal
        # strings is an interpreter implementation detail.
        if not pid[0].isdecimal() or pid == '1' or pid == self_pid:
            continue
        try:
            s = pid_to_state(pid)
            if s == 'S':
                continue
            print('State: {}, [{}], {}, {}...'.format(
                s, pid, pid_to_name(pid), pid_to_cmdline(pid)[:40]))
        except (FileNotFoundError, ProcessLookupError):
            # The process exited while it was being inspected.
            continue
    t2 = time()
    print('print state time:', t2 - t1)
    print()
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
def fattest():
|
||||
@ -402,7 +542,7 @@ def fattest():
|
||||
|
||||
for pid in os.listdir('/proc'):
|
||||
# only directories whose names consist only of numbers, except /proc/1/
|
||||
if pid[0].isdecimal() is False or pid is '1' or pid is self_pid:
|
||||
if pid[0].isdecimal() is False or pid is '1' or pid == self_pid:
|
||||
continue
|
||||
|
||||
# find and modify badness (if it needs)
|
||||
@ -448,8 +588,37 @@ def fattest():
|
||||
pid_badness_list.append((pid, badness))
|
||||
|
||||
# Make list of (pid, badness) tuples, sorted by 'badness' values
|
||||
pid_tuple_list = sorted(
|
||||
pid_badness_list, key=itemgetter(1), reverse=True)[0]
|
||||
pid_tuple_list = sorted(pid_badness_list, key=itemgetter(1), reverse=True)[0]
|
||||
|
||||
|
||||
# badness oom_score oom_score_adj RSS UID NAME (cmdline)
|
||||
if debug:
|
||||
x = sorted(pid_badness_list, key=itemgetter(1), reverse=True)
|
||||
for i in x:
|
||||
try:
|
||||
print('PID: {} | badness: {} | name: {} | eUID: {} | cmdline: {}'.format(
|
||||
i[0].rjust(5),
|
||||
str(i[1]).rjust(5),
|
||||
pid_to_name(i[0]).ljust(15),
|
||||
pid_to_uid(i[0]).rjust(6),
|
||||
pid_to_cmdline(i[0])[:50]
|
||||
))
|
||||
print(pid_to_state(i[0]))
|
||||
|
||||
k = 0.5
|
||||
uptime_ratio = 1 - pid_to_starttime(i[0]) / uptime()
|
||||
uptime_ratio2 = uptime_ratio ** k
|
||||
print(uptime_ratio, uptime_ratio2, i[1], i[1] * uptime_ratio2)
|
||||
|
||||
#print(pid_to_starttime('1'))
|
||||
#print(uptime())
|
||||
|
||||
except FileNotFoundError:
|
||||
print('(FileNotFoundError)')
|
||||
continue
|
||||
except ProcessLookupError:
|
||||
print('(ProcessLookupError)')
|
||||
continue
|
||||
|
||||
pid = pid_tuple_list[0]
|
||||
|
||||
@ -464,6 +633,9 @@ def find_victim_and_send_signal(signal):
|
||||
Find victim with highest badness and send SIGTERM/SIGKILL
|
||||
"""
|
||||
# print()
|
||||
if stop_cont:
|
||||
print_states()
|
||||
stopped_list = stop()
|
||||
|
||||
pid, victim_badness = fattest()
|
||||
name = pid_to_name(pid)
|
||||
@ -594,6 +766,8 @@ def find_victim_and_send_signal(signal):
|
||||
|
||||
if execute_the_command and signal is SIGTERM and name in etc_dict:
|
||||
command = etc_dict[name]
|
||||
if stop_cont:
|
||||
os.kill(int(pid), SIGCONT)
|
||||
exit_status = os.system(etc_dict[name].replace('$PID', pid))
|
||||
if exit_status == 0:
|
||||
exit_status = '\033[32m0\033[0m'
|
||||
@ -622,6 +796,8 @@ def find_victim_and_send_signal(signal):
|
||||
else:
|
||||
|
||||
try:
|
||||
if stop_cont:
|
||||
os.kill(int(pid), SIGCONT)
|
||||
os.kill(int(pid), signal)
|
||||
response_time = time() - time0
|
||||
send_result = '\033[32mOK\033[0m; response time: {} ms'.format(
|
||||
@ -696,12 +872,19 @@ def find_victim_and_send_signal(signal):
|
||||
|
||||
print(stats_msg)
|
||||
|
||||
if stop_cont:
|
||||
print_states()
|
||||
cont(stopped_list)
|
||||
print_states()
|
||||
|
||||
sleep_after_send_signal(signal)
|
||||
|
||||
|
||||
def sleep_after_check_mem():
|
||||
"""Specify sleep times depends on rates and avialable memory."""
|
||||
|
||||
# It's magic!
|
||||
|
||||
if mem_min_sigkill_kb < mem_min_sigterm_kb:
|
||||
mem_point = mem_available - mem_min_sigterm_kb
|
||||
else:
|
||||
@ -1361,6 +1544,23 @@ psi_min_sleep_time_after_action = psi_avg10_sleep_time
|
||||
##########################################################################
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# stopped_list = stop()
|
||||
|
||||
|
||||
# cont(stopped_list)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
if psi_support and not ignore_psi:
|
||||
kill_psi_t0 = time() + psi_avg10_sleep_time
|
||||
term_psi_t0 = time() + psi_avg10_sleep_time
|
||||
|
30
nohang.conf
30
nohang.conf
@ -42,13 +42,13 @@
|
||||
|
||||
MemAvailable levels.
|
||||
|
||||
mem_min_sigterm = 10 %
|
||||
mem_min_sigkill = 5 %
|
||||
mem_min_sigterm = 10%
|
||||
mem_min_sigkill = 5%
|
||||
|
||||
SwapFree levels.
|
||||
|
||||
swap_min_sigterm = 10 %
|
||||
swap_min_sigkill = 5 %
|
||||
swap_min_sigterm = 10%
|
||||
swap_min_sigkill = 5%
|
||||
|
||||
Specifying the total share of zram in memory, if exceeded the
|
||||
corresponding signals are sent. As the share of zram in memory
|
||||
@ -65,7 +65,7 @@ zram_max_sigkill = 55 %
|
||||
Response on PSI memory some avg10 value
|
||||
(/proc/pressure/memory on systems with Linux 4.20+).
|
||||
|
||||
ignore_psi = False
|
||||
ignore_psi = True
|
||||
|
||||
sigterm_psi_avg10 = 60
|
||||
sigkill_psi_avg10 = 90
|
||||
@ -93,8 +93,8 @@ psi_avg10_sleep_time = 60
|
||||
|
||||
Valid values are positive floating-point numbers.
|
||||
|
||||
rate_mem = 4
|
||||
rate_swap = 2
|
||||
rate_mem = 6
|
||||
rate_swap = 3
|
||||
rate_zram = 1
|
||||
|
||||
See also https://github.com/rfjakob/earlyoom/issues/61
|
||||
@ -132,7 +132,7 @@ min_delay_after_sigkill = 0.8
|
||||
Valid values are True and False.
|
||||
Values are case sensitive.
|
||||
|
||||
decrease_oom_score_adj = True
|
||||
decrease_oom_score_adj = False
|
||||
|
||||
Valid values are integers from the range [0; 1000].
|
||||
|
||||
@ -160,7 +160,7 @@ oom_score_adj_max = 30
|
||||
|
||||
Valid values are True and False.
|
||||
|
||||
regex_matching = True
|
||||
regex_matching = False
|
||||
|
||||
Syntax:
|
||||
|
||||
@ -184,7 +184,7 @@ regex_matching = True
|
||||
|
||||
A good option that allows fine adjustment.
|
||||
|
||||
re_match_cmdline = True
|
||||
re_match_cmdline = False
|
||||
|
||||
@CMDLINE_RE 300 /// -childID|--type=renderer
|
||||
|
||||
@ -195,7 +195,7 @@ re_match_cmdline = True
|
||||
|
||||
The slowest option
|
||||
|
||||
re_match_uid = True
|
||||
re_match_uid = False
|
||||
|
||||
@UID_RE -100 /// ^0$
|
||||
|
||||
@ -215,7 +215,7 @@ re_match_uid = True
|
||||
|
||||
Valid values are True and False.
|
||||
|
||||
execute_the_command = True
|
||||
execute_the_command = False
|
||||
|
||||
The length of the process name can't exceed 15 characters.
|
||||
The syntax is as follows: lines starting with keyword $ETC are
|
||||
@ -256,7 +256,7 @@ $ETC firefox-esr /// kill -SEGV $PID
|
||||
See also wiki.archlinux.org/index.php/Desktop_notifications
|
||||
Valid values are True and False.
|
||||
|
||||
gui_notifications = True
|
||||
gui_notifications = False
|
||||
|
||||
Enable GUI notifications about the low level of available memory.
|
||||
Valid values are True and False.
|
||||
@ -294,7 +294,7 @@ zram_max_warnings = 40 %
|
||||
Display the configuration when the program starts.
|
||||
Valid values are True and False.
|
||||
|
||||
print_config = True
|
||||
print_config = False
|
||||
|
||||
Print memory check results.
|
||||
Valid values are True and False.
|
||||
@ -304,5 +304,5 @@ print_mem_check_results = True
|
||||
Print sleep periods between memory checks.
|
||||
Valid values are True and False.
|
||||
|
||||
print_sleep_periods = True
|
||||
print_sleep_periods = False
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user