fix victim info: display ppid
This commit is contained in:
parent
f3d2e4e099
commit
a1e7b25ad0
@ -193,7 +193,7 @@ Please create [issues](https://github.com/hakavlad/nohang/issues). Use cases, fe
|
||||
- Fix: replace `re.fullmatch()` with `re.search()`
|
||||
- Validate regex patterns at startup
|
||||
- Improve output:
|
||||
- Display `oom_score`, `oom_score_adj`, `euid`, `state`, `VmSize`, `RssAnon`, `RssFile`, `RssShmem`, `realpath` and `cmdline` of the victim in corrective action reports
|
||||
- Display `oom_score`, `oom_score_adj`, `PPID`, `EUID`, `State`, `VmSize`, `RssAnon`, `RssFile`, `RssShmem`, `realpath` and `cmdline` of the victim in corrective action reports
|
||||
- Print in terminal with colors
|
||||
- Print statistics on corrective actions after each corrective action
|
||||
- Improve poll rate algorithm
|
||||
|
122
nohang
122
nohang
@ -1,11 +1,11 @@
|
||||
#!/usr/bin/env python3
|
||||
"""A daemon that prevents OOM in Linux systems."""
|
||||
import os
|
||||
import ctypes
|
||||
from ctypes import CDLL
|
||||
from time import sleep, time
|
||||
from operator import itemgetter
|
||||
from sys import stdout
|
||||
from signal import SIGKILL, SIGTERM, SIGSTOP, SIGCONT
|
||||
from signal import SIGKILL, SIGTERM
|
||||
|
||||
start_time = time()
|
||||
|
||||
@ -40,11 +40,6 @@ HR = '~' * 79
|
||||
# todo: make config option
|
||||
print_total_stat = True
|
||||
|
||||
|
||||
stop_cont = False
|
||||
stop_cont_warn = False
|
||||
|
||||
|
||||
##########################################################################
|
||||
|
||||
# define functions
|
||||
@ -56,69 +51,26 @@ def mlockall():
|
||||
MCL_FUTURE = 2
|
||||
MCL_ONFAULT = 4
|
||||
|
||||
libc = ctypes.CDLL('libc.so.6', use_errno=True)
|
||||
libc = CDLL('libc.so.6', use_errno=True)
|
||||
|
||||
result = libc.mlockall(MCL_CURRENT|MCL_FUTURE|MCL_ONFAULT)
|
||||
result = libc.mlockall(
|
||||
MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT
|
||||
)
|
||||
if result != 0:
|
||||
result = libc.mlockall(MCL_CURRENT|MCL_FUTURE)
|
||||
result = libc.mlockall(
|
||||
MCL_CURRENT | MCL_FUTURE
|
||||
)
|
||||
if result != 0:
|
||||
print('Can not lock all memory')
|
||||
print('Cannot lock all memory')
|
||||
else:
|
||||
print('All memory locked with MCL_CURRENT|MCL_FUTURE')
|
||||
print('All memory locked with MCL_CURRENT | MCL_FUTURE')
|
||||
else:
|
||||
print('All memory locked with MCL_CURRENT|MCL_FUTURE|MCL_ONFAULT')
|
||||
print('All memory locked with MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT')
|
||||
|
||||
|
||||
def pid_to_state(pid):
    """Return the single-character state code of a process.

    pid: str process pid (it is concatenated into the /proc path)

    The value is taken from /proc/<pid>/stat. The text is split at the
    LAST ')' so that a process name that itself contains parentheses
    cannot shift the result; the remainder starts with a separator
    character followed by the state letter.
    """
    # NOTE(review): rline1 is defined elsewhere in this file; presumably
    # it returns the first line of the given file -- confirm.
    stat_line = rline1('/proc/' + pid + '/stat')
    _head, _paren, after_comm = stat_line.rpartition(')')
    # after_comm[0] is the separator, after_comm[1] is the state letter
    return after_comm[1]
|
||||
|
||||
def stop():
    """Send SIGSTOP to non-root processes with a noticeable oom_score.

    Walks the entries of /proc in reverse listing order. An entry is
    skipped when its name does not start with a decimal digit, when it
    is PID 1, or when it is this process itself (self_pid). For every
    remaining process whose oom_score is greater than 9 and whose uid
    is not '0', the pid is recorded and the process is sent SIGSTOP.

    Processes that disappear while being inspected are silently skipped.

    Returns the list of pids (as str) that were recorded, in the order
    they were handled, so that the caller can later resume them.
    """
    t1 = time()
    # initialised up front so the elapsed time can be printed even when
    # no process was stopped
    t2 = time()
    stopped_list = []

    for pid in os.listdir('/proc')[::-1]:
        # Only entries whose name starts with a digit are considered;
        # /proc/1 and this process itself are never stopped.
        # Strings must be compared with ==, not `is`: identity of equal
        # strings is an implementation detail of the interpreter.
        if not pid[0].isdecimal() or pid == '1' or pid == self_pid:
            continue

        try:
            oom_score_r = int(rline1('/proc/' + pid + '/oom_score'))
            if oom_score_r > 9:
                # NOTE(review): pid_to_uid is defined elsewhere; it is
                # compared with the string '0' here, so it presumably
                # returns the uid as str -- confirm.
                uid_r = pid_to_uid(pid)
                if uid_r != '0':
                    stopped_list.append(pid)
                    print('Send SIGSTOP to {}, {}, {}...'.format(
                        pid, pid_to_name(pid), pid_to_cmdline(pid)[:40]))
                    os.kill(int(pid), SIGSTOP)
                    t2 = time()
        except (FileNotFoundError, ProcessLookupError):
            # the process exited while we were looking at it
            continue

    print('Stop time:', t2 - t1)
    stdout.flush()

    return stopped_list
|
||||
|
||||
def cont(stopped_list):
    """Send SIGCONT to every process in stopped_list.

    stopped_list: list of str pids

    Each pid is announced on stdout and then signalled. A pid whose
    process no longer exists is skipped. The time spent on the whole
    pass is printed at the end.
    """
    print()
    print('Continue stopped processes...')
    started = time()

    # iterating an empty list is a no-op, so no explicit length check
    for stopped_pid in stopped_list:
        print('Send SIGCONT to', [stopped_pid], pid_to_name(stopped_pid))
        try:
            os.kill(int(stopped_pid), SIGCONT)
        except (FileNotFoundError, ProcessLookupError):
            # the process is already gone; nothing to resume
            continue

    finished = time()
    print('All cont time: ', finished - started)
|
||||
|
||||
|
||||
|
||||
def update_stat_dict_and_print(key):
|
||||
|
||||
@ -404,9 +356,6 @@ def send_notify_warn():
|
||||
(implement Low memory warnings)
|
||||
"""
|
||||
|
||||
if stop_cont_warn:
|
||||
stopped_list = stop()
|
||||
|
||||
# find process with max badness
|
||||
fat_tuple = fattest()
|
||||
pid = fat_tuple[0]
|
||||
@ -428,7 +377,7 @@ def send_notify_warn():
|
||||
# title = 'Low memory: {}'.format(low_mem_percent)
|
||||
title = 'Low memory'
|
||||
|
||||
body = 'Hog: <b>{}</b> [{}]'.format(
|
||||
body = 'Hog: <b>{}</b>, PID: {}'.format(
|
||||
name.replace(
|
||||
# symbol '&' can break notifications in some themes,
|
||||
# therefore it is replaced by '*'
|
||||
@ -443,9 +392,6 @@ def send_notify_warn():
|
||||
# send notification to user that runs this nohang
|
||||
notify_send_wait(title, body)
|
||||
|
||||
if stop_cont_warn:
|
||||
cont(stopped_list)
|
||||
|
||||
|
||||
def send_notify(signal, name, pid):
|
||||
"""
|
||||
@ -456,7 +402,7 @@ def send_notify(signal, name, pid):
|
||||
pid: str process pid
|
||||
"""
|
||||
title = 'Hang prevention'
|
||||
body = '<b>{} {}</b> [{}]'.format(
|
||||
body = '<b>{} {}</b>, PID: {}'.format(
|
||||
notify_sig_dict[signal],
|
||||
name.replace(
|
||||
# symbol '&' can break notifications in some themes,
|
||||
@ -614,11 +560,6 @@ def find_victim_and_send_signal(signal):
|
||||
-> implement_corrective_action()
|
||||
"""
|
||||
|
||||
|
||||
if stop_cont:
|
||||
stopped_list = stop()
|
||||
|
||||
|
||||
pid, victim_badness = fattest()
|
||||
name = pid_to_name(pid)
|
||||
|
||||
@ -633,8 +574,14 @@ def find_victim_and_send_signal(signal):
|
||||
with open('/proc/' + pid + '/status') as f:
|
||||
for n, line in enumerate(f):
|
||||
|
||||
|
||||
|
||||
if n is ppid_index:
|
||||
ppid = line.split('\t')[1]
|
||||
|
||||
|
||||
if n is uid_index:
|
||||
uid = line.split('\t')[1]
|
||||
uid = line.split('\t')[2]
|
||||
continue
|
||||
|
||||
if n is vm_size_index:
|
||||
@ -691,16 +638,13 @@ def find_victim_and_send_signal(signal):
|
||||
|
||||
for i in range(len(f_list)):
|
||||
if i is ppid_index:
|
||||
ppid = f_list[i].split('\t')[2]
|
||||
|
||||
ppid = f_list[i].split('\t')[1]
|
||||
|
||||
for i in range(len(f_list)):
|
||||
if i is uid_index:
|
||||
uid = f_list[i].split('\t')[2]
|
||||
|
||||
|
||||
|
||||
|
||||
if i is vm_size_index:
|
||||
vm_size = kib_to_mib(
|
||||
int(f_list[i].split('\t')[1][:-3]))
|
||||
@ -748,10 +692,10 @@ def find_victim_and_send_signal(signal):
|
||||
len_vm = len(str(vm_size))
|
||||
|
||||
|
||||
|
||||
realpath = os.path.realpath('/proc/' + pid + '/exe')
|
||||
state = pid_to_state(pid)
|
||||
|
||||
pname = pid_to_name(ppid.strip('\n '))
|
||||
# print([ppid], [pname])
|
||||
|
||||
|
||||
if detailed_rss:
|
||||
@ -760,7 +704,8 @@ def find_victim_and_send_signal(signal):
|
||||
'\n Name: \033[33m{}\033[0m' \
|
||||
'\n State: \033[33m{}\033[0m' \
|
||||
'\n PID: \033[33m{}\033[0m' \
|
||||
'\n UID: \033[33m{}\033[0m' \
|
||||
'\n PPID: \033[33m{}\033[0m (\033[33m{}\033[0m)' \
|
||||
'\n EUID: \033[33m{}\033[0m' \
|
||||
'\n badness: \033[33m{}\033[0m, ' \
|
||||
'oom_score: \033[33m{}\033[0m, ' \
|
||||
'oom_score_adj: \033[33m{}\033[0m' \
|
||||
@ -775,6 +720,8 @@ def find_victim_and_send_signal(signal):
|
||||
name,
|
||||
state,
|
||||
pid,
|
||||
ppid.strip('\n '),
|
||||
pname,
|
||||
uid,
|
||||
victim_badness,
|
||||
oom_score,
|
||||
@ -853,13 +800,8 @@ def find_victim_and_send_signal(signal):
|
||||
m = check_mem_and_swap()
|
||||
ma = round(int(m[0]) / 1024.0)
|
||||
sf = round(int(m[2]) / 1024.0)
|
||||
print('\nMemory status before sending a signal:\nMemA'
|
||||
'v: {} MiB, SwFree: {} MiB'.format(ma, sf))
|
||||
|
||||
|
||||
if stop_cont:
|
||||
os.kill(int(pid), SIGCONT)
|
||||
|
||||
print('\nMemory status before sending a signal:\n MemAvailable'
|
||||
': {} MiB, SwapFree: {} MiB'.format(ma, sf))
|
||||
|
||||
os.kill(int(pid), signal)
|
||||
response_time = time() - time0
|
||||
@ -909,10 +851,6 @@ def find_victim_and_send_signal(signal):
|
||||
key = 'victim badness < min_badness'
|
||||
update_stat_dict_and_print(key)
|
||||
|
||||
|
||||
if stop_cont:
|
||||
cont(stopped_list)
|
||||
|
||||
sleep_after_send_signal(signal)
|
||||
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user