fix victim info: display ppid
This commit is contained in:
parent
f3d2e4e099
commit
a1e7b25ad0
@ -193,7 +193,7 @@ Please create [issues](https://github.com/hakavlad/nohang/issues). Use cases, fe
|
|||||||
- Fix: replace `re.fullmatch()` by `re.search()`
|
- Fix: replace `re.fullmatch()` by `re.search()`
|
||||||
- Validation RE patterns at startup
|
- Validation RE patterns at startup
|
||||||
- Improve output:
|
- Improve output:
|
||||||
- Display `oom_score`, `oom_score_adj`, `euid`, `state`, `VmSize`, `RssAnon`, `RssFile`, `RssShmem`, `realpath` and `cmdline` of the victim in corrective action reports
|
- Display `oom_score`, `oom_score_adj`, `PPID`, `EUID`, `State`, `VmSize`, `RssAnon`, `RssFile`, `RssShmem`, `realpath` and `cmdline` of the victim in corrective action reports
|
||||||
- Print in terminal with colors
|
- Print in terminal with colors
|
||||||
- Print statistics on corrective actions after each corrective action
|
- Print statistics on corrective actions after each corrective action
|
||||||
- Improve poll rate algorithm
|
- Improve poll rate algorithm
|
||||||
|
116
nohang
116
nohang
@ -1,11 +1,11 @@
|
|||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
"""A daemon that prevents OOM in Linux systems."""
|
"""A daemon that prevents OOM in Linux systems."""
|
||||||
import os
|
import os
|
||||||
import ctypes
|
from ctypes import CDLL
|
||||||
from time import sleep, time
|
from time import sleep, time
|
||||||
from operator import itemgetter
|
from operator import itemgetter
|
||||||
from sys import stdout
|
from sys import stdout
|
||||||
from signal import SIGKILL, SIGTERM, SIGSTOP, SIGCONT
|
from signal import SIGKILL, SIGTERM
|
||||||
|
|
||||||
start_time = time()
|
start_time = time()
|
||||||
|
|
||||||
@ -40,11 +40,6 @@ HR = '~' * 79
|
|||||||
# todo: make config option
|
# todo: make config option
|
||||||
print_total_stat = True
|
print_total_stat = True
|
||||||
|
|
||||||
|
|
||||||
stop_cont = False
|
|
||||||
stop_cont_warn = False
|
|
||||||
|
|
||||||
|
|
||||||
##########################################################################
|
##########################################################################
|
||||||
|
|
||||||
# define functions
|
# define functions
|
||||||
@ -56,11 +51,15 @@ def mlockall():
|
|||||||
MCL_FUTURE = 2
|
MCL_FUTURE = 2
|
||||||
MCL_ONFAULT = 4
|
MCL_ONFAULT = 4
|
||||||
|
|
||||||
libc = ctypes.CDLL('libc.so.6', use_errno=True)
|
libc = CDLL('libc.so.6', use_errno=True)
|
||||||
|
|
||||||
result = libc.mlockall(MCL_CURRENT|MCL_FUTURE|MCL_ONFAULT)
|
result = libc.mlockall(
|
||||||
|
MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT
|
||||||
|
)
|
||||||
if result != 0:
|
if result != 0:
|
||||||
result = libc.mlockall(MCL_CURRENT|MCL_FUTURE)
|
result = libc.mlockall(
|
||||||
|
MCL_CURRENT | MCL_FUTURE
|
||||||
|
)
|
||||||
if result != 0:
|
if result != 0:
|
||||||
print('Cannot lock all memory')
|
print('Cannot lock all memory')
|
||||||
else:
|
else:
|
||||||
@ -72,53 +71,6 @@ def mlockall():
|
|||||||
def pid_to_state(pid):
    """Return the state character of process *pid* taken from its stat file.

    pid: str process pid (it is concatenated into the /proc path)

    The text is split at the LAST ')' because the command name in
    /proc/<pid>/stat is wrapped in parentheses and may itself contain ')'.
    What follows is ' <state> ...', so index 1 is the state letter.
    NOTE(review): assumes rline1() returns the first line of the file as str
    -- confirm against its definition.
    """
    stat_line = rline1('/proc/' + pid + '/stat')
    _, _, after_comm = stat_line.rpartition(')')
    return after_comm[1]
|
||||||
|
|
||||||
def stop():
    """Send SIGSTOP to selected processes and return their PIDs.

    Walks the entries of /proc in reverse listing order. An entry is skipped
    when its name does not start with a decimal digit, when it is '1', or
    when it equals self_pid. For the remaining entries the oom_score file is
    read; a process is stopped only if its oom_score is greater than 9 and
    pid_to_uid() does not return '0'.
    NOTE(review): '0' presumably means the process belongs to root -- confirm
    against pid_to_uid().

    Processes that disappear while being inspected (FileNotFoundError,
    ProcessLookupError) are silently skipped.

    Returns: list of str PIDs that were appended before SIGSTOP was sent.
    Side effects: prints one line per stopped process and the elapsed time,
    then flushes stdout.
    """
    t1 = time()
    # stays equal to the start time if nothing gets stopped
    t2 = time()
    stopped_list = []

    for pid in os.listdir('/proc')[::-1]:

        # only directories whose names consist only of numbers,
        # except /proc/1/ and this process itself
        # ('==' instead of 'is': identity of equal strings is not guaranteed)
        if not pid[0].isdecimal() or pid == '1' or pid == self_pid:
            continue

        try:
            oom_score_r = int(rline1('/proc/' + pid + '/oom_score'))
            if oom_score_r > 9:
                uid_r = pid_to_uid(pid)
                if uid_r != '0':
                    # recorded before the signal is sent, as in the original
                    stopped_list.append(pid)
                    print('Send SIGSTOP to {}, {}, {}...'.format(
                        pid, pid_to_name(pid), pid_to_cmdline(pid)[:40]))
                    os.kill(int(pid), SIGSTOP)
                    t2 = time()
        except (FileNotFoundError, ProcessLookupError):
            # the process vanished between listing and inspection
            continue

    print('Stop time:', t2 - t1)
    stdout.flush()

    return stopped_list
|
|
||||||
|
|
||||||
def cont(stopped_list):
    """Send SIGCONT to every PID in *stopped_list* and print the time spent.

    stopped_list: iterable of str PIDs (each one is passed through int()
                  before being handed to os.kill)

    A PID whose os.kill() call raises FileNotFoundError or
    ProcessLookupError is skipped. Returns None.
    """
    print()
    print('Continue stopped processes...')
    started = time()

    for stopped_pid in stopped_list:
        print('Send SIGCONT to', [stopped_pid], pid_to_name(stopped_pid))
        try:
            os.kill(int(stopped_pid), SIGCONT)
        except (FileNotFoundError, ProcessLookupError):
            # nothing to resume for this PID; move on to the next one
            pass

    finished = time()
    print('All cont time: ', finished - started)
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def update_stat_dict_and_print(key):
|
def update_stat_dict_and_print(key):
|
||||||
|
|
||||||
@ -404,9 +356,6 @@ def send_notify_warn():
|
|||||||
(implement Low memory warnings)
|
(implement Low memory warnings)
|
||||||
"""
|
"""
|
||||||
|
|
||||||
if stop_cont_warn:
|
|
||||||
stopped_list = stop()
|
|
||||||
|
|
||||||
# find process with max badness
|
# find process with max badness
|
||||||
fat_tuple = fattest()
|
fat_tuple = fattest()
|
||||||
pid = fat_tuple[0]
|
pid = fat_tuple[0]
|
||||||
@ -428,7 +377,7 @@ def send_notify_warn():
|
|||||||
# title = 'Low memory: {}'.format(low_mem_percent)
|
# title = 'Low memory: {}'.format(low_mem_percent)
|
||||||
title = 'Low memory'
|
title = 'Low memory'
|
||||||
|
|
||||||
body = 'Hog: <b>{}</b> [{}]'.format(
|
body = 'Hog: <b>{}</b>, PID: {}'.format(
|
||||||
name.replace(
|
name.replace(
|
||||||
# symbol '&' can break notifications in some themes,
|
# symbol '&' can break notifications in some themes,
|
||||||
# therefore it is replaced by '*'
|
# therefore it is replaced by '*'
|
||||||
@ -443,9 +392,6 @@ def send_notify_warn():
|
|||||||
# send notification to user that runs this nohang
|
# send notification to user that runs this nohang
|
||||||
notify_send_wait(title, body)
|
notify_send_wait(title, body)
|
||||||
|
|
||||||
if stop_cont_warn:
|
|
||||||
cont(stopped_list)
|
|
||||||
|
|
||||||
|
|
||||||
def send_notify(signal, name, pid):
|
def send_notify(signal, name, pid):
|
||||||
"""
|
"""
|
||||||
@ -456,7 +402,7 @@ def send_notify(signal, name, pid):
|
|||||||
pid: str process pid
|
pid: str process pid
|
||||||
"""
|
"""
|
||||||
title = 'Hang prevention'
|
title = 'Hang prevention'
|
||||||
body = '<b>{} {}</b> [{}]'.format(
|
body = '<b>{} {}</b>, PID: {}'.format(
|
||||||
notify_sig_dict[signal],
|
notify_sig_dict[signal],
|
||||||
name.replace(
|
name.replace(
|
||||||
# symbol '&' can break notifications in some themes,
|
# symbol '&' can break notifications in some themes,
|
||||||
@ -614,11 +560,6 @@ def find_victim_and_send_signal(signal):
|
|||||||
-> implement_corrective_action()
|
-> implement_corrective_action()
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
if stop_cont:
|
|
||||||
stopped_list = stop()
|
|
||||||
|
|
||||||
|
|
||||||
pid, victim_badness = fattest()
|
pid, victim_badness = fattest()
|
||||||
name = pid_to_name(pid)
|
name = pid_to_name(pid)
|
||||||
|
|
||||||
@ -633,8 +574,14 @@ def find_victim_and_send_signal(signal):
|
|||||||
with open('/proc/' + pid + '/status') as f:
|
with open('/proc/' + pid + '/status') as f:
|
||||||
for n, line in enumerate(f):
|
for n, line in enumerate(f):
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
if n is ppid_index:
|
||||||
|
ppid = line.split('\t')[1]
|
||||||
|
|
||||||
|
|
||||||
if n is uid_index:
|
if n is uid_index:
|
||||||
uid = line.split('\t')[1]
|
uid = line.split('\t')[2]
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if n is vm_size_index:
|
if n is vm_size_index:
|
||||||
@ -691,16 +638,13 @@ def find_victim_and_send_signal(signal):
|
|||||||
|
|
||||||
for i in range(len(f_list)):
|
for i in range(len(f_list)):
|
||||||
if i is ppid_index:
|
if i is ppid_index:
|
||||||
ppid = f_list[i].split('\t')[2]
|
ppid = f_list[i].split('\t')[1]
|
||||||
|
|
||||||
|
|
||||||
for i in range(len(f_list)):
|
for i in range(len(f_list)):
|
||||||
if i is uid_index:
|
if i is uid_index:
|
||||||
uid = f_list[i].split('\t')[2]
|
uid = f_list[i].split('\t')[2]
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
if i is vm_size_index:
|
if i is vm_size_index:
|
||||||
vm_size = kib_to_mib(
|
vm_size = kib_to_mib(
|
||||||
int(f_list[i].split('\t')[1][:-3]))
|
int(f_list[i].split('\t')[1][:-3]))
|
||||||
@ -748,10 +692,10 @@ def find_victim_and_send_signal(signal):
|
|||||||
len_vm = len(str(vm_size))
|
len_vm = len(str(vm_size))
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
realpath = os.path.realpath('/proc/' + pid + '/exe')
|
realpath = os.path.realpath('/proc/' + pid + '/exe')
|
||||||
state = pid_to_state(pid)
|
state = pid_to_state(pid)
|
||||||
|
pname = pid_to_name(ppid.strip('\n '))
|
||||||
|
# print([ppid], [pname])
|
||||||
|
|
||||||
|
|
||||||
if detailed_rss:
|
if detailed_rss:
|
||||||
@ -760,7 +704,8 @@ def find_victim_and_send_signal(signal):
|
|||||||
'\n Name: \033[33m{}\033[0m' \
|
'\n Name: \033[33m{}\033[0m' \
|
||||||
'\n State: \033[33m{}\033[0m' \
|
'\n State: \033[33m{}\033[0m' \
|
||||||
'\n PID: \033[33m{}\033[0m' \
|
'\n PID: \033[33m{}\033[0m' \
|
||||||
'\n UID: \033[33m{}\033[0m' \
|
'\n PPID: \033[33m{}\033[0m (\033[33m{}\033[0m)' \
|
||||||
|
'\n EUID: \033[33m{}\033[0m' \
|
||||||
'\n badness: \033[33m{}\033[0m, ' \
|
'\n badness: \033[33m{}\033[0m, ' \
|
||||||
'oom_score: \033[33m{}\033[0m, ' \
|
'oom_score: \033[33m{}\033[0m, ' \
|
||||||
'oom_score_adj: \033[33m{}\033[0m' \
|
'oom_score_adj: \033[33m{}\033[0m' \
|
||||||
@ -775,6 +720,8 @@ def find_victim_and_send_signal(signal):
|
|||||||
name,
|
name,
|
||||||
state,
|
state,
|
||||||
pid,
|
pid,
|
||||||
|
ppid.strip('\n '),
|
||||||
|
pname,
|
||||||
uid,
|
uid,
|
||||||
victim_badness,
|
victim_badness,
|
||||||
oom_score,
|
oom_score,
|
||||||
@ -853,13 +800,8 @@ def find_victim_and_send_signal(signal):
|
|||||||
m = check_mem_and_swap()
|
m = check_mem_and_swap()
|
||||||
ma = round(int(m[0]) / 1024.0)
|
ma = round(int(m[0]) / 1024.0)
|
||||||
sf = round(int(m[2]) / 1024.0)
|
sf = round(int(m[2]) / 1024.0)
|
||||||
print('\nMemory status before sending a signal:\nMemA'
|
print('\nMemory status before sending a signal:\n MemAvailable'
|
||||||
'v: {} MiB, SwFree: {} MiB'.format(ma, sf))
|
': {} MiB, SwapFree: {} MiB'.format(ma, sf))
|
||||||
|
|
||||||
|
|
||||||
if stop_cont:
|
|
||||||
os.kill(int(pid), SIGCONT)
|
|
||||||
|
|
||||||
|
|
||||||
os.kill(int(pid), signal)
|
os.kill(int(pid), signal)
|
||||||
response_time = time() - time0
|
response_time = time() - time0
|
||||||
@ -909,10 +851,6 @@ def find_victim_and_send_signal(signal):
|
|||||||
key = 'victim badness < min_badness'
|
key = 'victim badness < min_badness'
|
||||||
update_stat_dict_and_print(key)
|
update_stat_dict_and_print(key)
|
||||||
|
|
||||||
|
|
||||||
if stop_cont:
|
|
||||||
cont(stopped_list)
|
|
||||||
|
|
||||||
sleep_after_send_signal(signal)
|
sleep_after_send_signal(signal)
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user