fix victim info: display ppid

This commit is contained in:
Alexey Avramov 2019-02-11 17:59:25 +09:00
parent f3d2e4e099
commit a1e7b25ad0
2 changed files with 31 additions and 93 deletions

View File

@ -193,7 +193,7 @@ Please create [issues](https://github.com/hakavlad/nohang/issues). Use cases, fe
- Fix: replace `re.fullmatch()` by `re.search()`
- Validation RE patterns at startup
- Improve output:
- Display `oom_score`, `oom_score_adj`, `euid`, `state`, `VmSize`, `RssAnon`, `RssFile`, `RssShmem`, `realpath` and `cmdline` of the victim in corrective action reports
- Display `oom_score`, `oom_score_adj`, `PPID`, `EUID`, `State`, `VmSize`, `RssAnon`, `RssFile`, `RssShmem`, `realpath` and `cmdline` of the victim in corrective action reports
- Print in terminal with colors
- Print statistics on corrective actions after each corrective action
- Improve poll rate algorithm

122
nohang
View File

@ -1,11 +1,11 @@
#!/usr/bin/env python3
"""A daemon that prevents OOM in Linux systems."""
import os
import ctypes
from ctypes import CDLL
from time import sleep, time
from operator import itemgetter
from sys import stdout
from signal import SIGKILL, SIGTERM, SIGSTOP, SIGCONT
from signal import SIGKILL, SIGTERM
start_time = time()
@ -40,11 +40,6 @@ HR = '~' * 79
# todo: make config option
print_total_stat = True
stop_cont = False
stop_cont_warn = False
##########################################################################
# define functions
@ -56,69 +51,26 @@ def mlockall():
MCL_FUTURE = 2
MCL_ONFAULT = 4
libc = ctypes.CDLL('libc.so.6', use_errno=True)
libc = CDLL('libc.so.6', use_errno=True)
result = libc.mlockall(MCL_CURRENT|MCL_FUTURE|MCL_ONFAULT)
result = libc.mlockall(
MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT
)
if result != 0:
result = libc.mlockall(MCL_CURRENT|MCL_FUTURE)
result = libc.mlockall(
MCL_CURRENT | MCL_FUTURE
)
if result != 0:
print('Can not lock all memory')
print('Cannot lock all memory')
else:
print('All memory locked with MCL_CURRENT|MCL_FUTURE')
print('All memory locked with MCL_CURRENT | MCL_FUTURE')
else:
print('All memory locked with MCL_CURRENT|MCL_FUTURE|MCL_ONFAULT')
print('All memory locked with MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT')
def pid_to_state(pid):
    """Return the single-character process state read from /proc/<pid>/stat.

    pid: str process pid
    """
    # rline1() is defined elsewhere in this file; presumably it returns the
    # first line of the given file -- confirm against its definition.
    stat_line = rline1('/proc/' + pid + '/stat')
    # Split on the LAST ')' so that a ')' inside the "(comm)" field cannot
    # shift the result; the remainder starts with a space followed by the
    # state character, hence index 1.
    _, _, after_comm = stat_line.rpartition(')')
    return after_comm[1]
def stop():
    """Send SIGSTOP to non-root processes whose oom_score is above 9.

    Walks the entries of /proc in reversed listing order, skipping every
    entry whose name does not start with a decimal digit, PID 1, and this
    daemon's own PID (``self_pid``, defined elsewhere in this file).

    Returns:
        list of str: PIDs that SIGSTOP was successfully sent to, suitable
        for passing to cont().
    """
    t1 = time()
    t2 = time()
    stopped_list = []

    for pid in os.listdir('/proc')[::-1]:
        # Only numeric directories, except /proc/1/ and our own process.
        # PIDs are compared by value: the previous `pid is '1'` tested
        # object identity, which is not guaranteed to hold for equal
        # strings and triggers a SyntaxWarning on Python 3.8+.
        if not pid[0].isdecimal() or pid == '1' or pid == self_pid:
            continue

        try:
            oom_score_r = int(rline1('/proc/' + pid + '/oom_score'))
            if oom_score_r > 9:
                uid_r = pid_to_uid(pid)
                # processes owned by root (uid '0') are never stopped
                if uid_r != '0':
                    print('Send SIGSTOP to {}, {}, {}...'.format(
                        pid, pid_to_name(pid), pid_to_cmdline(pid)[:40]))
                    os.kill(int(pid), SIGSTOP)
                    # Record the PID only once the signal has really been
                    # delivered, so the returned list never contains a
                    # process that vanished before it could be stopped.
                    stopped_list.append(pid)
                    t2 = time()
        except FileNotFoundError:
            # the /proc entry disappeared while it was being read
            continue
        except ProcessLookupError:
            # the process exited before the signal could be delivered
            continue

    print('Stop time:', t2 - t1)
    stdout.flush()

    return stopped_list
def cont(stopped_list):
    """Send SIGCONT to every PID in stopped_list and print the elapsed time.

    stopped_list: list of str PIDs, e.g. the value returned by stop()
    """
    print()
    print('Continue stopped processes...')

    started = time()

    # Iterating an empty list is a no-op, so no explicit length check.
    for stopped_pid in stopped_list:
        print('Send SIGCONT to', [stopped_pid], pid_to_name(stopped_pid))
        try:
            os.kill(int(stopped_pid), SIGCONT)
        except (FileNotFoundError, ProcessLookupError):
            # the process is already gone; move on to the next one
            continue

    finished = time()
    print('All cont time: ', finished - started)
def update_stat_dict_and_print(key):
@ -404,9 +356,6 @@ def send_notify_warn():
(implement Low memory warnings)
"""
if stop_cont_warn:
stopped_list = stop()
# find process with max badness
fat_tuple = fattest()
pid = fat_tuple[0]
@ -428,7 +377,7 @@ def send_notify_warn():
# title = 'Low memory: {}'.format(low_mem_percent)
title = 'Low memory'
body = 'Hog: <b>{}</b> [{}]'.format(
body = 'Hog: <b>{}</b>, PID: {}'.format(
name.replace(
# symbol '&' can break notifications in some themes,
# therefore it is replaced by '*'
@ -443,9 +392,6 @@ def send_notify_warn():
# send notification to user that runs this nohang
notify_send_wait(title, body)
if stop_cont_warn:
cont(stopped_list)
def send_notify(signal, name, pid):
"""
@ -456,7 +402,7 @@ def send_notify(signal, name, pid):
pid: str process pid
"""
title = 'Hang prevention'
body = '<b>{} {}</b> [{}]'.format(
body = '<b>{} {}</b>, PID: {}'.format(
notify_sig_dict[signal],
name.replace(
# symbol '&' can break notifications in some themes,
@ -614,11 +560,6 @@ def find_victim_and_send_signal(signal):
-> implement_corrective_action()
"""
if stop_cont:
stopped_list = stop()
pid, victim_badness = fattest()
name = pid_to_name(pid)
@ -633,8 +574,14 @@ def find_victim_and_send_signal(signal):
with open('/proc/' + pid + '/status') as f:
for n, line in enumerate(f):
if n is ppid_index:
ppid = line.split('\t')[1]
if n is uid_index:
uid = line.split('\t')[1]
uid = line.split('\t')[2]
continue
if n is vm_size_index:
@ -691,16 +638,13 @@ def find_victim_and_send_signal(signal):
for i in range(len(f_list)):
if i is ppid_index:
ppid = f_list[i].split('\t')[2]
ppid = f_list[i].split('\t')[1]
for i in range(len(f_list)):
if i is uid_index:
uid = f_list[i].split('\t')[2]
if i is vm_size_index:
vm_size = kib_to_mib(
int(f_list[i].split('\t')[1][:-3]))
@ -748,10 +692,10 @@ def find_victim_and_send_signal(signal):
len_vm = len(str(vm_size))
realpath = os.path.realpath('/proc/' + pid + '/exe')
state = pid_to_state(pid)
pname = pid_to_name(ppid.strip('\n '))
# print([ppid], [pname])
if detailed_rss:
@ -760,7 +704,8 @@ def find_victim_and_send_signal(signal):
'\n Name: \033[33m{}\033[0m' \
'\n State: \033[33m{}\033[0m' \
'\n PID: \033[33m{}\033[0m' \
'\n UID: \033[33m{}\033[0m' \
'\n PPID: \033[33m{}\033[0m (\033[33m{}\033[0m)' \
'\n EUID: \033[33m{}\033[0m' \
'\n badness: \033[33m{}\033[0m, ' \
'oom_score: \033[33m{}\033[0m, ' \
'oom_score_adj: \033[33m{}\033[0m' \
@ -775,6 +720,8 @@ def find_victim_and_send_signal(signal):
name,
state,
pid,
ppid.strip('\n '),
pname,
uid,
victim_badness,
oom_score,
@ -853,13 +800,8 @@ def find_victim_and_send_signal(signal):
m = check_mem_and_swap()
ma = round(int(m[0]) / 1024.0)
sf = round(int(m[2]) / 1024.0)
print('\nMemory status before sending a signal:\nMemA'
'v: {} MiB, SwFree: {} MiB'.format(ma, sf))
if stop_cont:
os.kill(int(pid), SIGCONT)
print('\nMemory status before sending a signal:\n MemAvailable'
': {} MiB, SwapFree: {} MiB'.format(ma, sf))
os.kill(int(pid), signal)
response_time = time() - time0
@ -909,10 +851,6 @@ def find_victim_and_send_signal(signal):
key = 'victim badness < min_badness'
update_stat_dict_and_print(key)
if stop_cont:
cont(stopped_list)
sleep_after_send_signal(signal)