Change oom metric to const
This removes the metric vec that was holding onto all task id and namespace combinations forever, until containerd was restarted. This was causing a memory leak with many task. This also removes the shim cmd where the `Args` is quite large from the reaper after the shim has been started cutting down on another leak. This is the first pass through the reaper but more code is required to fix all the issues when commands are added. Signed-off-by: Michael Crosby <crosbymichael@gmail.com>
This commit is contained in:
parent
89daacfe3f
commit
e800f08f9f
@ -49,6 +49,7 @@ func WithStart(binary, address string, debug bool) ClientOpt {
|
||||
if err != nil {
|
||||
terminate(cmd)
|
||||
}
|
||||
reaper.Default.Delete(cmd.Process.Pid)
|
||||
}()
|
||||
log.G(ctx).WithFields(logrus.Fields{
|
||||
"pid": cmd.Process.Pid,
|
||||
|
@ -65,6 +65,7 @@ func (m *cgroupsMonitor) Monitor(c runtime.Task) error {
|
||||
func (m *cgroupsMonitor) Stop(c runtime.Task) error {
|
||||
info := c.Info()
|
||||
m.collector.Remove(info.ID, info.Namespace)
|
||||
m.oom.Remove(info.ID, info.Namespace)
|
||||
return nil
|
||||
}
|
||||
|
||||
|
@ -9,6 +9,7 @@ import (
|
||||
|
||||
"github.com/containerd/cgroups"
|
||||
metrics "github.com/docker/go-metrics"
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
"github.com/sirupsen/logrus"
|
||||
)
|
||||
|
||||
@ -19,19 +20,20 @@ func NewOOMCollector(ns *metrics.Namespace) (*OOMCollector, error) {
|
||||
}
|
||||
c := &OOMCollector{
|
||||
fd: fd,
|
||||
memoryOOM: ns.NewLabeledGauge("memory_oom", "The number of times a container received an oom event", metrics.Total, "container_id", "namespace"),
|
||||
set: make(map[uintptr]*oom),
|
||||
desc: ns.NewDesc("memory_oom", "The number of times a container has received an oom event", metrics.Total, "container_id", "namespace"),
|
||||
}
|
||||
go c.start()
|
||||
ns.Add(c)
|
||||
return c, nil
|
||||
}
|
||||
|
||||
type OOMCollector struct {
|
||||
mu sync.Mutex
|
||||
|
||||
memoryOOM metrics.LabeledGauge
|
||||
fd int
|
||||
set map[uintptr]*oom
|
||||
desc *prometheus.Desc
|
||||
}
|
||||
|
||||
type oom struct {
|
||||
@ -39,6 +41,7 @@ type oom struct {
|
||||
namespace string
|
||||
c cgroups.Cgroup
|
||||
triggers []Trigger
|
||||
count int
|
||||
}
|
||||
|
||||
func (o *OOMCollector) Add(id, namespace string, cg cgroups.Cgroup, triggers ...Trigger) error {
|
||||
@ -50,12 +53,10 @@ func (o *OOMCollector) Add(id, namespace string, cg cgroups.Cgroup, triggers ...
|
||||
}
|
||||
o.set[fd] = &oom{
|
||||
id: id,
|
||||
namespace: namespace,
|
||||
c: cg,
|
||||
triggers: triggers,
|
||||
namespace: namespace,
|
||||
}
|
||||
// set the gauge's default value
|
||||
o.memoryOOM.WithValues(id, namespace).Set(0)
|
||||
event := unix.EpollEvent{
|
||||
Fd: int32(fd),
|
||||
Events: unix.EPOLLHUP | unix.EPOLLIN | unix.EPOLLERR,
|
||||
@ -66,11 +67,37 @@ func (o *OOMCollector) Add(id, namespace string, cg cgroups.Cgroup, triggers ...
|
||||
return nil
|
||||
}
|
||||
|
||||
func (o *OOMCollector) Remove(id, namespace string) {
|
||||
o.mu.Lock()
|
||||
defer o.mu.Unlock()
|
||||
for fd, t := range o.set {
|
||||
if t.id == id && t.namespace == namespace {
|
||||
unix.Close(int(fd))
|
||||
delete(o.set, fd)
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Close closes the epoll fd
|
||||
func (o *OOMCollector) Close() error {
|
||||
return unix.Close(int(o.fd))
|
||||
}
|
||||
|
||||
func (o *OOMCollector) Describe(ch chan<- *prometheus.Desc) {
|
||||
o.mu.Lock()
|
||||
defer o.mu.Unlock()
|
||||
ch <- o.desc
|
||||
}
|
||||
|
||||
func (o *OOMCollector) Collect(ch chan<- prometheus.Metric) {
|
||||
o.mu.Lock()
|
||||
defer o.mu.Unlock()
|
||||
for _, t := range o.set {
|
||||
ch <- prometheus.MustNewConstMetric(o.desc, prometheus.GaugeValue, float64(t.count), t.id, t.namespace)
|
||||
}
|
||||
}
|
||||
|
||||
func (o *OOMCollector) start() {
|
||||
var events [128]unix.EpollEvent
|
||||
for {
|
||||
@ -107,7 +134,7 @@ func (o *OOMCollector) process(fd uintptr, event uint32) {
|
||||
unix.Close(int(fd))
|
||||
return
|
||||
}
|
||||
o.memoryOOM.WithValues(info.id, info.namespace).Inc(1)
|
||||
info.count++
|
||||
for _, t := range info.triggers {
|
||||
t(info.id, info.c)
|
||||
}
|
||||
|
@ -9,7 +9,6 @@ import (
|
||||
type TaskInfo struct {
|
||||
ID string
|
||||
Runtime string
|
||||
Spec []byte
|
||||
Namespace string
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user