From e800f08f9f56485ef4d0363159d1ffa973727f23 Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Thu, 17 Aug 2017 16:23:20 -0400 Subject: [PATCH] Change oom metric to const This removes the metric vec that was holding onto all task id and namespace combinations forever, until containerd was restarted. This was causing a memory leak with many task. This also removes the shim cmd where the `Args` is quite large from the reaper after the shim has been started cutting down on another leak. This is the first pass through the reaper but more code is required to fix all the issues when commands are added. Signed-off-by: Michael Crosby --- linux/shim/client.go | 1 + metrics/cgroups/cgroups.go | 1 + metrics/cgroups/oom.go | 47 ++++++++++++++++++++++++++++++-------- runtime/task.go | 1 - 4 files changed, 39 insertions(+), 11 deletions(-) diff --git a/linux/shim/client.go b/linux/shim/client.go index 2b5016fc5..f34403ceb 100644 --- a/linux/shim/client.go +++ b/linux/shim/client.go @@ -49,6 +49,7 @@ func WithStart(binary, address string, debug bool) ClientOpt { if err != nil { terminate(cmd) } + reaper.Default.Delete(cmd.Process.Pid) }() log.G(ctx).WithFields(logrus.Fields{ "pid": cmd.Process.Pid, diff --git a/metrics/cgroups/cgroups.go b/metrics/cgroups/cgroups.go index 5ef16c0c3..31d2f2148 100644 --- a/metrics/cgroups/cgroups.go +++ b/metrics/cgroups/cgroups.go @@ -65,6 +65,7 @@ func (m *cgroupsMonitor) Monitor(c runtime.Task) error { func (m *cgroupsMonitor) Stop(c runtime.Task) error { info := c.Info() m.collector.Remove(info.ID, info.Namespace) + m.oom.Remove(info.ID, info.Namespace) return nil } diff --git a/metrics/cgroups/oom.go b/metrics/cgroups/oom.go index 47d979f58..fd1845c1f 100644 --- a/metrics/cgroups/oom.go +++ b/metrics/cgroups/oom.go @@ -9,6 +9,7 @@ import ( "github.com/containerd/cgroups" metrics "github.com/docker/go-metrics" + "github.com/prometheus/client_golang/prometheus" "github.com/sirupsen/logrus" ) @@ -18,20 +19,21 @@ func NewOOMCollector(ns *metrics.Namespace) (*OOMCollector, error) { return nil, err } c := &OOMCollector{ - fd: fd, - memoryOOM: ns.NewLabeledGauge("memory_oom", "The number of times a container received an oom event", metrics.Total, "container_id", "namespace"), - set: make(map[uintptr]*oom), + fd: fd, + set: make(map[uintptr]*oom), + desc: ns.NewDesc("memory_oom", "The number of times a container has received an oom event", metrics.Total, "container_id", "namespace"), } go c.start() + ns.Add(c) return c, nil } type OOMCollector struct { mu sync.Mutex - memoryOOM metrics.LabeledGauge - fd int - set map[uintptr]*oom + fd int + set map[uintptr]*oom + desc *prometheus.Desc } type oom struct { @@ -39,6 +41,7 @@ type oom struct { namespace string c cgroups.Cgroup triggers []Trigger + count int } func (o *OOMCollector) Add(id, namespace string, cg cgroups.Cgroup, triggers ...Trigger) error { @@ -50,12 +53,10 @@ func (o *OOMCollector) Add(id, namespace string, cg cgroups.Cgroup, triggers ... } o.set[fd] = &oom{ id: id, + namespace: namespace, c: cg, triggers: triggers, - namespace: namespace, } - // set the gauge's default value - o.memoryOOM.WithValues(id, namespace).Set(0) event := unix.EpollEvent{ Fd: int32(fd), Events: unix.EPOLLHUP | unix.EPOLLIN | unix.EPOLLERR, @@ -66,11 +67,37 @@ func (o *OOMCollector) Add(id, namespace string, cg cgroups.Cgroup, triggers ... return nil } +func (o *OOMCollector) Remove(id, namespace string) { + o.mu.Lock() + defer o.mu.Unlock() + for fd, t := range o.set { + if t.id == id && t.namespace == namespace { + unix.Close(int(fd)) + delete(o.set, fd) + return + } + } +} + // Close closes the epoll fd func (o *OOMCollector) Close() error { return unix.Close(int(o.fd)) } +func (o *OOMCollector) Describe(ch chan<- *prometheus.Desc) { + o.mu.Lock() + defer o.mu.Unlock() + ch <- o.desc +} + +func (o *OOMCollector) Collect(ch chan<- prometheus.Metric) { + o.mu.Lock() + defer o.mu.Unlock() + for _, t := range o.set { + ch <- prometheus.MustNewConstMetric(o.desc, prometheus.GaugeValue, float64(t.count), t.id, t.namespace) + } +} + func (o *OOMCollector) start() { var events [128]unix.EpollEvent for { @@ -107,7 +134,7 @@ func (o *OOMCollector) process(fd uintptr, event uint32) { unix.Close(int(fd)) return } - o.memoryOOM.WithValues(info.id, info.namespace).Inc(1) + info.count++ for _, t := range info.triggers { t(info.id, info.c) } diff --git a/runtime/task.go b/runtime/task.go index 4a5685e10..62fa171e6 100644 --- a/runtime/task.go +++ b/runtime/task.go @@ -9,7 +9,6 @@ import ( type TaskInfo struct { ID string Runtime string - Spec []byte Namespace string }