@@ -23,13 +23,14 @@ import (
|
||||
|
||||
"github.com/containerd/cgroups"
|
||||
eventstypes "github.com/containerd/containerd/api/events"
|
||||
"github.com/containerd/containerd/errdefs"
|
||||
"github.com/containerd/containerd/events"
|
||||
"github.com/containerd/containerd/log"
|
||||
"github.com/containerd/containerd/namespaces"
|
||||
"github.com/containerd/containerd/platforms"
|
||||
"github.com/containerd/containerd/plugin"
|
||||
"github.com/containerd/containerd/runtime"
|
||||
"github.com/containerd/containerd/runtime/linux"
|
||||
"github.com/containerd/containerd/runtime/v1/linux"
|
||||
metrics "github.com/docker/go-metrics"
|
||||
"github.com/sirupsen/logrus"
|
||||
)
|
||||
@@ -80,16 +81,21 @@ type cgroupsMonitor struct {
|
||||
}
|
||||
|
||||
func (m *cgroupsMonitor) Monitor(c runtime.Task) error {
|
||||
info := c.Info()
|
||||
t := c.(*linux.Task)
|
||||
if err := m.collector.Add(c); err != nil {
|
||||
return err
|
||||
}
|
||||
t, ok := c.(*linux.Task)
|
||||
if !ok {
|
||||
return nil
|
||||
}
|
||||
cg, err := t.Cgroup()
|
||||
if err != nil {
|
||||
if errdefs.IsNotFound(err) {
|
||||
return nil
|
||||
}
|
||||
return err
|
||||
}
|
||||
if err := m.collector.Add(info.ID, info.Namespace, cg); err != nil {
|
||||
return err
|
||||
}
|
||||
err = m.oom.Add(info.ID, info.Namespace, cg, m.trigger)
|
||||
err = m.oom.Add(c.ID(), c.Namespace(), cg, m.trigger)
|
||||
if err == cgroups.ErrMemoryNotSupported {
|
||||
logrus.WithError(err).Warn("OOM monitoring failed")
|
||||
return nil
|
||||
@@ -98,17 +104,7 @@ func (m *cgroupsMonitor) Monitor(c runtime.Task) error {
|
||||
}
|
||||
|
||||
func (m *cgroupsMonitor) Stop(c runtime.Task) error {
|
||||
info := c.Info()
|
||||
t := c.(*linux.Task)
|
||||
|
||||
cgroup, err := t.Cgroup()
|
||||
if err != nil {
|
||||
log.G(m.context).WithError(err).Warnf("unable to retrieve cgroup on stop")
|
||||
} else {
|
||||
m.collector.collect(info.ID, info.Namespace, cgroup, m.collector.storedMetrics, false, nil)
|
||||
}
|
||||
|
||||
m.collector.Remove(info.ID, info.Namespace)
|
||||
m.collector.Remove(c)
|
||||
return nil
|
||||
}
|
||||
|
||||
|
||||
@@ -19,12 +19,16 @@
|
||||
package cgroups
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"sync"
|
||||
|
||||
"github.com/containerd/cgroups"
|
||||
"github.com/containerd/containerd/log"
|
||||
"github.com/containerd/containerd/namespaces"
|
||||
"github.com/containerd/containerd/runtime"
|
||||
"github.com/containerd/typeurl"
|
||||
metrics "github.com/docker/go-metrics"
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
)
|
||||
@@ -48,8 +52,8 @@ func newCollector(ns *metrics.Namespace) *collector {
|
||||
}
|
||||
// add machine cpus and memory info
|
||||
c := &collector{
|
||||
ns: ns,
|
||||
cgroups: make(map[string]*task),
|
||||
ns: ns,
|
||||
tasks: make(map[string]runtime.Task),
|
||||
}
|
||||
c.metrics = append(c.metrics, pidMetrics...)
|
||||
c.metrics = append(c.metrics, cpuMetrics...)
|
||||
@@ -61,12 +65,6 @@ func newCollector(ns *metrics.Namespace) *collector {
|
||||
return c
|
||||
}
|
||||
|
||||
type task struct {
|
||||
id string
|
||||
namespace string
|
||||
cgroup cgroups.Cgroup
|
||||
}
|
||||
|
||||
func taskID(id, namespace string) string {
|
||||
return fmt.Sprintf("%s-%s", id, namespace)
|
||||
}
|
||||
@@ -76,7 +74,7 @@ func taskID(id, namespace string) string {
|
||||
type collector struct {
|
||||
mu sync.RWMutex
|
||||
|
||||
cgroups map[string]*task
|
||||
tasks map[string]runtime.Task
|
||||
ns *metrics.Namespace
|
||||
metrics []*metric
|
||||
storedMetrics chan prometheus.Metric
|
||||
@@ -91,9 +89,9 @@ func (c *collector) Describe(ch chan<- *prometheus.Desc) {
|
||||
func (c *collector) Collect(ch chan<- prometheus.Metric) {
|
||||
c.mu.RLock()
|
||||
wg := &sync.WaitGroup{}
|
||||
for _, t := range c.cgroups {
|
||||
for _, t := range c.tasks {
|
||||
wg.Add(1)
|
||||
go c.collect(t.id, t.namespace, t.cgroup, ch, true, wg)
|
||||
go c.collect(t, ch, true, wg)
|
||||
}
|
||||
storedLoop:
|
||||
for {
|
||||
@@ -109,45 +107,52 @@ storedLoop:
|
||||
wg.Wait()
|
||||
}
|
||||
|
||||
func (c *collector) collect(id, namespace string, cg cgroups.Cgroup, ch chan<- prometheus.Metric, block bool, wg *sync.WaitGroup) {
|
||||
func (c *collector) collect(t runtime.Task, ch chan<- prometheus.Metric, block bool, wg *sync.WaitGroup) {
|
||||
if wg != nil {
|
||||
defer wg.Done()
|
||||
}
|
||||
|
||||
stats, err := cg.Stat(cgroups.IgnoreNotExist)
|
||||
ctx := namespaces.WithNamespace(context.Background(), t.Namespace())
|
||||
stats, err := t.Stats(ctx)
|
||||
if err != nil {
|
||||
log.L.WithError(err).Errorf("stat cgroup %s", id)
|
||||
log.L.WithError(err).Errorf("stat task %s", t.ID())
|
||||
return
|
||||
}
|
||||
data, err := typeurl.UnmarshalAny(stats)
|
||||
if err != nil {
|
||||
log.L.WithError(err).Errorf("unmarshal stats for %s", t.ID())
|
||||
return
|
||||
}
|
||||
s, ok := data.(*cgroups.Metrics)
|
||||
if !ok {
|
||||
log.L.WithError(err).Errorf("invalid metric type for %s", t.ID())
|
||||
return
|
||||
}
|
||||
for _, m := range c.metrics {
|
||||
m.collect(id, namespace, stats, c.ns, ch, block)
|
||||
m.collect(t.ID(), t.Namespace(), s, c.ns, ch, block)
|
||||
}
|
||||
}
|
||||
|
||||
// Add adds the provided cgroup and id so that metrics are collected and exported
|
||||
func (c *collector) Add(id, namespace string, cg cgroups.Cgroup) error {
|
||||
func (c *collector) Add(t runtime.Task) error {
|
||||
if c.ns == nil {
|
||||
return nil
|
||||
}
|
||||
c.mu.Lock()
|
||||
defer c.mu.Unlock()
|
||||
if _, ok := c.cgroups[taskID(id, namespace)]; ok {
|
||||
id := taskID(t.ID(), t.Namespace())
|
||||
if _, ok := c.tasks[id]; ok {
|
||||
return ErrAlreadyCollected
|
||||
}
|
||||
c.cgroups[taskID(id, namespace)] = &task{
|
||||
id: id,
|
||||
namespace: namespace,
|
||||
cgroup: cg,
|
||||
}
|
||||
c.tasks[id] = t
|
||||
return nil
|
||||
}
|
||||
|
||||
// Remove removes the provided cgroup by id from the collector
|
||||
func (c *collector) Remove(id, namespace string) {
|
||||
func (c *collector) Remove(t runtime.Task) {
|
||||
if c.ns == nil {
|
||||
return
|
||||
}
|
||||
c.mu.Lock()
|
||||
defer c.mu.Unlock()
|
||||
delete(c.cgroups, taskID(id, namespace))
|
||||
delete(c.tasks, taskID(t.ID(), t.Namespace()))
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user