Add namespace to container metrics
Signed-off-by: Michael Crosby <crosbymichael@gmail.com>
parent 85f61f6f51
commit 6ec84ef83c
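
This commit threads the containerd namespace through the cgroups metrics pipeline so exported series are labeled (and internally keyed) by both task ID and namespace instead of a single "namespace-id" string. A rough sketch of the updated registration path follows; it is illustrative only, written as if it sat next to the collector in its package, and the metrics namespace name, cgroup paths, and error handling are placeholders rather than code from this commit.

```go
package cgroups

import (
	"github.com/containerd/cgroups"
	metrics "github.com/docker/go-metrics"
)

// exampleRegister is a hypothetical helper showing the new two-argument keying.
func exampleRegister() error {
	// Illustrative metrics namespace; the real one is wired up by the plugin.
	ns := metrics.NewNamespace("container", "", nil)
	c := NewCollector(ns)

	// Hypothetical cgroup handles; the paths are placeholders.
	cgA, err := cgroups.Load(cgroups.V1, cgroups.StaticPath("/default/redis"))
	if err != nil {
		return err
	}
	cgB, err := cgroups.Load(cgroups.V1, cgroups.StaticPath("/test/redis"))
	if err != nil {
		return err
	}

	// The same task ID under two containerd namespaces no longer collides:
	// each pair is stored separately and exported with its own {id, namespace} labels.
	if err := c.Add("redis", "default", cgA); err != nil {
		return err
	}
	return c.Add("redis", "test", cgB)
}
```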
@@ -276,7 +276,7 @@ func TestContainerExec(t *testing.T) {
 		"exit 6",
 	}
 
-	process, err := task.Exec(ctx, &processSpec, empty())
+	process, err := task.Exec(ctx, processSpec, empty())
 	if err != nil {
 		t.Error(err)
 		return
@@ -400,21 +400,9 @@ func (r *Runtime) terminate(ctx context.Context, bundle *bundle, ns, id string)
 	if err != nil {
 		return err
 	}
-	if err := rt.Kill(ctx, id, int(unix.SIGKILL), &runc.KillOpts{All: true}); err != nil {
-		log.G(ctx).WithError(err).Warnf("kill all processes for %s", id)
-	}
-	// it can take a while for the container to be killed so poll for the container's status
-	// until it is in a stopped state
-	status := "running"
-	for status != "stopped" {
-		c, err := rt.State(ctx, id)
-		if err != nil {
-			break
-		}
-		status = c.Status
-		time.Sleep(50 * time.Millisecond)
-	}
-	if err := rt.Delete(ctx, id); err != nil {
+	if err := rt.Delete(ctx, id, &runc.DeleteOpts{
+		Force: true,
+	}); err != nil {
 		log.G(ctx).WithError(err).Warnf("delete runtime state %s", id)
 	}
 	if err := unix.Unmount(filepath.Join(bundle.path, "rootfs"), 0); err != nil {
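
The terminate hunk above drops the explicit SIGKILL plus poll-until-stopped loop in favor of runc's force delete, exposed by the updated go-runc client through DeleteOpts. A hedged sketch of what the new path relies on (comments are a reading of runc's `delete --force` behavior, not code from this commit):

```go
// Sketch of the replacement call, assuming rt is the *runc.Runc client used above.
// Force makes runc kill the container itself if it is still running and then remove
// its state, which is why the kill-and-poll loop became unnecessary.
if err := rt.Delete(ctx, id, &runc.DeleteOpts{Force: true}); err != nil {
	log.G(ctx).WithError(err).Warnf("delete runtime state %s", id)
}
```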
@@ -205,7 +205,7 @@ func (p *initProcess) Delete(context context.Context) error {
 	}
 	p.killAll(context)
 	p.Wait()
-	err = p.runc.Delete(context, p.id)
+	err = p.runc.Delete(context, p.id, nil)
 	if p.io != nil {
 		for _, c := range p.closers {
 			c.Close()
@@ -3,7 +3,6 @@
 package cgroups
 
 import (
-	"fmt"
 	"time"
 
 	"github.com/containerd/cgroups"
@@ -44,12 +43,8 @@ type cgroupsMonitor struct {
 	events chan<- *plugin.Event
 }
 
-func getID(t plugin.Task) string {
-	return fmt.Sprintf("%s-%s", t.Info().Namespace, t.Info().ID)
-}
-
 func (m *cgroupsMonitor) Monitor(c plugin.Task) error {
-	id := getID(c)
+	info := c.Info()
 	state, err := c.State(m.context)
 	if err != nil {
 		return err
@@ -58,14 +53,15 @@ func (m *cgroupsMonitor) Monitor(c plugin.Task) error {
 	if err != nil {
 		return err
 	}
-	if err := m.collector.Add(id, cg); err != nil {
+	if err := m.collector.Add(info.ID, info.Namespace, cg); err != nil {
 		return err
 	}
-	return m.oom.Add(id, cg, m.trigger)
+	return m.oom.Add(info.ID, info.Namespace, cg, m.trigger)
 }
 
 func (m *cgroupsMonitor) Stop(c plugin.Task) error {
-	m.collector.Remove(getID(c))
+	info := c.Info()
+	m.collector.Remove(info.ID, info.Namespace)
 	return nil
 }
 
|
@ -22,12 +22,13 @@ type metric struct {
|
||||
}
|
||||
|
||||
func (m *metric) desc(ns *metrics.Namespace) *prometheus.Desc {
|
||||
return ns.NewDesc(m.name, m.help, m.unit, append([]string{"id"}, m.labels...)...)
|
||||
// the namespace label is for containerd namespaces
|
||||
return ns.NewDesc(m.name, m.help, m.unit, append([]string{"id", "namespace"}, m.labels...)...)
|
||||
}
|
||||
|
||||
func (m *metric) collect(id string, stats *cgroups.Stats, ns *metrics.Namespace, ch chan<- prometheus.Metric) {
|
||||
func (m *metric) collect(id, namespace string, stats *cgroups.Stats, ns *metrics.Namespace, ch chan<- prometheus.Metric) {
|
||||
values := m.getValues(stats)
|
||||
for _, v := range values {
|
||||
ch <- prometheus.MustNewConstMetric(m.desc(ns), m.vt, v.v, append([]string{id}, v.l...)...)
|
||||
ch <- prometheus.MustNewConstMetric(m.desc(ns), m.vt, v.v, append([]string{id, namespace}, v.l...)...)
|
||||
}
|
||||
}
|
||||
|
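
A note on the hunk above: with prometheus const metrics, the label values handed to MustNewConstMetric must follow the exact order of the variable labels declared on the Desc, so the {"id", "namespace"} prefix added in desc() and the {id, namespace} prefix added in collect() have to stay in sync. A standalone sketch of that ordering contract, with illustrative names:

```go
package main

import "github.com/prometheus/client_golang/prometheus"

func main() {
	// Variable labels are declared on the Desc in a fixed order: id, namespace,
	// then any per-metric labels (mirroring what desc() builds above).
	desc := prometheus.NewDesc(
		"container_cpu_total", // illustrative name
		"illustrative help text",
		[]string{"id", "namespace", "cpu"},
		nil,
	)
	// The values must be supplied in that same order or the series is mislabeled.
	_ = prometheus.MustNewConstMetric(
		desc, prometheus.GaugeValue, 42,
		"redis", "default", "cpu0",
	)
}
```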
@@ -26,7 +26,7 @@ func NewCollector(ns *metrics.Namespace) *Collector {
 	// add machine cpus and memory info
 	c := &Collector{
 		ns:      ns,
-		cgroups: make(map[string]cgroups.Cgroup),
+		cgroups: make(map[string]*task),
 	}
 	c.metrics = append(c.metrics, pidMetrics...)
 	c.metrics = append(c.metrics, cpuMetrics...)
@@ -37,12 +37,18 @@ func NewCollector(ns *metrics.Namespace) *Collector {
 	return c
 }
 
+type task struct {
+	id        string
+	namespace string
+	cgroup    cgroups.Cgroup
+}
+
 // Collector provides the ability to collect container stats and export
 // them in the prometheus format
 type Collector struct {
 	mu sync.RWMutex
 
-	cgroups map[string]cgroups.Cgroup
+	cgroups map[string]*task
 	ns      *metrics.Namespace
 	metrics []*metric
 }
@@ -56,15 +62,15 @@ func (c *Collector) Describe(ch chan<- *prometheus.Desc) {
 func (c *Collector) Collect(ch chan<- prometheus.Metric) {
 	c.mu.RLock()
 	wg := &sync.WaitGroup{}
-	for id, cg := range c.cgroups {
+	for _, t := range c.cgroups {
 		wg.Add(1)
-		go c.collect(id, cg, ch, wg)
+		go c.collect(t.id, t.namespace, t.cgroup, ch, wg)
 	}
 	c.mu.RUnlock()
 	wg.Wait()
 }
 
-func (c *Collector) collect(id string, cg cgroups.Cgroup, ch chan<- prometheus.Metric, wg *sync.WaitGroup) {
+func (c *Collector) collect(id, namespace string, cg cgroups.Cgroup, ch chan<- prometheus.Metric, wg *sync.WaitGroup) {
 	defer wg.Done()
 	stats, err := cg.Stat(cgroups.IgnoreNotExist)
 	if err != nil {
@@ -72,38 +78,42 @@ func (c *Collector) collect(id string, cg cgroups.Cgroup, ch chan<- prometheus.Metric, wg *sync.WaitGroup) {
 		return
 	}
 	for _, m := range c.metrics {
-		m.collect(id, stats, c.ns, ch)
+		m.collect(id, namespace, stats, c.ns, ch)
 	}
 }
 
 // Add adds the provided cgroup and id so that metrics are collected and exported
-func (c *Collector) Add(id string, cg cgroups.Cgroup) error {
+func (c *Collector) Add(id, namespace string, cg cgroups.Cgroup) error {
 	c.mu.Lock()
 	defer c.mu.Unlock()
-	if _, ok := c.cgroups[id]; ok {
+	if _, ok := c.cgroups[id+namespace]; ok {
 		return ErrAlreadyCollected
 	}
-	c.cgroups[id] = cg
+	c.cgroups[id+namespace] = &task{
+		id:        id,
+		namespace: namespace,
+		cgroup:    cg,
+	}
 	return nil
 }
 
 // Get returns the cgroup that is being collected under the provided id
 // returns ErrCgroupNotExists if the id is not being collected
-func (c *Collector) Get(id string) (cgroups.Cgroup, error) {
+func (c *Collector) Get(id, namespace string) (cgroups.Cgroup, error) {
 	c.mu.Lock()
 	defer c.mu.Unlock()
-	cg, ok := c.cgroups[id]
+	t, ok := c.cgroups[id+namespace]
 	if !ok {
 		return nil, ErrCgroupNotExists
 	}
-	return cg, nil
+	return t.cgroup, nil
 }
 
 // Remove removes the provided cgroup by id from the collector
-func (c *Collector) Remove(id string) {
+func (c *Collector) Remove(id, namespace string) {
 	c.mu.Lock()
 	defer c.mu.Unlock()
-	delete(c.cgroups, id)
+	delete(c.cgroups, id+namespace)
 }
 
 func blkioValues(l []cgroups.BlkioEntry) []value {
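
The collector above now indexes entries by the plain concatenation id+namespace but keeps the original id and namespace on the stored task value, so the key is only used for lookup and never parsed back apart. An illustrative use of the namespace-aware lookups (IDs and namespace below are placeholders, and the helper is hypothetical):

```go
package cgroups

// exampleLookup is a hypothetical caller of the new two-argument API.
func exampleLookup(c *Collector) error {
	cg, err := c.Get("redis", "default")
	if err != nil {
		// ErrCgroupNotExists: this {id, namespace} pair was never added.
		return err
	}
	_ = cg // e.g. used for cg.Stat elsewhere

	// Removal is scoped to the same pair; removing ("redis", "test")
	// would leave the ("redis", "default") entry in place.
	c.Remove("redis", "default")
	return nil
}
```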
@@ -17,7 +17,7 @@ func NewOOMCollector(ns *metrics.Namespace) (*OOMCollector, error) {
 	}
 	c := &OOMCollector{
 		fd:        fd,
-		memoryOOM: ns.NewLabeledGauge("memory_oom", "The number of times a container received an oom event", metrics.Total, "id"),
+		memoryOOM: ns.NewLabeledGauge("memory_oom", "The number of times a container received an oom event", metrics.Total, "id", "namespace"),
 		set:       make(map[uintptr]*oom),
 	}
 	go c.start()
@@ -34,11 +34,12 @@ type OOMCollector struct {
 
 type oom struct {
 	id        string
+	namespace string
 	c         cgroups.Cgroup
 	triggers  []Trigger
 }
 
-func (o *OOMCollector) Add(id string, cg cgroups.Cgroup, triggers ...Trigger) error {
+func (o *OOMCollector) Add(id, namespace string, cg cgroups.Cgroup, triggers ...Trigger) error {
 	o.mu.Lock()
 	defer o.mu.Unlock()
 	fd, err := cg.OOMEventFD()
@@ -49,9 +50,10 @@ func (o *OOMCollector) Add(id string, cg cgroups.Cgroup, triggers ...Trigger) error {
 		id:        id,
 		c:         cg,
 		triggers:  triggers,
+		namespace: namespace,
 	}
 	// set the gauge's default value
-	o.memoryOOM.WithValues(id).Set(0)
+	o.memoryOOM.WithValues(id, namespace).Set(0)
 	event := unix.EpollEvent{
 		Fd:     int32(fd),
 		Events: unix.EPOLLHUP | unix.EPOLLIN | unix.EPOLLERR,
@@ -103,7 +105,7 @@ func (o *OOMCollector) process(fd uintptr, event uint32) {
 		unix.Close(int(fd))
 		return
 	}
-	o.memoryOOM.WithValues(info.id).Inc(1)
+	o.memoryOOM.WithValues(info.id, info.namespace).Inc(1)
 	for _, t := range info.triggers {
 		t(info.id, info.c)
 	}
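
Finally, on the OOM side: the memory_oom gauge is now initialised per (id, namespace) pair when a cgroup is registered and incremented per pair when its eventfd fires, so identically named tasks in different namespaces keep separate counts. A hedged registration sketch with the new signature (metrics namespace name and IDs are placeholders, and the helper is hypothetical):

```go
package cgroups

import (
	"github.com/containerd/cgroups"
	metrics "github.com/docker/go-metrics"
)

// exampleOOM is a hypothetical caller of the namespace-aware OOM collector.
func exampleOOM(cg cgroups.Cgroup) error {
	ns := metrics.NewNamespace("container", "", nil) // illustrative
	oc, err := NewOOMCollector(ns)
	if err != nil {
		return err
	}
	// memory_oom{id="redis", namespace="default"} starts at 0 and is
	// incremented each time this cgroup's OOM eventfd fires.
	return oc.Add("redis", "default", cg)
}
```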