diff --git a/metrics/cgroups/blkio.go b/metrics/cgroups/blkio.go
new file mode 100644
index 000000000..cd9ca2963
--- /dev/null
+++ b/metrics/cgroups/blkio.go
@@ -0,0 +1,45 @@
+package cgroups
+
+import (
+	"github.com/containerd/cgroups"
+	metrics "github.com/docker/go-metrics"
+	"github.com/prometheus/client_golang/prometheus"
+)
+
+// blkioMetric builds a gauge over one list of blkio entries, labelled by op,
+// device, major and minor. entries is only called when stats.Blkio is
+// non-nil, so it may dereference it freely.
+func blkioMetric(name, help string, unit metrics.Unit, entries func(stats *cgroups.Stats) []cgroups.BlkioEntry) *metric {
+	return &metric{
+		name:   name,
+		help:   help,
+		unit:   unit,
+		vt:     prometheus.GaugeValue,
+		labels: []string{"op", "device", "major", "minor"},
+		getValues: func(stats *cgroups.Stats) []value {
+			if stats.Blkio == nil {
+				return nil
+			}
+			return blkioValues(entries(stats))
+		},
+	}
+}
+
+// blkioMetrics lists the recursive blkio statistics exported for a cgroup;
+// every metric yields one sample per element of the selected stats list.
+var blkioMetrics = []*metric{
+	blkioMetric("blkio_io_merged_recursive", "The blkio io merged recursive", metrics.Total,
+		func(stats *cgroups.Stats) []cgroups.BlkioEntry { return stats.Blkio.IoMergedRecursive }),
+	blkioMetric("blkio_io_queued_recursive", "The blkio io queued recursive", metrics.Total,
+		func(stats *cgroups.Stats) []cgroups.BlkioEntry { return stats.Blkio.IoQueuedRecursive }),
+	blkioMetric("blkio_io_service_bytes_recursive", "The blkio io service bytes recursive", metrics.Bytes,
+		func(stats *cgroups.Stats) []cgroups.BlkioEntry { return stats.Blkio.IoServiceBytesRecursive }),
+	blkioMetric("blkio_io_service_time_recursive", "The blkio io service time recursive", metrics.Total,
+		func(stats *cgroups.Stats) []cgroups.BlkioEntry { return stats.Blkio.IoServiceTimeRecursive }),
+	blkioMetric("blkio_io_serviced_recursive", "The blkio io serviced recursive", metrics.Total,
+		func(stats *cgroups.Stats) []cgroups.BlkioEntry { return stats.Blkio.IoServicedRecursive }),
+	blkioMetric("blkio_io_time_recursive", "The blkio io time recursive", metrics.Total,
+		func(stats *cgroups.Stats) []cgroups.BlkioEntry { return stats.Blkio.IoTimeRecursive }),
+	blkioMetric("blkio_sectors_recursive", "The blkio sectors recursive", metrics.Total,
+		func(stats *cgroups.Stats) []cgroups.BlkioEntry { return stats.Blkio.SectorsRecursive }),
+}
diff --git a/metrics/cgroups/cgroups.go b/metrics/cgroups/cgroups.go
index a2e33891c..2eff4dd28 100644
--- a/metrics/cgroups/cgroups.go
+++ b/metrics/cgroups/cgroups.go
@@ -7,7 +7,6 @@ import (
 	"time"
 
 	"github.com/containerd/cgroups"
-	"github.com/containerd/cgroups/prometheus"
 	"github.com/containerd/containerd/plugin"
 	metrics "github.com/docker/go-metrics"
 	"golang.org/x/net/context"
@@ -24,9 +23,9 @@ func init() {
 func New(ic *plugin.InitContext) (interface{}, error) {
 	var (
 		ns        = metrics.NewNamespace("container", "", nil)
-		collector = prometheus.New(ns)
+		collector = NewCollector(ns)
 	)
-	oom, err := prometheus.NewOOMCollector(ns)
+	oom, err := NewOOMCollector(ns)
 	if err != nil {
 		return nil, err
 	}
@@ -39,8 +38,8 @@ func New(ic *plugin.InitContext) (interface{}, error) {
 }
 
 type cgroupsMonitor struct {
-	collector *prometheus.Collector
-	oom       *prometheus.OOMCollector
+	collector *Collector
+	oom       *OOMCollector
 	context   context.Context
 	events    chan<- *plugin.Event
 }
diff --git a/metrics/cgroups/cpu.go b/metrics/cgroups/cpu.go
new file mode 100644
index 000000000..79e8de7e8
--- /dev/null
+++ b/metrics/cgroups/cpu.go
@@ -0,0 +1,64 @@
+package cgroups
+
+import (
+	"strconv"
+
+	"github.com/containerd/cgroups"
+	metrics "github.com/docker/go-metrics"
+	"github.com/prometheus/client_golang/prometheus"
+)
+
+// cpuMetric builds an unlabelled gauge that reports a single sample read
+// from the cpu stats. read is only called when stats.Cpu is non-nil.
+func cpuMetric(name, help string, unit metrics.Unit, read func(stats *cgroups.Stats) float64) *metric {
+	return &metric{
+		name: name,
+		help: help,
+		unit: unit,
+		vt:   prometheus.GaugeValue,
+		getValues: func(stats *cgroups.Stats) []value {
+			if stats.Cpu == nil {
+				return nil
+			}
+			return []value{{v: read(stats)}}
+		},
+	}
+}
+
+// cpuMetrics lists the cpu usage and throttling statistics exported for a
+// cgroup.
+var cpuMetrics = []*metric{
+	cpuMetric("cpu_total", "The total cpu time", metrics.Nanoseconds,
+		func(stats *cgroups.Stats) float64 { return float64(stats.Cpu.Usage.Total) }),
+	cpuMetric("cpu_kernel", "The total kernel cpu time", metrics.Nanoseconds,
+		func(stats *cgroups.Stats) float64 { return float64(stats.Cpu.Usage.Kernel) }),
+	cpuMetric("cpu_user", "The total user cpu time", metrics.Nanoseconds,
+		func(stats *cgroups.Stats) float64 { return float64(stats.Cpu.Usage.User) }),
+	{
+		name:   "per_cpu",
+		help:   "The total cpu time per cpu",
+		unit:   metrics.Nanoseconds,
+		vt:     prometheus.GaugeValue,
+		labels: []string{"cpu"},
+		getValues: func(stats *cgroups.Stats) []value {
+			if stats.Cpu == nil {
+				return nil
+			}
+			// One sample per element of PerCpu, labelled with its index.
+			var out []value
+			for i, v := range stats.Cpu.Usage.PerCpu {
+				out = append(out, value{
+					v: float64(v),
+					l: []string{strconv.Itoa(i)},
+				})
+			}
+			return out
+		},
+	},
+	cpuMetric("cpu_throttle_periods", "The total cpu throttle periods", metrics.Total,
+		func(stats *cgroups.Stats) float64 { return float64(stats.Cpu.Throttling.Periods) }),
+	cpuMetric("cpu_throttled_periods", "The total cpu throttled periods", metrics.Total,
+		func(stats *cgroups.Stats) float64 { return float64(stats.Cpu.Throttling.ThrottledPeriods) }),
+	cpuMetric("cpu_throttled_time", "The total cpu throttled time", metrics.Nanoseconds,
+		func(stats *cgroups.Stats) float64 { return float64(stats.Cpu.Throttling.ThrottledTime) }),
+}
diff --git a/metrics/cgroups/hugetlb.go b/metrics/cgroups/hugetlb.go
new file mode 100644
index 000000000..3540e1aa2
--- /dev/null
+++ b/metrics/cgroups/hugetlb.go
@@ -0,0 +1,43 @@
+package cgroups
+
+import (
+	"github.com/containerd/cgroups"
+	metrics "github.com/docker/go-metrics"
+	"github.com/prometheus/client_golang/prometheus"
+)
+
+// hugetlbMetric builds a gauge that reports one sample per entry of
+// stats.Hugetlb, labelled with the entry's key. read returns the sample for
+// the entry stored under page.
+func hugetlbMetric(name, help string, unit metrics.Unit, read func(stats *cgroups.Stats, page string) float64) *metric {
+	return &metric{
+		name:   name,
+		help:   help,
+		unit:   unit,
+		vt:     prometheus.GaugeValue,
+		labels: []string{"page"},
+		getValues: func(stats *cgroups.Stats) []value {
+			if stats.Hugetlb == nil {
+				return nil
+			}
+			var out []value
+			for page := range stats.Hugetlb {
+				out = append(out, value{
+					v: read(stats, page),
+					l: []string{page},
+				})
+			}
+			return out
+		},
+	}
+}
+
+// hugetlbMetrics lists the hugetlb statistics exported for a cgroup.
+var hugetlbMetrics = []*metric{
+	hugetlbMetric("hugetlb_usage", "The hugetlb usage", metrics.Bytes,
+		func(stats *cgroups.Stats, page string) float64 { return float64(stats.Hugetlb[page].Usage) }),
+	hugetlbMetric("hugetlb_failcnt", "The hugetlb failcnt", metrics.Total,
+		func(stats *cgroups.Stats, page string) float64 { return float64(stats.Hugetlb[page].Failcnt) }),
+	hugetlbMetric("hugetlb_max", "The hugetlb maximum usage", metrics.Bytes,
+		func(stats *cgroups.Stats, page string) float64 { return float64(stats.Hugetlb[page].Max) }),
+}
diff --git a/metrics/cgroups/memory.go
b/metrics/cgroups/memory.go
new file mode 100644
index 000000000..6780b1168
--- /dev/null
+++ b/metrics/cgroups/memory.go
@@ -0,0 +1,80 @@
+package cgroups
+
+import (
+	"github.com/containerd/cgroups"
+	metrics "github.com/docker/go-metrics"
+	"github.com/prometheus/client_golang/prometheus"
+)
+
+// memoryMetric builds an unlabelled gauge that reports a single sample read
+// from the memory stats. read is only called when stats.Memory is non-nil.
+func memoryMetric(name, help string, unit metrics.Unit, read func(s *cgroups.Stats) float64) *metric {
+	return &metric{
+		name: name,
+		help: help,
+		unit: unit,
+		vt:   prometheus.GaugeValue,
+		getValues: func(stats *cgroups.Stats) []value {
+			if stats.Memory == nil {
+				return nil
+			}
+			return []value{{v: read(stats)}}
+		},
+	}
+}
+
+// memoryMetrics lists the memory statistics exported for a cgroup.
+//
+// NOTE(review): the pgpgin/pgpgout/pgfault/pgmajfault entries are declared
+// with metrics.Bytes although their names suggest event counts; the unit is
+// kept because it is passed to the descriptor and may be part of the exported
+// series name — confirm before changing it.
+var memoryMetrics = []*metric{
+	memoryMetric("memory_cache", "The cache amount used", metrics.Bytes, func(s *cgroups.Stats) float64 { return float64(s.Memory.Cache) }),
+	memoryMetric("memory_rss", "The rss amount used", metrics.Bytes, func(s *cgroups.Stats) float64 { return float64(s.Memory.RSS) }),
+	memoryMetric("memory_rss_huge", "The rss_huge amount used", metrics.Bytes, func(s *cgroups.Stats) float64 { return float64(s.Memory.RSSHuge) }),
+	memoryMetric("memory_mapped_file", "The mapped_file amount used", metrics.Bytes, func(s *cgroups.Stats) float64 { return float64(s.Memory.MappedFile) }),
+	memoryMetric("memory_dirty", "The dirty amount", metrics.Bytes, func(s *cgroups.Stats) float64 { return float64(s.Memory.Dirty) }),
+	memoryMetric("memory_writeback", "The writeback amount", metrics.Bytes, func(s *cgroups.Stats) float64 { return float64(s.Memory.Writeback) }),
+	memoryMetric("memory_pgpgin", "The pgpgin amount", metrics.Bytes, func(s *cgroups.Stats) float64 { return float64(s.Memory.PgPgIn) }),
+	memoryMetric("memory_pgpgout", "The pgpgout amount", metrics.Bytes, func(s *cgroups.Stats) float64 { return float64(s.Memory.PgPgOut) }),
+	memoryMetric("memory_pgfault", "The pgfault amount", metrics.Bytes, func(s *cgroups.Stats) float64 { return float64(s.Memory.PgFault) }),
+	memoryMetric("memory_pgmajfault", "The pgmajfault amount", metrics.Bytes, func(s *cgroups.Stats) float64 { return float64(s.Memory.PgMajFault) }),
+	memoryMetric("memory_inactive_anon", "The inactive_anon amount", metrics.Bytes, func(s *cgroups.Stats) float64 { return float64(s.Memory.InactiveAnon) }),
+	memoryMetric("memory_active_anon", "The active_anon amount", metrics.Bytes, func(s *cgroups.Stats) float64 { return float64(s.Memory.ActiveAnon) }),
+	memoryMetric("memory_inactive_file", "The inactive_file amount", metrics.Bytes, func(s *cgroups.Stats) float64 { return float64(s.Memory.InactiveFile) }),
+	memoryMetric("memory_active_file", "The active_file amount", metrics.Bytes, func(s *cgroups.Stats) float64 { return float64(s.Memory.ActiveFile) }),
+	memoryMetric("memory_unevictable", "The unevictable amount", metrics.Bytes, func(s *cgroups.Stats) float64 { return float64(s.Memory.Unevictable) }),
+	memoryMetric("memory_hierarchical_memory_limit", "The hierarchical_memory_limit amount", metrics.Bytes, func(s *cgroups.Stats) float64 { return float64(s.Memory.HierarchicalMemoryLimit) }),
+	memoryMetric("memory_hierarchical_memsw_limit", "The hierarchical_memsw_limit amount", metrics.Bytes, func(s *cgroups.Stats) float64 { return float64(s.Memory.HierarchicalSwapLimit) }),
+	memoryMetric("memory_total_cache", "The total_cache amount used", metrics.Bytes, func(s *cgroups.Stats) float64 { return float64(s.Memory.TotalCache) }),
+	memoryMetric("memory_total_rss", "The total_rss amount used", metrics.Bytes, func(s *cgroups.Stats) float64 { return float64(s.Memory.TotalRSS) }),
+	memoryMetric("memory_total_rss_huge", "The total_rss_huge amount used", metrics.Bytes, func(s *cgroups.Stats) float64 { return float64(s.Memory.TotalRSSHuge) }),
+	memoryMetric("memory_total_mapped_file", "The total_mapped_file amount used", metrics.Bytes, func(s *cgroups.Stats) float64 { return float64(s.Memory.TotalMappedFile) }),
+	memoryMetric("memory_total_dirty", "The total_dirty amount", metrics.Bytes, func(s *cgroups.Stats) float64 { return float64(s.Memory.TotalDirty) }),
+	memoryMetric("memory_total_writeback", "The total_writeback amount", metrics.Bytes, func(s *cgroups.Stats) float64 { return float64(s.Memory.TotalWriteback) }),
+	memoryMetric("memory_total_pgpgin", "The total_pgpgin amount", metrics.Bytes, func(s *cgroups.Stats) float64 { return float64(s.Memory.TotalPgPgIn) }),
+	memoryMetric("memory_total_pgpgout", "The total_pgpgout amount", metrics.Bytes, func(s *cgroups.Stats) float64 { return float64(s.Memory.TotalPgPgOut) }),
+	memoryMetric("memory_total_pgfault", "The total_pgfault amount", metrics.Bytes, func(s *cgroups.Stats) float64 { return float64(s.Memory.TotalPgFault) }),
+	memoryMetric("memory_total_pgmajfault", "The total_pgmajfault amount", metrics.Bytes, func(s *cgroups.Stats) float64 { return float64(s.Memory.TotalPgMajFault) }),
+	memoryMetric("memory_total_inactive_anon", "The total_inactive_anon amount", metrics.Bytes, func(s *cgroups.Stats) float64 { return float64(s.Memory.TotalInactiveAnon) }),
+	memoryMetric("memory_total_active_anon", "The total_active_anon amount", metrics.Bytes, func(s *cgroups.Stats) float64 { return float64(s.Memory.TotalActiveAnon) }),
+	memoryMetric("memory_total_inactive_file", "The total_inactive_file amount", metrics.Bytes, func(s *cgroups.Stats) float64 { return float64(s.Memory.TotalInactiveFile) }),
+	memoryMetric("memory_total_active_file", "The total_active_file amount", metrics.Bytes, func(s *cgroups.Stats) float64 { return float64(s.Memory.TotalActiveFile) }),
+	memoryMetric("memory_total_unevictable", "The total_unevictable amount", metrics.Bytes, func(s *cgroups.Stats) float64 { return float64(s.Memory.TotalUnevictable) }),
+	memoryMetric("memory_usage_failcnt", "The usage failcnt", metrics.Total, func(s *cgroups.Stats) float64 { return float64(s.Memory.Usage.Failcnt) }),
+	memoryMetric("memory_usage_limit", "The memory limit", metrics.Bytes, func(s *cgroups.Stats) float64 { return float64(s.Memory.Usage.Limit) }),
+	memoryMetric("memory_usage_max", "The memory maximum usage", metrics.Bytes, func(s *cgroups.Stats) float64 { return float64(s.Memory.Usage.Max) }),
+	memoryMetric("memory_usage_usage", "The memory usage", metrics.Bytes, func(s *cgroups.Stats) float64 { return float64(s.Memory.Usage.Usage) }),
+	memoryMetric("memory_swap_failcnt", "The swap failcnt", metrics.Total, func(s *cgroups.Stats) float64 { return float64(s.Memory.Swap.Failcnt) }),
+	memoryMetric("memory_swap_limit", "The swap limit", metrics.Bytes, func(s *cgroups.Stats) float64 { return float64(s.Memory.Swap.Limit) }),
+	memoryMetric("memory_swap_max", "The swap maximum usage", metrics.Bytes, func(s *cgroups.Stats) float64 { return float64(s.Memory.Swap.Max) }),
+	memoryMetric("memory_swap_usage", "The swap usage", metrics.Bytes, func(s *cgroups.Stats) float64 { return float64(s.Memory.Swap.Usage) }),
+	memoryMetric("memory_kernel_failcnt", "The kernel failcnt", metrics.Total, func(s *cgroups.Stats) float64 { return float64(s.Memory.Kernel.Failcnt) }),
+	memoryMetric("memory_kernel_limit", "The kernel limit", metrics.Bytes, func(s *cgroups.Stats) float64 { return float64(s.Memory.Kernel.Limit) }),
+	memoryMetric("memory_kernel_max", "The kernel maximum usage", metrics.Bytes, func(s *cgroups.Stats) float64 { return float64(s.Memory.Kernel.Max) }),
+	memoryMetric("memory_kernel_usage", "The kernel usage", metrics.Bytes, func(s *cgroups.Stats) float64 { return float64(s.Memory.Kernel.Usage) }),
+	memoryMetric("memory_kerneltcp_failcnt", "The kerneltcp failcnt", metrics.Total, func(s *cgroups.Stats) float64 { return float64(s.Memory.KernelTCP.Failcnt) }),
+	memoryMetric("memory_kerneltcp_limit", "The kerneltcp limit", metrics.Bytes, func(s *cgroups.Stats) float64 { return float64(s.Memory.KernelTCP.Limit) }),
+	memoryMetric("memory_kerneltcp_max", "The kerneltcp maximum usage", metrics.Bytes, func(s *cgroups.Stats) float64 { return float64(s.Memory.KernelTCP.Max) }),
+	memoryMetric("memory_kerneltcp_usage", "The kerneltcp usage", metrics.Bytes, func(s *cgroups.Stats) float64 { return float64(s.Memory.KernelTCP.Usage) }),
+}
diff --git a/metrics/cgroups/metric.go b/metrics/cgroups/metric.go
new file mode 100644
index 000000000..1fc71e7cb
--- /dev/null
+++ b/metrics/cgroups/metric.go
@@ -0,0 +1,46 @@
+package cgroups
+
+import (
+	"github.com/containerd/cgroups"
+	metrics "github.com/docker/go-metrics"
+	"github.com/prometheus/client_golang/prometheus"
+)
+
+// value is one sample produced by a metric: the measurement and the values
+// for the metric's own labels.
+type value struct {
+	v float64
+	l []string
+}
+
+// metric describes one exported cgroup statistic and how to read its samples
+// from the cgroup stats.
+type metric struct {
+	name   string
+	help   string
+	unit   metrics.Unit
+	vt     prometheus.ValueType
+	labels []string
+	// getValues returns the value and labels for the data
+	getValues func(stats *cgroups.Stats) []value
+}
+
+// desc builds the prometheus descriptor for the metric in the given
+// namespace; the "id" label always precedes the metric's own labels.
+func (m *metric) desc(ns *metrics.Namespace) *prometheus.Desc {
+	return ns.NewDesc(m.name, m.help, m.unit, append([]string{"id"}, m.labels...)...)
+}
+
+// collect sends one const metric per sample of the metric to ch, labelled
+// with id followed by the sample's own label values.
+func (m *metric) collect(id string, stats *cgroups.Stats, ns *metrics.Namespace, ch chan<- prometheus.Metric) {
+	values := m.getValues(stats)
+	if len(values) == 0 {
+		return
+	}
+	// The descriptor is the same for every sample, so build it once.
+	desc := m.desc(ns)
+	for _, v := range values {
+		ch <- prometheus.MustNewConstMetric(desc, m.vt, v.v, append([]string{id}, v.l...)...)
+	}
+}
diff --git a/metrics/cgroups/metrics.go b/metrics/cgroups/metrics.go
new file mode 100644
index 000000000..f32980635
--- /dev/null
+++ b/metrics/cgroups/metrics.go
@@ -0,0 +1,129 @@
+package cgroups
+
+import (
+	"errors"
+	"strconv"
+	"sync"
+
+	"github.com/Sirupsen/logrus"
+	"github.com/containerd/cgroups"
+	metrics "github.com/docker/go-metrics"
+	"github.com/prometheus/client_golang/prometheus"
+)
+
+var (
+	// ErrAlreadyCollected is returned by Collector.Add when the id is
+	// already registered.
+	ErrAlreadyCollected = errors.New("cgroup is already being collected")
+
+	// ErrCgroupNotExists is returned by Collector.Get when the id is not
+	// registered.
+	ErrCgroupNotExists = errors.New("cgroup does not exist in the collector")
+)
+
+// Trigger will be called when an event happens and provides the cgroup
+// where the event originated from
+type Trigger func(string, cgroups.Cgroup)
+
+// NewCollector registers the Collector with the provided namespace and
+// returns it so that cgroups can be added for collection
+func NewCollector(ns *metrics.Namespace) *Collector {
+	c := &Collector{
+		ns:      ns,
+		cgroups: make(map[string]cgroups.Cgroup),
+	}
+	c.metrics = append(c.metrics, pidMetrics...)
+	c.metrics = append(c.metrics, cpuMetrics...)
+	c.metrics = append(c.metrics, memoryMetrics...)
+	c.metrics = append(c.metrics, hugetlbMetrics...)
+	c.metrics = append(c.metrics, blkioMetrics...)
+	ns.Add(c)
+	return c
+}
+
+// Collector provides the ability to collect container stats and export
+// them in the prometheus format
+type Collector struct {
+	mu sync.RWMutex
+
+	cgroups map[string]cgroups.Cgroup
+	ns      *metrics.Namespace
+	metrics []*metric
+}
+
+// Describe sends the descriptor of every known metric to ch.
+func (c *Collector) Describe(ch chan<- *prometheus.Desc) {
+	for _, m := range c.metrics {
+		ch <- m.desc(c.ns)
+	}
+}
+
+// Collect stats every registered cgroup in its own goroutine, sends the
+// resulting metrics to ch and returns once all goroutines are done.
+func (c *Collector) Collect(ch chan<- prometheus.Metric) {
+	c.mu.RLock()
+	wg := &sync.WaitGroup{}
+	for id, cg := range c.cgroups {
+		wg.Add(1)
+		go c.collect(id, cg, ch, wg)
+	}
+	c.mu.RUnlock()
+	wg.Wait()
+}
+
+// collect stats a single cgroup and sends all of its metrics to ch. A stat
+// error is logged and the cgroup is skipped for this collection.
+func (c *Collector) collect(id string, cg cgroups.Cgroup, ch chan<- prometheus.Metric, wg *sync.WaitGroup) {
+	defer wg.Done()
+	stats, err := cg.Stat(cgroups.IgnoreNotExist)
+	if err != nil {
+		logrus.WithError(err).Errorf("stat cgroup %s", id)
+		return
+	}
+	for _, m := range c.metrics {
+		m.collect(id, stats, c.ns, ch)
+	}
+}
+
+// Add adds the provided cgroup and id so that metrics are collected and exported
+func (c *Collector) Add(id string, cg cgroups.Cgroup) error {
+	c.mu.Lock()
+	defer c.mu.Unlock()
+	if _, ok := c.cgroups[id]; ok {
+		return ErrAlreadyCollected
+	}
+	c.cgroups[id] = cg
+	return nil
+}
+
+// Get returns the cgroup that is being collected under the provided id
+// returns ErrCgroupNotExists if the id is not being collected
+func (c *Collector) Get(id string) (cgroups.Cgroup, error) {
+	c.mu.RLock()
+	defer c.mu.RUnlock()
+	cg, ok := c.cgroups[id]
+	if !ok {
+		return nil, ErrCgroupNotExists
+	}
+	return cg, nil
+}
+
+// Remove removes the provided cgroup by id from the collector
+func (c *Collector) Remove(id string) {
+	c.mu.Lock()
+	defer c.mu.Unlock()
+	delete(c.cgroups, id)
+}
+
+// blkioValues converts blkio entries into samples whose label values are the
+// entry's op, device, major and minor, in that order.
+func blkioValues(l []cgroups.BlkioEntry) []value {
+	var out []value
+	for _, e := range l {
+		out = append(out, value{
+			v: float64(e.Value),
+			l: []string{e.Op, e.Device, strconv.FormatUint(e.Major, 10), strconv.FormatUint(e.Minor, 10)},
+		})
+	}
+	return out
+}
diff
--git a/metrics/cgroups/oom.go b/metrics/cgroups/oom.go
new file mode 100644
index 000000000..239723e5d
--- /dev/null
+++ b/metrics/cgroups/oom.go
@@ -0,0 +1,137 @@
+package cgroups
+
+import (
+	"sync"
+
+	"golang.org/x/sys/unix"
+
+	"github.com/Sirupsen/logrus"
+	"github.com/containerd/cgroups"
+	metrics "github.com/docker/go-metrics"
+)
+
+// NewOOMCollector creates an epoll instance, registers the memory_oom gauge
+// in ns and starts a goroutine that waits for oom events.
+func NewOOMCollector(ns *metrics.Namespace) (*OOMCollector, error) {
+	fd, err := unix.EpollCreate1(unix.EPOLL_CLOEXEC)
+	if err != nil {
+		return nil, err
+	}
+	c := &OOMCollector{
+		fd:        fd,
+		memoryOOM: ns.NewLabeledGauge("memory_oom", "The number of times a container received an oom event", metrics.Total, "id"),
+		set:       make(map[uintptr]*oom),
+	}
+	go c.start()
+	return c, nil
+}
+
+// OOMCollector counts the oom events of the registered cgroups and runs
+// their triggers when one is received.
+type OOMCollector struct {
+	mu sync.Mutex
+
+	memoryOOM metrics.LabeledGauge
+	fd        int
+	set       map[uintptr]*oom
+}
+
+// oom is the state kept for one registered oom event fd.
+type oom struct {
+	id       string
+	c        cgroups.Cgroup
+	triggers []Trigger
+}
+
+// Add registers the oom event fd of cg under id. Every oom event received
+// for it increments the memory_oom gauge and runs the triggers in order.
+//
+// NOTE(review): when EpollCtl fails the event fd is handed to nobody and is
+// not closed here; process closes the fds it tracks, so closing it on this
+// path looks right too — confirm who owns the fd returned by OOMEventFD.
+func (o *OOMCollector) Add(id string, cg cgroups.Cgroup, triggers ...Trigger) error {
+	o.mu.Lock()
+	defer o.mu.Unlock()
+	fd, err := cg.OOMEventFD()
+	if err != nil {
+		return err
+	}
+	event := unix.EpollEvent{
+		Fd:     int32(fd),
+		Events: unix.EPOLLHUP | unix.EPOLLIN | unix.EPOLLERR,
+	}
+	// Arm epoll before recording anything so that a failure leaves neither a
+	// stale set entry nor a gauge for an fd that is never watched. process
+	// needs o.mu to look the fd up, so it cannot observe the gap.
+	if err := unix.EpollCtl(o.fd, unix.EPOLL_CTL_ADD, int(fd), &event); err != nil {
+		return err
+	}
+	o.set[fd] = &oom{
+		id:       id,
+		c:        cg,
+		triggers: triggers,
+	}
+	// set the gauge's default value
+	o.memoryOOM.WithValues(id).Set(0)
+	return nil
+}
+
+// Close closes the epoll fd
+func (o *OOMCollector) Close() error {
+	return unix.Close(o.fd)
+}
+
+// start waits for events on the epoll fd and processes them one by one.
+//
+// NOTE(review): every EpollWait error except EINTR terminates the process
+// through logrus Fatal; presumably that includes the error reported once
+// Close has been called — confirm this is intended.
+func (o *OOMCollector) start() {
+	var events [128]unix.EpollEvent
+	for {
+		n, err := unix.EpollWait(o.fd, events[:], -1)
+		if err != nil {
+			if err == unix.EINTR {
+				continue
+			}
+			logrus.WithField("error", err).Fatal("cgroups: epoll wait")
+		}
+		for i := 0; i < n; i++ {
+			o.process(uintptr(events[i].Fd), events[i].Events)
+		}
+	}
+}
+
+// process handles one event for fd: it drains the fd, drops fds of deleted
+// cgroups from the set and otherwise counts the oom and runs the triggers.
+func (o *OOMCollector) process(fd uintptr, _ uint32) {
+	// make sure to always flush the fd; a failed read is ignored
+	flush(fd)
+
+	o.mu.Lock()
+	info, ok := o.set[fd]
+	o.mu.Unlock()
+	if !ok {
+		return
+	}
+	// if we received an event but it was caused by the cgroup being deleted and the fd
+	// being closed make sure we close our copy and remove the container from the set
+	if info.c.State() == cgroups.Deleted {
+		o.mu.Lock()
+		delete(o.set, fd)
+		o.mu.Unlock()
+		unix.Close(int(fd))
+		return
+	}
+	o.memoryOOM.WithValues(info.id).Inc(1)
+	for _, t := range info.triggers {
+		t(info.id, info.c)
+	}
+}
+
+// flush reads up to 8 bytes from fd and discards them.
+func flush(fd uintptr) error {
+	var buf [8]byte
+	_, err := unix.Read(int(fd), buf[:])
+	return err
+}
diff --git a/metrics/cgroups/pids.go b/metrics/cgroups/pids.go
new file mode 100644
index 000000000..3cb775404
--- /dev/null
+++ b/metrics/cgroups/pids.go
@@ -0,0 +1,37 @@
+package cgroups
+
+import (
+	"github.com/containerd/cgroups"
+	metrics "github.com/docker/go-metrics"
+	"github.com/prometheus/client_golang/prometheus"
+)
+
+// pidMetrics lists the pids statistics exported for a cgroup. Both entries
+// share the name "pids" and are told apart by their unit ("limit" and
+// "current").
+var pidMetrics = []*metric{
+	{
+		name: "pids",
+		help: "The limit to the number of pids allowed",
+		unit: metrics.Unit("limit"),
+		vt:   prometheus.GaugeValue,
+		getValues: func(stats *cgroups.Stats) []value {
+			if stats.Pids == nil {
+				return nil
+			}
+			return []value{{v: float64(stats.Pids.Limit)}}
+		},
+	},
+	{
+		name: "pids",
+		help: "The current number of pids",
+		unit: metrics.Unit("current"),
+		vt:   prometheus.GaugeValue,
+		getValues: func(stats *cgroups.Stats) []value {
+			if stats.Pids == nil {
+				return nil
+			}
+			return []value{{v: float64(stats.Pids.Current)}}
+		},
+	},
+}