resource-metrics: add pod/sandbox metrics to endpoint
Pod metrics may not be the same as the sum of their container metrics, since the pod sandbox itself also consumes resources. Add support for pod-specific metrics to allow for more accurate accounting of resources.

Signed-off-by: Eric Ernst <eric_ernst@apple.com>
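To illustrate the discrepancy the message describes, here is a minimal, self-contained Go sketch with hypothetical types (not the kubelet's summary API): the pod-level cgroup also covers the sandbox ("pause") container, so summing per-container usage undercounts the pod.

package main

import "fmt"

// containerStats and podStats are hypothetical stand-ins for illustration
// only; the kubelet's real types live in the stats summary API.
type containerStats struct {
	name                 string
	usageCoreNanoSeconds uint64
}

type podStats struct {
	// Read from the pod-level cgroup, which also covers the sandbox
	// ("pause") container and other pod-level overhead.
	usageCoreNanoSeconds uint64
	containers           []containerStats
}

func main() {
	pod := podStats{
		usageCoreNanoSeconds: 12500000000, // illustrative value
		containers: []containerStats{
			{name: "app", usageCoreNanoSeconds: 9000000000},
			{name: "sidecar", usageCoreNanoSeconds: 2500000000},
		},
	}

	var sum uint64
	for _, c := range pod.containers {
		sum += c.usageCoreNanoSeconds
	}

	// Prints: containers=11500000000 pod=12500000000 unaccounted=1000000000
	fmt.Printf("containers=%d pod=%d unaccounted=%d\n",
		sum, pod.usageCoreNanoSeconds, pod.usageCoreNanoSeconds-sum)
}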
@@ -54,7 +54,21 @@ var (
 		metrics.ALPHA,
 		"")
 
-	resouceScrapeResultDesc = metrics.NewDesc("scrape_error",
+	podCPUUsageDesc = metrics.NewDesc("pod_cpu_usage_seconds_total",
+		"Cumulative cpu time consumed by the pod in core-seconds",
+		[]string{"pod", "namespace"},
+		nil,
+		metrics.ALPHA,
+		"")
+
+	podMemoryUsageDesc = metrics.NewDesc("pod_memory_working_set_bytes",
+		"Current working set of the pod in bytes",
+		[]string{"pod", "namespace"},
+		nil,
+		metrics.ALPHA,
+		"")
+
+	resourceScrapeResultDesc = metrics.NewDesc("scrape_error",
 		"1 if there was an error while getting container metrics, 0 otherwise",
 		nil,
 		nil,
@@ -84,7 +98,9 @@ func (rc *resourceMetricsCollector) DescribeWithStability(ch chan<- *metrics.Des
 	ch <- nodeMemoryUsageDesc
 	ch <- containerCPUUsageDesc
 	ch <- containerMemoryUsageDesc
-	ch <- resouceScrapeResultDesc
+	ch <- podCPUUsageDesc
+	ch <- podMemoryUsageDesc
+	ch <- resourceScrapeResultDesc
 }
 
 // CollectWithStability implements metrics.StableCollector
@@ -94,7 +110,7 @@ func (rc *resourceMetricsCollector) DescribeWithStability(ch chan<- *metrics.Des
 func (rc *resourceMetricsCollector) CollectWithStability(ch chan<- metrics.Metric) {
 	var errorCount float64
 	defer func() {
-		ch <- metrics.NewLazyConstMetric(resouceScrapeResultDesc, metrics.GaugeValue, errorCount)
+		ch <- metrics.NewLazyConstMetric(resourceScrapeResultDesc, metrics.GaugeValue, errorCount)
 	}()
 	statsSummary, err := rc.provider.GetCPUAndMemoryStats()
 	if err != nil {
@@ -111,6 +127,8 @@ func (rc *resourceMetricsCollector) CollectWithStability(ch chan<- metrics.Metri
 			rc.collectContainerCPUMetrics(ch, pod, container)
 			rc.collectContainerMemoryMetrics(ch, pod, container)
 		}
+		rc.collectPodCPUMetrics(ch, pod)
+		rc.collectPodMemoryMetrics(ch, pod)
 	}
 }
 
@@ -151,3 +169,23 @@ func (rc *resourceMetricsCollector) collectContainerMemoryMetrics(ch chan<- metr
 		metrics.NewLazyConstMetric(containerMemoryUsageDesc, metrics.GaugeValue,
 			float64(*s.Memory.WorkingSetBytes), s.Name, pod.PodRef.Name, pod.PodRef.Namespace))
 }
+
+func (rc *resourceMetricsCollector) collectPodCPUMetrics(ch chan<- metrics.Metric, pod summary.PodStats) {
+	if pod.CPU == nil {
+		return
+	}
+
+	ch <- metrics.NewLazyMetricWithTimestamp(pod.CPU.Time.Time,
+		metrics.NewLazyConstMetric(podCPUUsageDesc, metrics.CounterValue,
+			float64(*pod.CPU.UsageCoreNanoSeconds)/float64(time.Second), pod.PodRef.Name, pod.PodRef.Namespace))
+}
+
+func (rc *resourceMetricsCollector) collectPodMemoryMetrics(ch chan<- metrics.Metric, pod summary.PodStats) {
+	if pod.Memory == nil {
+		return
+	}
+
+	ch <- metrics.NewLazyMetricWithTimestamp(pod.Memory.Time.Time,
+		metrics.NewLazyConstMetric(podMemoryUsageDesc, metrics.GaugeValue,
+			float64(*pod.Memory.WorkingSetBytes), pod.PodRef.Name, pod.PodRef.Namespace))
+}
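The unit conversion in the two new collectors is worth spelling out: UsageCoreNanoSeconds is cumulative core-nanoseconds, and dividing by float64(time.Second) (1e9 nanoseconds) yields the core-seconds the counter reports. A standalone sketch, using the same fixture value as the test hunk below:

package main

import (
	"fmt"
	"time"
)

func main() {
	// Same fixture value the test below feeds in.
	var usageCoreNanoSeconds uint64 = 10000000000

	// time.Duration counts nanoseconds, so float64(time.Second) == 1e9.
	coreSeconds := float64(usageCoreNanoSeconds) / float64(time.Second)

	// Prints 10, matching the expected pod_cpu_usage_seconds_total sample.
	fmt.Println(coreSeconds)
}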
@@ -51,6 +51,8 @@ func TestCollectResourceMetrics(t *testing.T) {
 		"node_memory_working_set_bytes",
 		"container_cpu_usage_seconds_total",
 		"container_memory_working_set_bytes",
+		"pod_cpu_usage_seconds_total",
+		"pod_memory_working_set_bytes",
 	}
 
 	tests := []struct {
@@ -168,6 +170,39 @@ func TestCollectResourceMetrics(t *testing.T) {
 			container_memory_working_set_bytes{container="container_b",namespace="namespace_a",pod="pod_a"} 1000 2000
 			`,
 		},
+		{
+			name: "arbitrary pod metrics",
+			summary: &statsapi.Summary{
+				Pods: []statsapi.PodStats{
+					{
+						PodRef: statsapi.PodReference{
+							Name:      "pod_a",
+							Namespace: "namespace_a",
+						},
+						CPU: &statsapi.CPUStats{
+							Time:                 testTime,
+							UsageCoreNanoSeconds: uint64Ptr(10000000000),
+						},
+						Memory: &statsapi.MemoryStats{
+							Time:            testTime,
+							WorkingSetBytes: uint64Ptr(1000),
+						},
+					},
+				},
+			},
+			summaryErr: nil,
+			expectedMetrics: `
+			# HELP scrape_error [ALPHA] 1 if there was an error while getting container metrics, 0 otherwise
+			# TYPE scrape_error gauge
+			scrape_error 0
+			# HELP pod_cpu_usage_seconds_total [ALPHA] Cumulative cpu time consumed by the pod in core-seconds
+			# TYPE pod_cpu_usage_seconds_total counter
+			pod_cpu_usage_seconds_total{namespace="namespace_a",pod="pod_a"} 10 2000
+			# HELP pod_memory_working_set_bytes [ALPHA] Current working set of the pod in bytes
+			# TYPE pod_memory_working_set_bytes gauge
+			pod_memory_working_set_bytes{namespace="namespace_a",pod="pod_a"} 1000 2000
+			`,
+		},
 	}
 
 	for _, test := range tests {
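For completeness, a rough sketch of scraping the new pod series once a kubelet serves them. The endpoint path (/metrics/resource/v1alpha1) and a plaintext local connection are assumptions for illustration only; a real kubelet typically serves HTTPS on port 10250 and requires authentication.

package main

import (
	"bufio"
	"fmt"
	"log"
	"net/http"
	"strings"
)

func main() {
	// Assumed local read-only endpoint, for illustration only.
	resp, err := http.Get("http://localhost:10255/metrics/resource/v1alpha1")
	if err != nil {
		log.Fatal(err)
	}
	defer resp.Body.Close()

	scanner := bufio.NewScanner(resp.Body)
	for scanner.Scan() {
		line := scanner.Text()
		// Keep only the pod-level series added by this change.
		if strings.HasPrefix(line, "pod_cpu_usage_seconds_total") ||
			strings.HasPrefix(line, "pod_memory_working_set_bytes") {
			fmt.Println(line)
		}
	}
	if err := scanner.Err(); err != nil {
		log.Fatal(err)
	}
}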