
The first pod with network latency impact user workloads, however, it is difficuly to understand where is the problem of this latency, since it depends on the CNI plugin to be ready at the moment of the pod creation. Add a new internal metric in the kubelet that allow developers and cluster administrator to understand the source of the latency problems on node startups. kubelet_first_network_pod_start_sli_duration_seconds Change-Id: I4cdb55b0df72c96a3a65b78ce2aae404c5195006
364 lines
14 KiB
Go
364 lines
14 KiB
Go
/*
|
|
Copyright 2022 The Kubernetes Authors.
|
|
|
|
Licensed under the Apache License, Version 2.0 (the "License");
|
|
you may not use this file except in compliance with the License.
|
|
You may obtain a copy of the License at
|
|
|
|
http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
Unless required by applicable law or agreed to in writing, software
|
|
distributed under the License is distributed on an "AS IS" BASIS,
|
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
See the License for the specific language governing permissions and
|
|
limitations under the License.
|
|
*/
|
|
|
|
package util
|
|
|
|
import (
|
|
"strings"
|
|
"testing"
|
|
"time"
|
|
|
|
"github.com/stretchr/testify/assert"
|
|
corev1 "k8s.io/api/core/v1"
|
|
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
|
"k8s.io/apimachinery/pkg/types"
|
|
|
|
"k8s.io/component-base/metrics/testutil"
|
|
"k8s.io/kubernetes/pkg/kubelet/metrics"
|
|
testingclock "k8s.io/utils/clock/testing"
|
|
)
|
|
|
|
var frozenTime = time.Date(2020, 1, 1, 0, 0, 0, 0, time.UTC)
|
|
|
|
const (
|
|
uid = "3c1df8a9-11a8-4791-aeae-184c18cca686"
|
|
metricsName = "kubelet_pod_start_sli_duration_seconds"
|
|
)
|
|
|
|
func TestNoEvents(t *testing.T) {
|
|
t.Run("metrics registered; no incoming events", func(t *testing.T) {
|
|
|
|
// expects no metrics in the output
|
|
wants := ""
|
|
|
|
metrics.Register()
|
|
|
|
tracker := &basicPodStartupLatencyTracker{
|
|
pods: map[types.UID]*perPodState{},
|
|
}
|
|
|
|
if err := testutil.GatherAndCompare(metrics.GetGather(), strings.NewReader(wants), metricsName); err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
|
|
assert.Empty(t, tracker.pods)
|
|
metrics.PodStartSLIDuration.Reset()
|
|
})
|
|
}
|
|
|
|
func TestPodsRunningBeforeKubeletStarted(t *testing.T) {
|
|
t.Run("pod was running for 10m before kubelet started", func(t *testing.T) {
|
|
|
|
// expects no metrics in the output
|
|
wants := ""
|
|
|
|
metrics.Register()
|
|
|
|
tracker := &basicPodStartupLatencyTracker{
|
|
pods: map[types.UID]*perPodState{},
|
|
}
|
|
|
|
if err := testutil.GatherAndCompare(metrics.GetGather(), strings.NewReader(wants), metricsName); err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
|
|
podStarted := &corev1.Pod{
|
|
Status: corev1.PodStatus{
|
|
StartTime: &metav1.Time{Time: frozenTime.Add(-10 * time.Minute)},
|
|
},
|
|
}
|
|
tracker.ObservedPodOnWatch(podStarted, frozenTime)
|
|
|
|
assert.Empty(t, tracker.pods)
|
|
metrics.PodStartSLIDuration.Reset()
|
|
})
|
|
}
|
|
|
|
func TestSinglePodOneImageDownloadRecorded(t *testing.T) {
|
|
|
|
t.Run("single pod; started in 3s, image pulling 100ms", func(t *testing.T) {
|
|
|
|
wants := `
|
|
# HELP kubelet_pod_start_sli_duration_seconds [ALPHA] Duration in seconds to start a pod, excluding time to pull images and run init containers, measured from pod creation timestamp to when all its containers are reported as started and observed via watch
|
|
# TYPE kubelet_pod_start_sli_duration_seconds histogram
|
|
kubelet_pod_start_sli_duration_seconds_bucket{le="0.5"} 0
|
|
kubelet_pod_start_sli_duration_seconds_bucket{le="1"} 0
|
|
kubelet_pod_start_sli_duration_seconds_bucket{le="2"} 0
|
|
kubelet_pod_start_sli_duration_seconds_bucket{le="3"} 1
|
|
kubelet_pod_start_sli_duration_seconds_bucket{le="4"} 1
|
|
kubelet_pod_start_sli_duration_seconds_bucket{le="5"} 1
|
|
kubelet_pod_start_sli_duration_seconds_bucket{le="6"} 1
|
|
kubelet_pod_start_sli_duration_seconds_bucket{le="8"} 1
|
|
kubelet_pod_start_sli_duration_seconds_bucket{le="10"} 1
|
|
kubelet_pod_start_sli_duration_seconds_bucket{le="20"} 1
|
|
kubelet_pod_start_sli_duration_seconds_bucket{le="30"} 1
|
|
kubelet_pod_start_sli_duration_seconds_bucket{le="45"} 1
|
|
kubelet_pod_start_sli_duration_seconds_bucket{le="60"} 1
|
|
kubelet_pod_start_sli_duration_seconds_bucket{le="120"} 1
|
|
kubelet_pod_start_sli_duration_seconds_bucket{le="180"} 1
|
|
kubelet_pod_start_sli_duration_seconds_bucket{le="240"} 1
|
|
kubelet_pod_start_sli_duration_seconds_bucket{le="300"} 1
|
|
kubelet_pod_start_sli_duration_seconds_bucket{le="360"} 1
|
|
kubelet_pod_start_sli_duration_seconds_bucket{le="480"} 1
|
|
kubelet_pod_start_sli_duration_seconds_bucket{le="600"} 1
|
|
kubelet_pod_start_sli_duration_seconds_bucket{le="900"} 1
|
|
kubelet_pod_start_sli_duration_seconds_bucket{le="1200"} 1
|
|
kubelet_pod_start_sli_duration_seconds_bucket{le="1800"} 1
|
|
kubelet_pod_start_sli_duration_seconds_bucket{le="2700"} 1
|
|
kubelet_pod_start_sli_duration_seconds_bucket{le="3600"} 1
|
|
kubelet_pod_start_sli_duration_seconds_bucket{le="+Inf"} 1
|
|
kubelet_pod_start_sli_duration_seconds_sum 2.9
|
|
kubelet_pod_start_sli_duration_seconds_count 1
|
|
`
|
|
|
|
fakeClock := testingclock.NewFakeClock(frozenTime)
|
|
|
|
metrics.Register()
|
|
|
|
tracker := &basicPodStartupLatencyTracker{
|
|
pods: map[types.UID]*perPodState{},
|
|
clock: fakeClock,
|
|
}
|
|
|
|
podInit := buildInitializingPod()
|
|
tracker.ObservedPodOnWatch(podInit, frozenTime)
|
|
|
|
// image pulling took 100ms
|
|
tracker.RecordImageStartedPulling(podInit.UID)
|
|
fakeClock.Step(time.Millisecond * 100)
|
|
tracker.RecordImageFinishedPulling(podInit.UID)
|
|
|
|
podState, ok := tracker.pods[podInit.UID]
|
|
if !ok {
|
|
t.Errorf("expected to track pod: %s, but pod not found", podInit.UID)
|
|
}
|
|
if !podState.lastFinishedPulling.Equal(podState.firstStartedPulling.Add(time.Millisecond * 100)) {
|
|
t.Errorf("expected pod firstStartedPulling: %s and lastFinishedPulling: %s but got firstStartedPulling: %s and lastFinishedPulling: %s",
|
|
podState.firstStartedPulling, podState.firstStartedPulling.Add(time.Millisecond*100), podState.firstStartedPulling, podState.lastFinishedPulling)
|
|
}
|
|
|
|
podStarted := buildRunningPod()
|
|
tracker.RecordStatusUpdated(podStarted)
|
|
|
|
// 3s later, observe the same pod on watch
|
|
tracker.ObservedPodOnWatch(podStarted, frozenTime.Add(time.Second*3))
|
|
|
|
if err := testutil.GatherAndCompare(metrics.GetGather(), strings.NewReader(wants), metricsName); err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
|
|
// cleanup
|
|
tracker.DeletePodStartupState(podStarted.UID)
|
|
|
|
assert.Empty(t, tracker.pods)
|
|
metrics.PodStartSLIDuration.Reset()
|
|
})
|
|
}
|
|
|
|
func TestSinglePodMultipleDownloadsAndRestartsRecorded(t *testing.T) {
|
|
|
|
t.Run("single pod; started in 30s, image pulling between 10th and 20th seconds", func(t *testing.T) {
|
|
|
|
wants := `
|
|
# HELP kubelet_pod_start_sli_duration_seconds [ALPHA] Duration in seconds to start a pod, excluding time to pull images and run init containers, measured from pod creation timestamp to when all its containers are reported as started and observed via watch
|
|
# TYPE kubelet_pod_start_sli_duration_seconds histogram
|
|
kubelet_pod_start_sli_duration_seconds_bucket{le="0.5"} 0
|
|
kubelet_pod_start_sli_duration_seconds_bucket{le="1"} 0
|
|
kubelet_pod_start_sli_duration_seconds_bucket{le="2"} 0
|
|
kubelet_pod_start_sli_duration_seconds_bucket{le="3"} 0
|
|
kubelet_pod_start_sli_duration_seconds_bucket{le="4"} 0
|
|
kubelet_pod_start_sli_duration_seconds_bucket{le="5"} 0
|
|
kubelet_pod_start_sli_duration_seconds_bucket{le="6"} 0
|
|
kubelet_pod_start_sli_duration_seconds_bucket{le="8"} 0
|
|
kubelet_pod_start_sli_duration_seconds_bucket{le="10"} 0
|
|
kubelet_pod_start_sli_duration_seconds_bucket{le="20"} 1
|
|
kubelet_pod_start_sli_duration_seconds_bucket{le="30"} 1
|
|
kubelet_pod_start_sli_duration_seconds_bucket{le="45"} 1
|
|
kubelet_pod_start_sli_duration_seconds_bucket{le="60"} 1
|
|
kubelet_pod_start_sli_duration_seconds_bucket{le="120"} 1
|
|
kubelet_pod_start_sli_duration_seconds_bucket{le="180"} 1
|
|
kubelet_pod_start_sli_duration_seconds_bucket{le="240"} 1
|
|
kubelet_pod_start_sli_duration_seconds_bucket{le="300"} 1
|
|
kubelet_pod_start_sli_duration_seconds_bucket{le="360"} 1
|
|
kubelet_pod_start_sli_duration_seconds_bucket{le="480"} 1
|
|
kubelet_pod_start_sli_duration_seconds_bucket{le="600"} 1
|
|
kubelet_pod_start_sli_duration_seconds_bucket{le="900"} 1
|
|
kubelet_pod_start_sli_duration_seconds_bucket{le="1200"} 1
|
|
kubelet_pod_start_sli_duration_seconds_bucket{le="1800"} 1
|
|
kubelet_pod_start_sli_duration_seconds_bucket{le="2700"} 1
|
|
kubelet_pod_start_sli_duration_seconds_bucket{le="3600"} 1
|
|
kubelet_pod_start_sli_duration_seconds_bucket{le="+Inf"} 1
|
|
kubelet_pod_start_sli_duration_seconds_sum 20
|
|
kubelet_pod_start_sli_duration_seconds_count 1
|
|
`
|
|
|
|
fakeClock := testingclock.NewFakeClock(frozenTime)
|
|
|
|
metrics.Register()
|
|
|
|
tracker := &basicPodStartupLatencyTracker{
|
|
pods: map[types.UID]*perPodState{},
|
|
clock: fakeClock,
|
|
}
|
|
|
|
podInitializing := buildInitializingPod()
|
|
tracker.ObservedPodOnWatch(podInitializing, frozenTime)
|
|
|
|
// image pulling started at 10s and the last one finished at 30s
|
|
// first image starts pulling at 10s
|
|
fakeClock.SetTime(frozenTime.Add(time.Second * 10))
|
|
tracker.RecordImageStartedPulling(podInitializing.UID)
|
|
// second image starts pulling at 11s
|
|
fakeClock.SetTime(frozenTime.Add(time.Second * 11))
|
|
tracker.RecordImageStartedPulling(podInitializing.UID)
|
|
// third image starts pulling at 14s
|
|
fakeClock.SetTime(frozenTime.Add(time.Second * 14))
|
|
tracker.RecordImageStartedPulling(podInitializing.UID)
|
|
// first image finished pulling at 18s
|
|
fakeClock.SetTime(frozenTime.Add(time.Second * 18))
|
|
tracker.RecordImageFinishedPulling(podInitializing.UID)
|
|
// second and third finished pulling at 20s
|
|
fakeClock.SetTime(frozenTime.Add(time.Second * 20))
|
|
tracker.RecordImageFinishedPulling(podInitializing.UID)
|
|
|
|
podState, ok := tracker.pods[podInitializing.UID]
|
|
if !ok {
|
|
t.Errorf("expected to track pod: %s, but pod not found", podInitializing.UID)
|
|
}
|
|
if !podState.firstStartedPulling.Equal(frozenTime.Add(time.Second * 10)) { // second and third image start pulling should not affect pod firstStartedPulling
|
|
t.Errorf("expected pod firstStartedPulling: %s but got firstStartedPulling: %s",
|
|
podState.firstStartedPulling.Add(time.Second*10), podState.firstStartedPulling)
|
|
}
|
|
if !podState.lastFinishedPulling.Equal(frozenTime.Add(time.Second * 20)) { // should be updated when the pod's last image finished pulling
|
|
t.Errorf("expected pod lastFinishedPulling: %s but got lastFinishedPulling: %s",
|
|
podState.lastFinishedPulling.Add(time.Second*20), podState.lastFinishedPulling)
|
|
}
|
|
|
|
// pod started
|
|
podStarted := buildRunningPod()
|
|
tracker.RecordStatusUpdated(podStarted)
|
|
|
|
// at 30s observe the same pod on watch
|
|
tracker.ObservedPodOnWatch(podStarted, frozenTime.Add(time.Second*30))
|
|
|
|
if err := testutil.GatherAndCompare(metrics.GetGather(), strings.NewReader(wants), metricsName); err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
|
|
// any new pod observations should not impact the metrics, as the pod should be recorder only once
|
|
tracker.ObservedPodOnWatch(podStarted, frozenTime.Add(time.Second*150))
|
|
tracker.ObservedPodOnWatch(podStarted, frozenTime.Add(time.Second*200))
|
|
tracker.ObservedPodOnWatch(podStarted, frozenTime.Add(time.Second*250))
|
|
|
|
if err := testutil.GatherAndCompare(metrics.GetGather(), strings.NewReader(wants), metricsName); err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
|
|
// cleanup
|
|
tracker.DeletePodStartupState(podStarted.UID)
|
|
|
|
assert.Empty(t, tracker.pods)
|
|
metrics.PodStartSLIDuration.Reset()
|
|
})
|
|
}
|
|
|
|
func TestFirstNetworkPodMetrics(t *testing.T) {
|
|
|
|
t.Run("first network pod; started in 30s, image pulling between 10th and 20th seconds", func(t *testing.T) {
|
|
|
|
wants := `
|
|
# HELP kubelet_first_network_pod_start_sli_duration_seconds [INTERNAL] Duration in seconds to start the first network pod, excluding time to pull images and run init containers, measured from pod creation timestamp to when all its containers are reported as started and observed via watch
|
|
# TYPE kubelet_first_network_pod_start_sli_duration_seconds gauge
|
|
kubelet_first_network_pod_start_sli_duration_seconds 30
|
|
`
|
|
|
|
fakeClock := testingclock.NewFakeClock(frozenTime)
|
|
|
|
metrics.Register()
|
|
|
|
tracker := &basicPodStartupLatencyTracker{
|
|
pods: map[types.UID]*perPodState{},
|
|
clock: fakeClock,
|
|
}
|
|
|
|
// hostNetwork pods should not be tracked
|
|
hostNetworkPodInitializing := buildInitializingPod()
|
|
hostNetworkPodInitializing.UID = "11111-22222"
|
|
hostNetworkPodInitializing.Spec.HostNetwork = true
|
|
tracker.ObservedPodOnWatch(hostNetworkPodInitializing, frozenTime)
|
|
|
|
hostNetworkPodStarted := buildRunningPod()
|
|
hostNetworkPodStarted.UID = "11111-22222"
|
|
hostNetworkPodStarted.Spec.HostNetwork = true
|
|
tracker.RecordStatusUpdated(hostNetworkPodStarted)
|
|
|
|
// track only the first pod with network
|
|
podInitializing := buildInitializingPod()
|
|
tracker.ObservedPodOnWatch(podInitializing, frozenTime)
|
|
|
|
// pod started
|
|
podStarted := buildRunningPod()
|
|
tracker.RecordStatusUpdated(podStarted)
|
|
|
|
// at 30s observe the same pod on watch
|
|
tracker.ObservedPodOnWatch(podStarted, frozenTime.Add(time.Second*30))
|
|
|
|
if err := testutil.GatherAndCompare(metrics.GetGather(), strings.NewReader(wants), "kubelet_first_network_pod_start_sli_duration_seconds", "kubelet_first_network_pod_start_total_duration_seconds"); err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
|
|
// any new pod observations should not impact the metrics, as the pod should be recorder only once
|
|
tracker.ObservedPodOnWatch(podStarted, frozenTime.Add(time.Second*150))
|
|
tracker.ObservedPodOnWatch(podStarted, frozenTime.Add(time.Second*200))
|
|
tracker.ObservedPodOnWatch(podStarted, frozenTime.Add(time.Second*250))
|
|
|
|
if err := testutil.GatherAndCompare(metrics.GetGather(), strings.NewReader(wants), "kubelet_first_network_pod_start_sli_duration_seconds", "kubelet_first_network_pod_start_total_duration_seconds"); err != nil {
|
|
t.Fatal(err)
|
|
}
|
|
|
|
// cleanup
|
|
tracker.DeletePodStartupState(hostNetworkPodStarted.UID)
|
|
tracker.DeletePodStartupState(podStarted.UID)
|
|
|
|
assert.Empty(t, tracker.pods)
|
|
metrics.PodStartSLIDuration.Reset()
|
|
})
|
|
}
|
|
|
|
func buildInitializingPod() *corev1.Pod {
|
|
return buildPodWithStatus([]corev1.ContainerStatus{
|
|
{State: corev1.ContainerState{Waiting: &corev1.ContainerStateWaiting{Reason: "PodInitializing"}}},
|
|
})
|
|
}
|
|
|
|
func buildRunningPod() *corev1.Pod {
|
|
return buildPodWithStatus([]corev1.ContainerStatus{
|
|
{State: corev1.ContainerState{Running: &corev1.ContainerStateRunning{StartedAt: metav1.NewTime(frozenTime)}}},
|
|
})
|
|
}
|
|
|
|
func buildPodWithStatus(cs []corev1.ContainerStatus) *corev1.Pod {
|
|
return &corev1.Pod{
|
|
ObjectMeta: metav1.ObjectMeta{
|
|
UID: types.UID(uid),
|
|
CreationTimestamp: metav1.NewTime(frozenTime),
|
|
},
|
|
Status: corev1.PodStatus{
|
|
ContainerStatuses: cs,
|
|
},
|
|
}
|
|
}
|