/*
Copyright 2015 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package node

import (
	"context"
	"fmt"
	"net"
	"net/url"
	"time"

	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/fields"
	"k8s.io/apimachinery/pkg/util/intstr"
	"k8s.io/apimachinery/pkg/util/uuid"
	podutil "k8s.io/kubernetes/pkg/api/v1/pod"
	"k8s.io/kubernetes/pkg/kubelet/events"
	"k8s.io/kubernetes/test/e2e/framework"
	e2eevents "k8s.io/kubernetes/test/e2e/framework/events"
	e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
	testutils "k8s.io/kubernetes/test/utils"
	imageutils "k8s.io/kubernetes/test/utils/image"
	admissionapi "k8s.io/pod-security-admission/api"

	"github.com/onsi/ginkgo"
	"github.com/onsi/gomega"
)

const (
	probeTestInitialDelaySeconds = 15

	defaultObservationTimeout = time.Minute * 4
)

var _ = SIGDescribe("Probing container", func() {
	f := framework.NewDefaultFramework("container-probe")
	f.NamespacePodSecurityEnforceLevel = admissionapi.LevelBaseline
	var podClient *framework.PodClient
	probe := webserverProbeBuilder{}

	ginkgo.BeforeEach(func() {
		podClient = f.PodClient()
	})

	/*
		Release: v1.9
		Testname: Pod readiness probe, with initial delay
		Description: Create a Pod that is configured with an initial delay set on the readiness probe. Check the Pod Start time to compare to the initial delay. The Pod MUST be ready only after the specified initial delay.
	*/
	framework.ConformanceIt("with readiness probe should not be ready before initial delay and never restart [NodeConformance]", func() {
		containerName := "test-webserver"
		p := podClient.Create(testWebServerPodSpec(probe.withInitialDelay().build(), nil, containerName, 80))
		framework.ExpectNoError(e2epod.WaitTimeoutForPodReadyInNamespace(f.ClientSet, p.Name, f.Namespace.Name, framework.PodStartTimeout))

		p, err := podClient.Get(context.TODO(), p.Name, metav1.GetOptions{})
		framework.ExpectNoError(err)
		isReady, err := testutils.PodRunningReady(p)
		framework.ExpectNoError(err)
		if !isReady {
			framework.Failf("pod %s/%s should be ready", f.Namespace.Name, p.Name)
		}

		// We assume the pod became ready when the container became ready. This
		// is true for a single container pod.
		readyTime, err := GetTransitionTimeForReadyCondition(p)
		framework.ExpectNoError(err)
		startedTime, err := GetContainerStartedTime(p, containerName)
		framework.ExpectNoError(err)

		framework.Logf("Container started at %v, pod became ready at %v", startedTime, readyTime)
		initialDelay := probeTestInitialDelaySeconds * time.Second
		if readyTime.Sub(startedTime) < initialDelay {
			framework.Failf("Pod became ready before its %v initial delay", initialDelay)
		}

		restartCount := getRestartCount(p)
		framework.ExpectEqual(restartCount, 0, "pod should have a restart count of 0 but got %v", restartCount)
	})

	/*
		Release: v1.9
		Testname: Pod readiness probe, failure
		Description: Create a Pod with a readiness probe that fails consistently. When this Pod is created,
		then the Pod MUST never be ready, never be running and restart count MUST be zero.
	*/
	framework.ConformanceIt("with readiness probe that fails should never be ready and never restart [NodeConformance]", func() {
		p := podClient.Create(testWebServerPodSpec(probe.withFailing().build(), nil, "test-webserver", 80))
		gomega.Consistently(func() (bool, error) {
			p, err := podClient.Get(context.TODO(), p.Name, metav1.GetOptions{})
			if err != nil {
				return false, err
			}
			return podutil.IsPodReady(p), nil
		}, 1*time.Minute, 1*time.Second).ShouldNot(gomega.BeTrue(), "pod should not be ready")

		p, err := podClient.Get(context.TODO(), p.Name, metav1.GetOptions{})
		framework.ExpectNoError(err)

		isReady, _ := testutils.PodRunningReady(p)
		if isReady {
			framework.Failf("pod %s/%s should not be ready", f.Namespace.Name, p.Name)
		}

		restartCount := getRestartCount(p)
		framework.ExpectEqual(restartCount, 0, "pod should have a restart count of 0 but got %v", restartCount)
	})

	/*
		Release: v1.9
		Testname: Pod liveness probe, using local file, restart
		Description: Create a Pod with a liveness probe that uses an ExecAction handler to cat the /tmp/health file. The Container deletes the file /tmp/health after 10 seconds, triggering the liveness probe to fail. The Pod MUST then be killed and restarted, incrementing the restart count to 1.
	*/
	framework.ConformanceIt("should be restarted with an exec \"cat /tmp/health\" liveness probe [NodeConformance]", func() {
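		// The container creates /tmp/health, then deletes it after 10 seconds,
		// so the "cat /tmp/health" liveness probe below begins to fail.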
		cmd := []string{"/bin/sh", "-c", "echo ok >/tmp/health; sleep 10; rm -rf /tmp/health; sleep 600"}
		livenessProbe := &v1.Probe{
			ProbeHandler:        execHandler([]string{"cat", "/tmp/health"}),
			InitialDelaySeconds: 15,
			TimeoutSeconds:      5, // default 1s can be pretty aggressive in CI environments with low resources
			FailureThreshold:    1,
		}
		pod := busyBoxPodSpec(nil, livenessProbe, cmd)
		RunLivenessTest(f, pod, 1, defaultObservationTimeout)
	})

	/*
		Release: v1.9
		Testname: Pod liveness probe, using local file, no restart
		Description: Pod is created with a liveness probe that uses an 'exec' command to cat the /tmp/health file. The liveness probe MUST NOT fail and the restart count MUST remain 0.
	*/
	framework.ConformanceIt("should *not* be restarted with an exec \"cat /tmp/health\" liveness probe [NodeConformance]", func() {
		cmd := []string{"/bin/sh", "-c", "echo ok >/tmp/health; sleep 600"}
		livenessProbe := &v1.Probe{
			ProbeHandler:        execHandler([]string{"cat", "/tmp/health"}),
			InitialDelaySeconds: 15,
			TimeoutSeconds:      5, // default 1s can be pretty aggressive in CI environments with low resources
			FailureThreshold:    1,
		}
		pod := busyBoxPodSpec(nil, livenessProbe, cmd)
		RunLivenessTest(f, pod, 0, defaultObservationTimeout)
	})

	/*
		Release: v1.9
		Testname: Pod liveness probe, using http endpoint, restart
		Description: A Pod is created with a liveness probe on http endpoint /healthz. The http handler on /healthz returns an http error 10 seconds after the Pod is started. This MUST result in a liveness check failure. The Pod MUST then be killed and restarted, incrementing the restart count to 1.
	*/
	framework.ConformanceIt("should be restarted with a /healthz http liveness probe [NodeConformance]", func() {
		livenessProbe := &v1.Probe{
			ProbeHandler:        httpGetHandler("/healthz", 8080),
			InitialDelaySeconds: 15,
			FailureThreshold:    1,
		}
		pod := livenessPodSpec(f.Namespace.Name, nil, livenessProbe)
		RunLivenessTest(f, pod, 1, defaultObservationTimeout)
	})

	/*
		Release: v1.18
		Testname: Pod liveness probe, using tcp socket, no restart
		Description: A Pod is created with a liveness probe on tcp socket 8080. The http handler on port 8080 returns http errors after 10 seconds, but the socket remains open. The liveness probe MUST NOT fail and the restart count MUST remain 0.
	*/
	framework.ConformanceIt("should *not* be restarted with a tcp:8080 liveness probe [NodeConformance]", func() {
		livenessProbe := &v1.Probe{
			ProbeHandler:        tcpSocketHandler(8080),
			InitialDelaySeconds: 15,
			FailureThreshold:    1,
		}
		pod := livenessPodSpec(f.Namespace.Name, nil, livenessProbe)
		RunLivenessTest(f, pod, 0, defaultObservationTimeout)
	})

	/*
		Release: v1.9
		Testname: Pod liveness probe, using http endpoint, multiple restarts (slow)
		Description: A Pod is created with a liveness probe on http endpoint /healthz. The http handler on /healthz returns an http error 10 seconds after the Pod is started. This MUST result in a liveness check failure. The Pod MUST then be killed and restarted, incrementing the restart count to 1. After the restart, the /healthz handler again returns an http error after 10 seconds, so the liveness probe MUST fail again. The restart count MUST increment every time the health check fails, measured up to 5 restarts.
	*/
	framework.ConformanceIt("should have monotonically increasing restart count [NodeConformance]", func() {
		livenessProbe := &v1.Probe{
			ProbeHandler:        httpGetHandler("/healthz", 8080),
			InitialDelaySeconds: 5,
			FailureThreshold:    1,
		}
		pod := livenessPodSpec(f.Namespace.Name, nil, livenessProbe)
		// ~2 minutes backoff timeouts + 4 minutes defaultObservationTimeout + 2 minutes for each pod restart
		RunLivenessTest(f, pod, 5, 2*time.Minute+defaultObservationTimeout+4*2*time.Minute)
	})

	/*
		Release: v1.9
		Testname: Pod liveness probe, using http endpoint, no restart
		Description: A Pod is created with a liveness probe on http endpoint '/'. The liveness probe on this endpoint will not fail. When the liveness probe does not fail, the restart count MUST remain zero.
	*/
	framework.ConformanceIt("should *not* be restarted with a /healthz http liveness probe [NodeConformance]", func() {
		livenessProbe := &v1.Probe{
			ProbeHandler:        httpGetHandler("/", 80),
			InitialDelaySeconds: 15,
			TimeoutSeconds:      5,
			FailureThreshold:    5, // to accommodate nodes which are slow in bringing up containers.
		}
		pod := testWebServerPodSpec(nil, livenessProbe, "test-webserver", 80)
		RunLivenessTest(f, pod, 0, defaultObservationTimeout)
	})

	/*
		Release: v1.20
		Testname: Pod liveness probe, container exec timeout, restart
		Description: A Pod is created with a liveness probe that uses an Exec action on the Pod. If the liveness probe call does not return within the specified timeout, the liveness probe MUST fail and the Pod MUST be restarted.
	*/
	ginkgo.It("should be restarted with an exec liveness probe with timeout [MinimumKubeletVersion:1.20] [NodeConformance]", func() {
		cmd := []string{"/bin/sh", "-c", "sleep 600"}
		livenessProbe := &v1.Probe{
			ProbeHandler:        execHandler([]string{"/bin/sh", "-c", "sleep 10"}),
			InitialDelaySeconds: 15,
			TimeoutSeconds:      1,
			FailureThreshold:    1,
		}
		pod := busyBoxPodSpec(nil, livenessProbe, cmd)
		RunLivenessTest(f, pod, 1, defaultObservationTimeout)
	})

	/*
		Release: v1.20
		Testname: Pod readiness probe, container exec timeout, not ready
		Description: A Pod is created with a readiness probe that uses an Exec action on the Pod. If the readiness probe call does not return within the specified timeout, the readiness probe MUST fail and the Pod MUST NOT be ready.
	*/
	ginkgo.It("should not be ready with an exec readiness probe timeout [MinimumKubeletVersion:1.20] [NodeConformance]", func() {
		cmd := []string{"/bin/sh", "-c", "sleep 600"}
		readinessProbe := &v1.Probe{
			ProbeHandler:        execHandler([]string{"/bin/sh", "-c", "sleep 10"}),
			InitialDelaySeconds: 15,
			TimeoutSeconds:      1,
			FailureThreshold:    1,
		}
		pod := busyBoxPodSpec(readinessProbe, nil, cmd)
		runReadinessFailTest(f, pod, time.Minute)
	})

	/*
		Release: v1.21
		Testname: Pod liveness probe, container exec timeout, restart
		Description: A Pod is created with a liveness probe that uses an Exec action on the Pod. If the liveness probe call does not return within the specified timeout, the liveness probe MUST restart the Pod. When the ExecProbeTimeout feature gate is disabled and the cluster uses dockershim, the timeout is ignored, BUT a failing liveness probe MUST still restart the Pod.
	*/
	ginkgo.It("should be restarted with a failing exec liveness probe that took longer than the timeout", func() {
		cmd := []string{"/bin/sh", "-c", "sleep 600"}
		livenessProbe := &v1.Probe{
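			// "sleep 10 & exit 1" backgrounds the sleep and exits non-zero right away,
			// so the probe reports failure whether or not the 1s timeout is enforced.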
			ProbeHandler:        execHandler([]string{"/bin/sh", "-c", "sleep 10 & exit 1"}),
			InitialDelaySeconds: 15,
			TimeoutSeconds:      1,
			FailureThreshold:    1,
		}
		pod := busyBoxPodSpec(nil, livenessProbe, cmd)
		RunLivenessTest(f, pod, 1, defaultObservationTimeout)
	})

	/*
		Release: v1.14
		Testname: Pod http liveness probe, redirected to a local address
		Description: A Pod is created with a liveness probe on http endpoint /redirect?loc=healthz. The handler on /redirect redirects to the /healthz endpoint, which returns an http error 10 seconds after the Pod is started. This MUST result in a liveness check failure. The Pod MUST then be killed and restarted, incrementing the restart count to 1.
	*/
	ginkgo.It("should be restarted with a local redirect http liveness probe", func() {
		livenessProbe := &v1.Probe{
			ProbeHandler:        httpGetHandler("/redirect?loc="+url.QueryEscape("/healthz"), 8080),
			InitialDelaySeconds: 15,
			FailureThreshold:    1,
		}
		pod := livenessPodSpec(f.Namespace.Name, nil, livenessProbe)
		RunLivenessTest(f, pod, 1, defaultObservationTimeout)
	})

	/*
		Release: v1.14
		Testname: Pod http liveness probe, redirected to a non-local address
		Description: A Pod is created with a liveness probe on http endpoint /redirect, which redirects to http://0.0.0.0/. The probe MUST NOT follow the non-local redirect; instead it MUST treat the response as a success and generate a probe warning event.
	*/
	ginkgo.It("should *not* be restarted with a non-local redirect http liveness probe", func() {
		livenessProbe := &v1.Probe{
			ProbeHandler:        httpGetHandler("/redirect?loc="+url.QueryEscape("http://0.0.0.0/"), 8080),
			InitialDelaySeconds: 15,
			FailureThreshold:    1,
		}
		pod := livenessPodSpec(f.Namespace.Name, nil, livenessProbe)
		RunLivenessTest(f, pod, 0, defaultObservationTimeout)
		// Expect an event of type "ProbeWarning".
		expectedEvent := fields.Set{
			"involvedObject.kind":      "Pod",
			"involvedObject.name":      pod.Name,
			"involvedObject.namespace": f.Namespace.Name,
			"reason":                   events.ContainerProbeWarning,
		}.AsSelector().String()
		framework.ExpectNoError(e2eevents.WaitTimeoutForEvent(
			f.ClientSet, f.Namespace.Name, expectedEvent, "Probe terminated redirects, Response body: <a href=\"http://0.0.0.0/\">Found</a>.", framework.PodEventTimeout))
	})

	/*
		Release: v1.16
		Testname: Pod startup probe restart
		Description: A Pod is created with a failing startup probe. The Pod MUST be killed and restarted, incrementing the restart count to 1, even though the liveness probe would succeed.
	*/
	ginkgo.It("should be restarted when the startup probe fails", func() {
		cmd := []string{"/bin/sh", "-c", "sleep 600"}
		livenessProbe := &v1.Probe{
			ProbeHandler: v1.ProbeHandler{
				Exec: &v1.ExecAction{
					Command: []string{"/bin/true"},
				},
			},
			InitialDelaySeconds: 15,
			FailureThreshold:    1,
		}
		startupProbe := &v1.Probe{
			ProbeHandler: v1.ProbeHandler{
				Exec: &v1.ExecAction{
					Command: []string{"/bin/false"},
				},
			},
			InitialDelaySeconds: 15,
			FailureThreshold:    3,
		}
		pod := startupPodSpec(startupProbe, nil, livenessProbe, cmd)
		RunLivenessTest(f, pod, 1, defaultObservationTimeout)
	})

	/*
		Release: v1.16
		Testname: Pod liveness probe delayed (long) by startup probe
		Description: A Pod is created with failing liveness and startup probes. Liveness probe MUST NOT fail until startup probe expires.
	*/
	ginkgo.It("should *not* be restarted by liveness probe because startup probe delays it", func() {
		cmd := []string{"/bin/sh", "-c", "sleep 600"}
		livenessProbe := &v1.Probe{
			ProbeHandler: v1.ProbeHandler{
				Exec: &v1.ExecAction{
					Command: []string{"/bin/false"},
				},
			},
			InitialDelaySeconds: 15,
			FailureThreshold:    1,
		}
		startupProbe := &v1.Probe{
			ProbeHandler: v1.ProbeHandler{
				Exec: &v1.ExecAction{
					Command: []string{"/bin/false"},
				},
			},
			InitialDelaySeconds: 15,
			FailureThreshold:    60,
		}
		pod := startupPodSpec(startupProbe, nil, livenessProbe, cmd)
		RunLivenessTest(f, pod, 0, defaultObservationTimeout)
	})

	/*
		Release: v1.16
		Testname: Pod liveness probe fails after startup success
		Description: A Pod is created with a failing liveness probe and a delayed startup probe that uses an 'exec' command to cat the /tmp/startup file. The Container creates /tmp/startup after 10 seconds, so the startup probe succeeds and the failing liveness probe takes effect. The Pod MUST then be killed and restarted, incrementing the restart count to 1.
	*/
	ginkgo.It("should be restarted by liveness probe after startup probe enables it", func() {
		cmd := []string{"/bin/sh", "-c", "sleep 10; echo ok >/tmp/startup; sleep 600"}
		livenessProbe := &v1.Probe{
			ProbeHandler: v1.ProbeHandler{
				Exec: &v1.ExecAction{
					Command: []string{"/bin/false"},
				},
			},
			InitialDelaySeconds: 15,
			FailureThreshold:    1,
		}
		startupProbe := &v1.Probe{
			ProbeHandler: v1.ProbeHandler{
				Exec: &v1.ExecAction{
					Command: []string{"cat", "/tmp/startup"},
				},
			},
			InitialDelaySeconds: 15,
			FailureThreshold:    60,
		}
		pod := startupPodSpec(startupProbe, nil, livenessProbe, cmd)
		RunLivenessTest(f, pod, 1, defaultObservationTimeout)
	})

	/*
		Release: v1.16
		Testname: Pod readiness probe, delayed by startup probe
		Description: A Pod is created with startup and readiness probes. The Container is started by creating /tmp/startup after 32 seconds, delaying the ready state by that amount of time. This is similar to the "Pod readiness probe, with initial delay" test.
	*/
	ginkgo.It("should be ready immediately after startupProbe succeeds", func() {
		// Probe workers sleep at Kubelet start for a random time which is at most PeriodSeconds.
		// This test requires both the readiness and startup workers to be running before statuses
		// are updated; to avoid flakes, ensure the sleep before startup (32s) is greater than
		// readinessProbe.PeriodSeconds.
		cmd := []string{"/bin/sh", "-c", "echo ok >/tmp/health; sleep 32; echo ok >/tmp/startup; sleep 600"}
		readinessProbe := &v1.Probe{
			ProbeHandler:        execHandler([]string{"/bin/cat", "/tmp/health"}),
			InitialDelaySeconds: 0,
			PeriodSeconds:       30,
		}
		startupProbe := &v1.Probe{
			ProbeHandler:        execHandler([]string{"/bin/cat", "/tmp/startup"}),
			InitialDelaySeconds: 0,
			FailureThreshold:    120,
			PeriodSeconds:       5,
		}
		p := podClient.Create(startupPodSpec(startupProbe, readinessProbe, nil, cmd))

		p, err := podClient.Get(context.TODO(), p.Name, metav1.GetOptions{})
		framework.ExpectNoError(err)

		err = e2epod.WaitForPodContainerStarted(f.ClientSet, f.Namespace.Name, p.Name, 0, framework.PodStartTimeout)
		framework.ExpectNoError(err)
		startedTime := time.Now()

		// We assume the pod became ready when the container became ready. This
		// is true for a single container pod.
		err = e2epod.WaitTimeoutForPodReadyInNamespace(f.ClientSet, p.Name, f.Namespace.Name, framework.PodStartTimeout)
		framework.ExpectNoError(err)
		readyTime := time.Now()

		p, err = podClient.Get(context.TODO(), p.Name, metav1.GetOptions{})
		framework.ExpectNoError(err)

		isReady, err := testutils.PodRunningReady(p)
		framework.ExpectNoError(err)
		if !isReady {
			framework.Failf("pod %s/%s should be ready", f.Namespace.Name, p.Name)
		}

		readyIn := readyTime.Sub(startedTime)
		framework.Logf("Container started at %v, pod became ready at %v, %v after startupProbe succeeded", startedTime, readyTime, readyIn)
		if readyIn < 0 {
			framework.Failf("Pod became ready before startupProbe succeeded")
		}
		if readyIn > 25*time.Second {
			framework.Failf("Pod became ready in %v, more than 25s after startupProbe succeeded. This means the readiness probe was not initiated immediately after the startup probe finished.", readyIn)
		}
	})

	/*
		Release: v1.21
		Testname: Set terminationGracePeriodSeconds for livenessProbe
		Description: A pod with a long terminationGracePeriod is created with a shorter livenessProbe-level terminationGracePeriodSeconds. We confirm the shorter termination period is used.
	*/
	ginkgo.It("should override terminationGracePeriodSeconds when LivenessProbe field is set [Feature:ProbeTerminationGracePeriod]", func() {
		pod := e2epod.NewAgnhostPod(f.Namespace.Name, "liveness-override-"+string(uuid.NewUUID()), nil, nil, nil, "/bin/sh", "-c", "sleep 1000")
		longGracePeriod := int64(500)
		pod.Spec.TerminationGracePeriodSeconds = &longGracePeriod

		// The probe will fail since the pod has no http endpoints.
		shortGracePeriod := int64(5)
		pod.Spec.Containers[0].LivenessProbe = &v1.Probe{
			ProbeHandler: v1.ProbeHandler{
				HTTPGet: &v1.HTTPGetAction{
					Path: "/healthz",
					Port: intstr.FromInt(8080),
				},
			},
			InitialDelaySeconds:           10,
			FailureThreshold:              1,
			TerminationGracePeriodSeconds: &shortGracePeriod,
		}

		// 10s delay + 10s period + 5s grace period = 25s < 30s, well under the pod-level grace period of 500s.
		RunLivenessTest(f, pod, 1, time.Second*30)
	})

	/*
		Release: v1.21
		Testname: Set terminationGracePeriodSeconds for startupProbe
		Description: A pod with a long terminationGracePeriod is created with a shorter startupProbe-level terminationGracePeriodSeconds. We confirm the shorter termination period is used.
	*/
	ginkgo.It("should override terminationGracePeriodSeconds when StartupProbe field is set [Feature:ProbeTerminationGracePeriod]", func() {
		pod := e2epod.NewAgnhostPod(f.Namespace.Name, "startup-override-"+string(uuid.NewUUID()), nil, nil, nil, "/bin/sh", "-c", "sleep 1000")
		longGracePeriod := int64(500)
		pod.Spec.TerminationGracePeriodSeconds = &longGracePeriod

		// The startup probe will fail since the pod sleeps for 1000s instead of creating /tmp/startup.
		shortGracePeriod := int64(5)
		pod.Spec.Containers[0].StartupProbe = &v1.Probe{
			ProbeHandler:                  execHandler([]string{"/bin/cat", "/tmp/startup"}),
			InitialDelaySeconds:           10,
			FailureThreshold:              1,
			TerminationGracePeriodSeconds: &shortGracePeriod,
		}
		// The liveness probe always succeeds.
		pod.Spec.Containers[0].LivenessProbe = &v1.Probe{
			ProbeHandler: v1.ProbeHandler{
				Exec: &v1.ExecAction{
					Command: []string{"/bin/true"},
				},
			},
			InitialDelaySeconds: 15,
			FailureThreshold:    1,
		}

		// 10s delay + 10s period + 5s grace period = 25s < 30s, well under the pod-level grace period of 500s.
		RunLivenessTest(f, pod, 1, time.Second*30)
	})

	/*
		Release: v1.23
		Testname: Pod liveness probe, using grpc call, success
		Description: A Pod is created with a liveness probe on a gRPC service. The liveness probe on this endpoint will not fail. When the liveness probe does not fail, the restart count MUST remain zero.
	*/
	ginkgo.It("should *not* be restarted with a GRPC liveness probe [NodeConformance]", func() {
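		// etcd serves the standard gRPC health checking service on its client
		// port (2379), which gives the probe a healthy endpoint to query.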
		livenessProbe := &v1.Probe{
			ProbeHandler: v1.ProbeHandler{
				GRPC: &v1.GRPCAction{
					Port:    2379,
					Service: nil,
				},
			},
			InitialDelaySeconds: probeTestInitialDelaySeconds,
			TimeoutSeconds:      5, // default 1s can be pretty aggressive in CI environments with low resources
			FailureThreshold:    1,
		}

		pod := gRPCServerPodSpec(nil, livenessProbe, "etcd")
		RunLivenessTest(f, pod, 0, defaultObservationTimeout)
	})

	/*
		Release: v1.23
		Testname: Pod liveness probe, using grpc call, failure
		Description: A Pod is created with a liveness probe on a gRPC service. The liveness probe MUST fail because it targets the wrong port,
		and the restart count MUST then increment by 1.
	*/
	ginkgo.It("should be restarted with a GRPC liveness probe [NodeConformance]", func() {
		livenessProbe := &v1.Probe{
			ProbeHandler: v1.ProbeHandler{
				GRPC: &v1.GRPCAction{
					Port: 2333, // this port is wrong
				},
			},
			InitialDelaySeconds: probeTestInitialDelaySeconds * 4,
			TimeoutSeconds:      5, // default 1s can be pretty aggressive in CI environments with low resources
			FailureThreshold:    1,
		}
		pod := gRPCServerPodSpec(nil, livenessProbe, "etcd")
		RunLivenessTest(f, pod, 1, defaultObservationTimeout)
	})
})

// GetContainerStartedTime returns the time when the given container started, and an error if any.
func GetContainerStartedTime(p *v1.Pod, containerName string) (time.Time, error) {
	for _, status := range p.Status.ContainerStatuses {
		if status.Name != containerName {
			continue
		}
		if status.State.Running == nil {
			return time.Time{}, fmt.Errorf("container is not running")
		}
		return status.State.Running.StartedAt.Time, nil
	}
	return time.Time{}, fmt.Errorf("cannot find container named %q", containerName)
}

// GetTransitionTimeForReadyCondition returns the time when the given pod became ready, and an error if any.
func GetTransitionTimeForReadyCondition(p *v1.Pod) (time.Time, error) {
	for _, cond := range p.Status.Conditions {
		if cond.Type == v1.PodReady {
			return cond.LastTransitionTime.Time, nil
		}
	}
	return time.Time{}, fmt.Errorf("no ready condition can be found for pod")
}

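// getRestartCount returns the total restart count across all containers in the pod.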
func getRestartCount(p *v1.Pod) int {
	count := 0
	for _, containerStatus := range p.Status.ContainerStatuses {
		count += int(containerStatus.RestartCount)
	}
	return count
}

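// testWebServerPodSpec returns a pod spec for a single agnhost test-webserver
// container listening on the given port, with the supplied readiness and
// liveness probes attached.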
func testWebServerPodSpec(readinessProbe, livenessProbe *v1.Probe, containerName string, port int) *v1.Pod {
	return &v1.Pod{
		ObjectMeta: metav1.ObjectMeta{Name: "test-webserver-" + string(uuid.NewUUID())},
		Spec: v1.PodSpec{
			Containers: []v1.Container{
				{
					Name:           containerName,
					Image:          imageutils.GetE2EImage(imageutils.Agnhost),
					Args:           []string{"test-webserver"},
					Ports:          []v1.ContainerPort{{ContainerPort: int32(port)}},
					LivenessProbe:  livenessProbe,
					ReadinessProbe: readinessProbe,
				},
			},
		},
	}
}

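// busyBoxPodSpec returns a single-container busybox pod spec that runs cmd,
// with the supplied readiness and liveness probes attached.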
func busyBoxPodSpec(readinessProbe, livenessProbe *v1.Probe, cmd []string) *v1.Pod {
	return &v1.Pod{
		ObjectMeta: metav1.ObjectMeta{
			Name:   "busybox-" + string(uuid.NewUUID()),
			Labels: map[string]string{"test": "liveness"},
		},
		Spec: v1.PodSpec{
			Containers: []v1.Container{
				{
					Name:           "busybox",
					Image:          imageutils.GetE2EImage(imageutils.BusyBox),
					Command:        cmd,
					LivenessProbe:  livenessProbe,
					ReadinessProbe: readinessProbe,
				},
			},
		},
	}
}

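// livenessPodSpec returns an agnhost pod spec running the "liveness"
// subcommand, with the supplied probes attached to its only container.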
func livenessPodSpec(namespace string, readinessProbe, livenessProbe *v1.Probe) *v1.Pod {
	pod := e2epod.NewAgnhostPod(namespace, "liveness-"+string(uuid.NewUUID()), nil, nil, nil, "liveness")
	pod.ObjectMeta.Labels = map[string]string{"test": "liveness"}
	pod.Spec.Containers[0].LivenessProbe = livenessProbe
	pod.Spec.Containers[0].ReadinessProbe = readinessProbe
	return pod
}

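// startupPodSpec returns a busybox pod spec that runs cmd with startup,
// readiness, and liveness probes attached.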
func startupPodSpec(startupProbe, readinessProbe, livenessProbe *v1.Probe, cmd []string) *v1.Pod {
	return &v1.Pod{
		ObjectMeta: metav1.ObjectMeta{
			Name:   "startup-" + string(uuid.NewUUID()),
			Labels: map[string]string{"test": "startup"},
		},
		Spec: v1.PodSpec{
			Containers: []v1.Container{
				{
					Name:           "busybox",
					Image:          imageutils.GetE2EImage(imageutils.BusyBox),
					Command:        cmd,
					LivenessProbe:  livenessProbe,
					ReadinessProbe: readinessProbe,
					StartupProbe:   startupProbe,
				},
			},
		},
	}
}

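// execHandler returns a ProbeHandler that runs cmd inside the container.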
func execHandler(cmd []string) v1.ProbeHandler {
	return v1.ProbeHandler{
		Exec: &v1.ExecAction{
			Command: cmd,
		},
	}
}

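// httpGetHandler returns a ProbeHandler that issues an HTTP GET against path on the given port.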
func httpGetHandler(path string, port int) v1.ProbeHandler {
	return v1.ProbeHandler{
		HTTPGet: &v1.HTTPGetAction{
			Path: path,
			Port: intstr.FromInt(port),
		},
	}
}

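// tcpSocketHandler returns a ProbeHandler that opens a TCP connection to the given port.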
func tcpSocketHandler(port int) v1.ProbeHandler {
	return v1.ProbeHandler{
		TCPSocket: &v1.TCPSocketAction{
			Port: intstr.FromInt(port),
		},
	}
}

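// webserverProbeBuilder builds HTTP GET probes against the test-webserver
// container, optionally with an initial delay or targeting a port the server
// does not listen on so that the probe fails.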
type webserverProbeBuilder struct {
	failing      bool
	initialDelay bool
}

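// withFailing returns a copy of the builder whose probe targets a port the webserver does not listen on.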
func (b webserverProbeBuilder) withFailing() webserverProbeBuilder {
	b.failing = true
	return b
}

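// withInitialDelay returns a copy of the builder whose probe uses probeTestInitialDelaySeconds as its initial delay.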
func (b webserverProbeBuilder) withInitialDelay() webserverProbeBuilder {
	b.initialDelay = true
	return b
}

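// build assembles the probe described by the builder.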
func (b webserverProbeBuilder) build() *v1.Probe {
	probe := &v1.Probe{
		ProbeHandler: httpGetHandler("/", 80),
	}
	if b.initialDelay {
		probe.InitialDelaySeconds = probeTestInitialDelaySeconds
	}
	if b.failing {
		probe.HTTPGet.Port = intstr.FromInt(81)
	}
	return probe
}

// RunLivenessTest verifies that the given pod is restarted the expected number of times.
func RunLivenessTest(f *framework.Framework, pod *v1.Pod, expectNumRestarts int, timeout time.Duration) {
	podClient := f.PodClient()
	ns := f.Namespace.Name
	gomega.Expect(pod.Spec.Containers).NotTo(gomega.BeEmpty())
	containerName := pod.Spec.Containers[0].Name
	// At the end of the test, clean up by removing the pod.
	defer func() {
		ginkgo.By("deleting the pod")
		podClient.Delete(context.TODO(), pod.Name, *metav1.NewDeleteOptions(0))
	}()
	ginkgo.By(fmt.Sprintf("Creating pod %s in namespace %s", pod.Name, ns))
	podClient.Create(pod)

	// Wait until the pod is no longer pending. (Here we check for the pod leaving
	// 'Pending' rather than waiting for 'Running', since when failures occur the
	// pod goes to 'Terminated', which could block us indefinitely.)
	framework.ExpectNoError(e2epod.WaitForPodNotPending(f.ClientSet, ns, pod.Name),
		fmt.Sprintf("starting pod %s in namespace %s", pod.Name, ns))
	framework.Logf("Started pod %s in namespace %s", pod.Name, ns)

	// Check the pod's current state and verify that restartCount is present.
	ginkgo.By("checking the pod's current state and verifying that restartCount is present")
	pod, err := podClient.Get(context.TODO(), pod.Name, metav1.GetOptions{})
	framework.ExpectNoError(err, fmt.Sprintf("getting pod %s in namespace %s", pod.Name, ns))
	initialRestartCount := podutil.GetExistingContainerStatus(pod.Status.ContainerStatuses, containerName).RestartCount
	framework.Logf("Initial restart count of pod %s is %d", pod.Name, initialRestartCount)

	// Wait for the restart state to be as desired.
	deadline := time.Now().Add(timeout)
	lastRestartCount := initialRestartCount
	observedRestarts := int32(0)
	for start := time.Now(); time.Now().Before(deadline); time.Sleep(2 * time.Second) {
		pod, err = podClient.Get(context.TODO(), pod.Name, metav1.GetOptions{})
		framework.ExpectNoError(err, fmt.Sprintf("getting pod %s", pod.Name))
		restartCount := podutil.GetExistingContainerStatus(pod.Status.ContainerStatuses, containerName).RestartCount
		if restartCount != lastRestartCount {
			framework.Logf("Restart count of pod %s/%s is now %d (%v elapsed)",
				ns, pod.Name, restartCount, time.Since(start))
			if restartCount < lastRestartCount {
				framework.Failf("Restart count should increment monotonically: restart count of pod %s/%s changed from %d to %d",
					ns, pod.Name, lastRestartCount, restartCount)
			}
		}
		observedRestarts = restartCount - initialRestartCount
		if expectNumRestarts > 0 && int(observedRestarts) >= expectNumRestarts {
			// Stop if we have observed at least expectNumRestarts restarts.
			break
		}
		lastRestartCount = restartCount
	}

	// If we expected 0 restarts, fail if we observed any restart.
	// If we expected n restarts (n > 0), fail if we observed fewer than n restarts.
	if (expectNumRestarts == 0 && observedRestarts > 0) || (expectNumRestarts > 0 &&
		int(observedRestarts) < expectNumRestarts) {
		framework.Failf("pod %s/%s - expected number of restarts: %d, found restarts: %d",
			ns, pod.Name, expectNumRestarts, observedRestarts)
	}
}

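// runReadinessFailTest creates the given pod and verifies that it does not report Ready for at least notReadyUntil.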
func runReadinessFailTest(f *framework.Framework, pod *v1.Pod, notReadyUntil time.Duration) {
	podClient := f.PodClient()
	ns := f.Namespace.Name
	gomega.Expect(pod.Spec.Containers).NotTo(gomega.BeEmpty())

	// At the end of the test, clean up by removing the pod.
	defer func() {
		ginkgo.By("deleting the pod")
		podClient.Delete(context.TODO(), pod.Name, *metav1.NewDeleteOptions(0))
	}()
	ginkgo.By(fmt.Sprintf("Creating pod %s in namespace %s", pod.Name, ns))
	podClient.Create(pod)

	// Wait until the pod is no longer pending. (Here we check for the pod leaving
	// 'Pending', since when failures occur the pod goes to 'Terminated', which
	// could block us indefinitely.)
	framework.ExpectNoError(e2epod.WaitForPodNotPending(f.ClientSet, ns, pod.Name),
		fmt.Sprintf("starting pod %s in namespace %s", pod.Name, ns))
	framework.Logf("Started pod %s in namespace %s", pod.Name, ns)

	// Verify that the pod stays not ready for the whole notReadyUntil duration.
	deadline := time.Now().Add(notReadyUntil)
	for start := time.Now(); time.Now().Before(deadline); time.Sleep(2 * time.Second) {
		// Re-fetch the pod so we poll its latest status rather than the object
		// returned at creation time.
		pod, err := podClient.Get(context.TODO(), pod.Name, metav1.GetOptions{})
		framework.ExpectNoError(err, fmt.Sprintf("getting pod %s", pod.Name))
		if podutil.IsPodReady(pod) {
			framework.Failf("pod %s/%s - expected to be not ready", ns, pod.Name)
		}

		framework.Logf("pod %s/%s is not ready (%v elapsed)",
			ns, pod.Name, time.Since(start))
	}
}

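// gRPCServerPodSpec returns a single-container etcd pod spec with the supplied
// probes attached; etcd is used here because it exposes the standard gRPC
// health checking service on its client port.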
func gRPCServerPodSpec(readinessProbe, livenessProbe *v1.Probe, containerName string) *v1.Pod {
	etcdLocalhostAddress := "127.0.0.1"
	if framework.TestContext.ClusterIsIPv6() {
		etcdLocalhostAddress = "::1"
	}
	etcdURL := fmt.Sprintf("http://%s", net.JoinHostPort(etcdLocalhostAddress, "2379"))
	return &v1.Pod{
		ObjectMeta: metav1.ObjectMeta{Name: "test-grpc-" + string(uuid.NewUUID())},
		Spec: v1.PodSpec{
			Containers: []v1.Container{
				{
					Name:  containerName,
					Image: imageutils.GetE2EImage(imageutils.Etcd),
					Command: []string{
						"/usr/local/bin/etcd",
						"--listen-client-urls",
						"http://0.0.0.0:2379", // should listen on all addresses
						"--advertise-client-urls",
						etcdURL,
					},
					// 2380 is the default etcd peer URL port.
					Ports:          []v1.ContainerPort{{ContainerPort: int32(2379)}, {ContainerPort: int32(2380)}},
					LivenessProbe:  livenessProbe,
					ReadinessProbe: readinessProbe,
				},
			},
		},
	}
}