diff --git a/pkg/kubelet/kubelet_pods.go b/pkg/kubelet/kubelet_pods.go
index cb8bca6d21c..bf91b9c7f94 100644
--- a/pkg/kubelet/kubelet_pods.go
+++ b/pkg/kubelet/kubelet_pods.go
@@ -1173,6 +1173,21 @@ func (kl *Kubelet) HandlePodCleanups(ctx context.Context) error {
 	metrics.RestartedPodTotal.WithLabelValues("true").Add(float64(restartCountStatic))
 	metrics.RestartedPodTotal.WithLabelValues("").Add(float64(restartCount))
 
+	// Complete termination of deleted pods that are not runtime pods (don't have
+	// running containers), are terminal, and are not known to pod workers.
+	// An example is pods rejected during kubelet admission that have never
+	// started before (i.e. do not have an orphaned pod).
+	// Adding the pods with SyncPodKill to pod workers allows us to proceed with
+	// force-deletion of such pods, while preventing re-entry of the routine in the
+	// next invocation of HandlePodCleanups.
+	for _, pod := range kl.filterTerminalPodsToDelete(allPods, runningRuntimePods, workingPods) {
+		klog.V(3).InfoS("Handling termination and deletion of the pod to pod workers", "pod", klog.KObj(pod), "podUID", pod.UID)
+		kl.podWorkers.UpdatePod(UpdatePodOptions{
+			UpdateType: kubetypes.SyncPodKill,
+			Pod:        pod,
+		})
+	}
+
 	// Finally, terminate any pods that are observed in the runtime but not present in the list of
 	// known running pods from config. If we do terminate running runtime pods that will happen
 	// asynchronously in the background and those will be processed in the next invocation of
@@ -1245,6 +1260,41 @@ func (kl *Kubelet) HandlePodCleanups(ctx context.Context) error {
 	return nil
 }
 
+// filterTerminalPodsToDelete returns terminal pods which are ready to be
+// deleted by the status manager, but are not in pod workers.
+// First, the check for deletionTimestamp is a performance optimization as we
+// don't need to do anything with terminal pods without deletionTimestamp.
+// Second, the check for terminal pods is to avoid race conditions of triggering
+// deletion on Pending pods which are not yet added to pod workers.
+// Third, we skip pods known to pod workers because the lifecycle of
+// such pods is already handled by pod workers.
+// Finally, we skip runtime pods as their termination is handled separately in
+// the HandlePodCleanups routine.
+func (kl *Kubelet) filterTerminalPodsToDelete(allPods []*v1.Pod, runningRuntimePods []*kubecontainer.Pod, workingPods map[types.UID]PodWorkerSync) map[types.UID]*v1.Pod {
+	terminalPodsToDelete := make(map[types.UID]*v1.Pod)
+	for _, pod := range allPods {
+		if pod.DeletionTimestamp == nil {
+			// skip pods which don't have a deletion timestamp
+			continue
+		}
+		if !podutil.IsPodPhaseTerminal(pod.Status.Phase) {
+			// skip the non-terminal pods
+			continue
+		}
+		if _, knownPod := workingPods[pod.UID]; knownPod {
+			// skip pods known to pod workers
+			continue
+		}
+		terminalPodsToDelete[pod.UID] = pod
+	}
+	for _, runningRuntimePod := range runningRuntimePods {
+		// skip running runtime pods - they are handled by a dedicated routine
+		// which terminates the containers
+		delete(terminalPodsToDelete, runningRuntimePod.ID)
+	}
+	return terminalPodsToDelete
+}
+
 // splitPodsByStatic separates a list of desired pods from the pod manager into
 // regular or static pods. Mirror pods are not valid config sources (a mirror pod
 // being created cannot cause the Kubelet to start running a static pod) and are
diff --git a/pkg/kubelet/status/status_manager.go b/pkg/kubelet/status/status_manager.go
index 82eb83d9e52..37aea7230d5 100644
--- a/pkg/kubelet/status/status_manager.go
+++ b/pkg/kubelet/status/status_manager.go
@@ -897,14 +897,16 @@ func (m *manager) canBeDeleted(pod *v1.Pod, status v1.PodStatus, podIsFinished b
 	if pod.DeletionTimestamp == nil || kubetypes.IsMirrorPod(pod) {
 		return false
 	}
-	// Delay deletion of pods until the phase is terminal.
+	// Delay deletion of pods until the phase is terminal, based on pod.Status
+	// which comes from pod manager.
 	if !podutil.IsPodPhaseTerminal(pod.Status.Phase) {
-		klog.V(3).InfoS("Delaying pod deletion as the phase is non-terminal", "phase", status.Phase, "pod", klog.KObj(pod), "podUID", pod.UID)
+		// For debugging purposes we also log the kubelet's local phase, when the deletion is delayed.
+		klog.V(3).InfoS("Delaying pod deletion as the phase is non-terminal", "phase", pod.Status.Phase, "localPhase", status.Phase, "pod", klog.KObj(pod), "podUID", pod.UID)
 		return false
 	}
 	// If this is an update completing pod termination then we know the pod termination is finished.
 	if podIsFinished {
-		klog.V(3).InfoS("The pod termination is finished as SyncTerminatedPod completes its execution", "phase", status.Phase, "pod", klog.KObj(pod), "podUID", pod.UID)
+		klog.V(3).InfoS("The pod termination is finished as SyncTerminatedPod completes its execution", "phase", pod.Status.Phase, "localPhase", status.Phase, "pod", klog.KObj(pod), "podUID", pod.UID)
 		return true
 	}
 	return false
diff --git a/test/e2e_node/restart_test.go b/test/e2e_node/restart_test.go
index 9157e3e69e3..e892862c36c 100644
--- a/test/e2e_node/restart_test.go
+++ b/test/e2e_node/restart_test.go
@@ -381,6 +381,145 @@ var _ = SIGDescribe("Restart [Serial] [Slow] [Disruptive]", func() {
 				return checkMirrorPodDisappear(ctx, f.ClientSet, pod.Name, pod.Namespace)
 			}, f.Timeouts.PodDelete, f.Timeouts.Poll).Should(gomega.BeNil())
 		})
+		// Regression test for https://issues.k8s.io/118472
+		ginkgo.It("should force-delete non-admissible pods created and deleted during kubelet restart", func(ctx context.Context) {
+			podName := "rejected-deleted-pod" + string(uuid.NewUUID())
+			gracePeriod := int64(30)
+			nodeName := getNodeName(ctx, f)
+			podSpec := e2epod.MustMixinRestrictedPodSecurity(&v1.Pod{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      podName,
+					Namespace: f.Namespace.Name,
+				},
+				Spec: v1.PodSpec{
+					NodeName: nodeName,
+					NodeSelector: map[string]string{
+						"this-label": "does-not-exist-on-any-nodes",
+					},
+					TerminationGracePeriodSeconds: &gracePeriod,
+					RestartPolicy:                 v1.RestartPolicyNever,
+					Containers: []v1.Container{
+						{
+							Name:  podName,
+							Image: imageutils.GetPauseImageName(),
+						},
+					},
+				},
+			})
+			ginkgo.By("Stopping the kubelet")
+			startKubelet := stopKubelet()
+
+			// wait until the kubelet health check will fail
+			gomega.Eventually(ctx, func() bool {
+				return kubeletHealthCheck(kubeletHealthCheckURL)
+			}, f.Timeouts.PodStart, f.Timeouts.Poll).Should(gomega.BeFalse())
+
+			// Create the pod bound to the node. It will remain in the Pending
+			// phase as Kubelet is down.
+			ginkgo.By(fmt.Sprintf("Creating a pod (%v/%v)", f.Namespace.Name, podName))
+			pod := e2epod.NewPodClient(f).Create(ctx, podSpec)
+
+			ginkgo.By(fmt.Sprintf("Deleting the pod (%v/%v) to set a deletion timestamp", pod.Namespace, pod.Name))
+			err := e2epod.NewPodClient(f).Delete(ctx, pod.Name, metav1.DeleteOptions{GracePeriodSeconds: &gracePeriod})
+			framework.ExpectNoError(err, "Failed to delete the pod: %q", pod.Name)
+
+			// Restart Kubelet so that it proceeds with deletion
+			ginkgo.By("Starting the kubelet")
+			startKubelet()
+
+			// wait until the kubelet health check will succeed
+			gomega.Eventually(ctx, func() bool {
+				return kubeletHealthCheck(kubeletHealthCheckURL)
+			}, f.Timeouts.PodStart, f.Timeouts.Poll).Should(gomega.BeTrue())
+
+			// Wait for the Kubelet to be ready.
+			gomega.Eventually(ctx, func(ctx context.Context) bool {
+				nodes, err := e2enode.TotalReady(ctx, f.ClientSet)
+				framework.ExpectNoError(err)
+				return nodes == 1
+			}, time.Minute, f.Timeouts.Poll).Should(gomega.BeTrue())
+
+			ginkgo.By(fmt.Sprintf("After the kubelet is restarted, verify the pod (%v/%v) is deleted by kubelet", pod.Namespace, pod.Name))
+			gomega.Eventually(ctx, func(ctx context.Context) error {
+				return checkMirrorPodDisappear(ctx, f.ClientSet, pod.Name, pod.Namespace)
+			}, f.Timeouts.PodDelete, f.Timeouts.Poll).Should(gomega.BeNil())
+		})
+		// Regression test for an extended scenario for https://issues.k8s.io/118472
+		ginkgo.It("should force-delete non-admissible pods that were admitted and running before kubelet restart", func(ctx context.Context) {
+			nodeLabelKey := "custom-label-key-required"
+			nodeLabelValueRequired := "custom-label-value-required-for-admission"
+			podName := "rejected-deleted-run" + string(uuid.NewUUID())
+			gracePeriod := int64(30)
+			nodeName := getNodeName(ctx, f)
+			pod := e2epod.MustMixinRestrictedPodSecurity(&v1.Pod{
+				ObjectMeta: metav1.ObjectMeta{
+					Name:      podName,
+					Namespace: f.Namespace.Name,
+				},
+				Spec: v1.PodSpec{
+					NodeSelector: map[string]string{
+						nodeLabelKey: nodeLabelValueRequired,
+					},
+					NodeName:                      nodeName,
+					TerminationGracePeriodSeconds: &gracePeriod,
+					RestartPolicy:                 v1.RestartPolicyNever,
+					Containers: []v1.Container{
+						{
+							Name:  podName,
+							Image: imageutils.GetPauseImageName(),
+						},
+					},
+				},
+			})
+
+			ginkgo.By(fmt.Sprintf("Adding node label for node (%v) to allow admission of pod (%v/%v)", nodeName, f.Namespace.Name, podName))
+			e2enode.AddOrUpdateLabelOnNode(f.ClientSet, nodeName, nodeLabelKey, nodeLabelValueRequired)
+			ginkgo.DeferCleanup(func() { e2enode.RemoveLabelOffNode(f.ClientSet, nodeName, nodeLabelKey) })
+
+			// Create the pod bound to the node. It will start, but will be rejected after kubelet restart.
+			ginkgo.By(fmt.Sprintf("Creating a pod (%v/%v)", f.Namespace.Name, podName))
+			pod = e2epod.NewPodClient(f).Create(ctx, pod)
+
+			ginkgo.By(fmt.Sprintf("Waiting for the pod (%v/%v) to be running", f.Namespace.Name, pod.Name))
+			err := e2epod.WaitForPodNameRunningInNamespace(ctx, f.ClientSet, pod.Name, f.Namespace.Name)
+			framework.ExpectNoError(err, "Failed to await for the pod to be running: (%v/%v)", f.Namespace.Name, pod.Name)
+
+			ginkgo.By("Stopping the kubelet")
+			startKubelet := stopKubelet()
+
+			// wait until the kubelet health check will fail
+			gomega.Eventually(ctx, func() bool {
+				return kubeletHealthCheck(kubeletHealthCheckURL)
+			}, f.Timeouts.PodStart, f.Timeouts.Poll).Should(gomega.BeFalse())
+
+			ginkgo.By(fmt.Sprintf("Deleting the pod (%v/%v) to set a deletion timestamp", pod.Namespace, pod.Name))
+			err = e2epod.NewPodClient(f).Delete(ctx, pod.Name, metav1.DeleteOptions{GracePeriodSeconds: &gracePeriod})
+			framework.ExpectNoError(err, "Failed to delete the pod: %q", pod.Name)
+
+			ginkgo.By(fmt.Sprintf("Removing node label for node (%v) to ensure the pod (%v/%v) is rejected after kubelet restart", nodeName, f.Namespace.Name, podName))
+			e2enode.RemoveLabelOffNode(f.ClientSet, nodeName, nodeLabelKey)
+
+			// Restart Kubelet so that it proceeds with deletion
+			ginkgo.By("Starting the kubelet")
+			startKubelet()
+
+			// wait until the kubelet health check will succeed
+			gomega.Eventually(ctx, func() bool {
+				return kubeletHealthCheck(kubeletHealthCheckURL)
+			}, f.Timeouts.PodStart, f.Timeouts.Poll).Should(gomega.BeTrue())
+
+			// Wait for the Kubelet to be ready.
+			gomega.Eventually(ctx, func(ctx context.Context) bool {
+				nodes, err := e2enode.TotalReady(ctx, f.ClientSet)
+				framework.ExpectNoError(err)
+				return nodes == 1
+			}, time.Minute, f.Timeouts.Poll).Should(gomega.BeTrue())
+
+			ginkgo.By(fmt.Sprintf("Once Kubelet is restarted, verify the pod (%v/%v) is deleted by kubelet", pod.Namespace, pod.Name))
+			gomega.Eventually(ctx, func(ctx context.Context) error {
+				return checkMirrorPodDisappear(ctx, f.ClientSet, pod.Name, pod.Namespace)
+			}, f.Timeouts.PodDelete, f.Timeouts.Poll).Should(gomega.BeNil())
+		})
 	})
 })