Merge pull request #124205 from mkarrmann/wait-for-pods-e2e-cleanup-111092
chore/refactor(e2e tests): Solidify Contract for and Cleanup WaitForPodsRunningReady
This commit is contained in:
		| @@ -38,7 +38,7 @@ import ( | ||||
| var _ = SIGDescribe(framework.WithDisruptive(), "NodeLease", func() { | ||||
| 	f := framework.NewDefaultFramework("node-lease-test") | ||||
| 	f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged | ||||
| 	var systemPodsNo int32 | ||||
| 	var systemPodsNo int | ||||
| 	var c clientset.Interface | ||||
| 	var ns string | ||||
| 	var group string | ||||
| @@ -49,7 +49,7 @@ var _ = SIGDescribe(framework.WithDisruptive(), "NodeLease", func() { | ||||
| 		ns = f.Namespace.Name | ||||
| 		systemPods, err := e2epod.GetPodsInNamespace(ctx, c, ns, map[string]string{}) | ||||
| 		framework.ExpectNoError(err) | ||||
| 		systemPodsNo = int32(len(systemPods)) | ||||
| 		systemPodsNo = len(systemPods) | ||||
| 		if strings.Contains(framework.TestContext.CloudConfig.NodeInstanceGroup, ",") { | ||||
| 			framework.Failf("Test dose not support cluster setup with more than one MIG: %s", framework.TestContext.CloudConfig.NodeInstanceGroup) | ||||
| 		} else { | ||||
| @@ -98,7 +98,7 @@ var _ = SIGDescribe(framework.WithDisruptive(), "NodeLease", func() { | ||||
| 			// Many e2e tests assume that the cluster is fully healthy before they start.  Wait until | ||||
| 			// the cluster is restored to health. | ||||
| 			ginkgo.By("waiting for system pods to successfully restart") | ||||
| 			err := e2epod.WaitForPodsRunningReady(ctx, c, metav1.NamespaceSystem, systemPodsNo, 0, framework.PodReadyBeforeTimeout) | ||||
| 			err := e2epod.WaitForPodsRunningReady(ctx, c, metav1.NamespaceSystem, systemPodsNo, framework.PodReadyBeforeTimeout) | ||||
| 			framework.ExpectNoError(err) | ||||
| 		}) | ||||
|  | ||||
|   | ||||
| @@ -47,7 +47,7 @@ func resizeRC(ctx context.Context, c clientset.Interface, ns, name string, repli | ||||
| var _ = SIGDescribe("Nodes", framework.WithDisruptive(), func() { | ||||
| 	f := framework.NewDefaultFramework("resize-nodes") | ||||
| 	f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged | ||||
| 	var systemPodsNo int32 | ||||
| 	var systemPodsNo int | ||||
| 	var c clientset.Interface | ||||
| 	var ns string | ||||
| 	var group string | ||||
| @@ -57,7 +57,7 @@ var _ = SIGDescribe("Nodes", framework.WithDisruptive(), func() { | ||||
| 		ns = f.Namespace.Name | ||||
| 		systemPods, err := e2epod.GetPodsInNamespace(ctx, c, ns, map[string]string{}) | ||||
| 		framework.ExpectNoError(err) | ||||
| 		systemPodsNo = int32(len(systemPods)) | ||||
| 		systemPodsNo = len(systemPods) | ||||
| 		if strings.Contains(framework.TestContext.CloudConfig.NodeInstanceGroup, ",") { | ||||
| 			framework.Failf("Test dose not support cluster setup with more than one MIG: %s", framework.TestContext.CloudConfig.NodeInstanceGroup) | ||||
| 		} else { | ||||
| @@ -99,7 +99,7 @@ var _ = SIGDescribe("Nodes", framework.WithDisruptive(), func() { | ||||
| 				// Many e2e tests assume that the cluster is fully healthy before they start.  Wait until | ||||
| 				// the cluster is restored to health. | ||||
| 				ginkgo.By("waiting for system pods to successfully restart") | ||||
| 				err := e2epod.WaitForPodsRunningReady(ctx, c, metav1.NamespaceSystem, systemPodsNo, 0, framework.PodReadyBeforeTimeout) | ||||
| 				err := e2epod.WaitForPodsRunningReady(ctx, c, metav1.NamespaceSystem, systemPodsNo, framework.PodReadyBeforeTimeout) | ||||
| 				framework.ExpectNoError(err) | ||||
| 			}) | ||||
| 		}) | ||||
|   | ||||
| @@ -612,7 +612,7 @@ done | ||||
| 		}) | ||||
|  | ||||
| 		// verify pods are running and ready | ||||
| 		err := e2epod.WaitForPodsRunningReady(ctx, f.ClientSet, f.Namespace.Name, 1, 0, f.Timeouts.PodStart) | ||||
| 		err := e2epod.WaitForPodsRunningReady(ctx, f.ClientSet, f.Namespace.Name, 1, f.Timeouts.PodStart) | ||||
| 		framework.ExpectNoError(err) | ||||
|  | ||||
| 		// Shutdown pod. Readiness should change to false | ||||
| @@ -694,7 +694,7 @@ done | ||||
| 		}) | ||||
|  | ||||
| 		// verify pods are running and ready | ||||
| 		err := e2epod.WaitForPodsRunningReady(ctx, f.ClientSet, f.Namespace.Name, 1, 0, f.Timeouts.PodStart) | ||||
| 		err := e2epod.WaitForPodsRunningReady(ctx, f.ClientSet, f.Namespace.Name, 1, f.Timeouts.PodStart) | ||||
| 		framework.ExpectNoError(err) | ||||
|  | ||||
| 		// Shutdown pod. Readiness should change to false | ||||
| @@ -1359,7 +1359,7 @@ done | ||||
| 		}) | ||||
|  | ||||
| 		// verify pods are running and ready | ||||
| 		err := e2epod.WaitForPodsRunningReady(ctx, f.ClientSet, f.Namespace.Name, 1, 0, f.Timeouts.PodStart) | ||||
| 		err := e2epod.WaitForPodsRunningReady(ctx, f.ClientSet, f.Namespace.Name, 1, f.Timeouts.PodStart) | ||||
| 		framework.ExpectNoError(err) | ||||
|  | ||||
| 		// Shutdown pod. Readiness should change to false | ||||
| @@ -1452,7 +1452,7 @@ done | ||||
| 		}) | ||||
|  | ||||
| 		// verify pods are running and ready | ||||
| 		err := e2epod.WaitForPodsRunningReady(ctx, f.ClientSet, f.Namespace.Name, 1, 0, f.Timeouts.PodStart) | ||||
| 		err := e2epod.WaitForPodsRunningReady(ctx, f.ClientSet, f.Namespace.Name, 1, f.Timeouts.PodStart) | ||||
| 		framework.ExpectNoError(err) | ||||
|  | ||||
| 		// Shutdown pod. Readiness should change to false | ||||
|   | ||||
| @@ -873,7 +873,7 @@ var _ = SIGDescribe("Pods", func() { | ||||
|  | ||||
| 		// wait as required for all 3 pods to be running | ||||
| 		ginkgo.By("waiting for all 3 pods to be running") | ||||
| 		err := e2epod.WaitForPodsRunningReady(ctx, f.ClientSet, f.Namespace.Name, 3, 0, f.Timeouts.PodStart) | ||||
| 		err := e2epod.WaitForPodsRunningReady(ctx, f.ClientSet, f.Namespace.Name, 3, f.Timeouts.PodStart) | ||||
| 		framework.ExpectNoError(err, "3 pods not found running.") | ||||
|  | ||||
| 		// delete Collection of pods with a label in the current namespace | ||||
|   | ||||
| @@ -226,7 +226,7 @@ func setupSuite(ctx context.Context) { | ||||
| 	// #41007. To avoid those pods preventing the whole test runs (and just | ||||
| 	// wasting the whole run), we allow for some not-ready pods (with the | ||||
| 	// number equal to the number of allowed not-ready nodes). | ||||
| 	if err := e2epod.WaitForPodsRunningReady(ctx, c, metav1.NamespaceSystem, int32(framework.TestContext.MinStartupPods), int32(framework.TestContext.AllowedNotReadyNodes), timeouts.SystemPodsStartup); err != nil { | ||||
| 	if err := e2epod.WaitForAlmostAllPodsReady(ctx, c, metav1.NamespaceSystem, framework.TestContext.MinStartupPods, framework.TestContext.AllowedNotReadyNodes, timeouts.SystemPodsStartup); err != nil { | ||||
| 		e2edebug.DumpAllNamespaceInfo(ctx, c, metav1.NamespaceSystem) | ||||
| 		e2ekubectl.LogFailedContainers(ctx, c, metav1.NamespaceSystem, framework.Logf) | ||||
| 		framework.Failf("Error waiting for all pods to be running and ready: %v", err) | ||||
|   | ||||
| @@ -99,17 +99,22 @@ func BeInPhase(phase v1.PodPhase) types.GomegaMatcher { | ||||
| 	}).WithTemplate("Expected Pod {{.To}} be in {{format .Data}}\nGot instead:\n{{.FormattedActual}}").WithTemplateData(phase) | ||||
| } | ||||
|  | ||||
| // WaitForPodsRunningReady waits up to timeout to ensure that all pods in | ||||
| // namespace ns are either running and ready, or failed but controlled by a | ||||
| // controller. Also, it ensures that at least minPods are running and | ||||
| // ready. It has separate behavior from other 'wait for' pods functions in | ||||
| // that it requests the list of pods on every iteration. This is useful, for | ||||
| // example, in cluster startup, because the number of pods increases while | ||||
| // waiting. All pods that are in SUCCESS state are not counted. | ||||
| // WaitForAlmostAllPodsReady waits up to timeout for the following conditions: | ||||
| // 1. At least minPods Pods in Namespace ns are Running and Ready | ||||
| // 2. All Pods in Namespace ns are either Ready or Succeeded | ||||
| // 3. All Pods part of a ReplicaSet or ReplicationController in Namespace ns are Ready | ||||
| // | ||||
| // After the timeout has elapsed, an error is returned if the number of Pods in a Pending Phase | ||||
| // is greater than allowedNotReadyPods. | ||||
| // | ||||
| // It is generally recommended to use WaitForPodsRunningReady instead of this function | ||||
| // whenever possible, because its behavior is more intuitive. Similar to WaitForPodsRunningReady, | ||||
| // this function requests the list of pods on every iteration, making it useful for situations | ||||
| // where the set of Pods is likely changing, such as during cluster startup. | ||||
| // | ||||
| // If minPods or allowedNotReadyPods are -1, this method returns immediately | ||||
| // without waiting. | ||||
| func WaitForPodsRunningReady(ctx context.Context, c clientset.Interface, ns string, minPods, allowedNotReadyPods int32, timeout time.Duration) error { | ||||
| func WaitForAlmostAllPodsReady(ctx context.Context, c clientset.Interface, ns string, minPods, allowedNotReadyPods int, timeout time.Duration) error { | ||||
| 	if minPods == -1 || allowedNotReadyPods == -1 { | ||||
| 		return nil | ||||
| 	} | ||||
| @@ -126,14 +131,12 @@ func WaitForPodsRunningReady(ctx context.Context, c clientset.Interface, ns stri | ||||
| 		Pods                   []v1.Pod | ||||
| 	} | ||||
|  | ||||
| 	// notReady is -1 for any failure other than a timeout. | ||||
| 	// Otherwise it is the number of pods that we were still | ||||
| 	// waiting for. | ||||
| 	notReady := int32(-1) | ||||
| 	nOk := 0 | ||||
| 	badPods := []v1.Pod{} | ||||
| 	otherPods := []v1.Pod{} | ||||
| 	succeededPods := []string{} | ||||
|  | ||||
| 	err := framework.Gomega().Eventually(ctx, framework.HandleRetry(func(ctx context.Context) (*state, error) { | ||||
| 		// Reset notReady at the start of a poll attempt. | ||||
| 		notReady = -1 | ||||
|  | ||||
| 		rcList, err := c.CoreV1().ReplicationControllers(ns).List(ctx, metav1.ListOptions{}) | ||||
| 		if err != nil { | ||||
| @@ -163,11 +166,10 @@ func WaitForPodsRunningReady(ctx context.Context, c clientset.Interface, ns stri | ||||
| 			replicaOk += rs.Status.ReadyReplicas | ||||
| 		} | ||||
|  | ||||
| 		nOk := int32(0) | ||||
| 		notReady = int32(0) | ||||
| 		failedPods := []v1.Pod{} | ||||
| 		otherPods := []v1.Pod{} | ||||
| 		succeededPods := []string{} | ||||
| 		nOk = 0 | ||||
| 		badPods = []v1.Pod{} | ||||
| 		otherPods = []v1.Pod{} | ||||
| 		succeededPods = []string{} | ||||
| 		for _, pod := range s.Pods { | ||||
| 			res, err := testutils.PodRunningReady(&pod) | ||||
| 			switch { | ||||
| @@ -179,14 +181,13 @@ func WaitForPodsRunningReady(ctx context.Context, c clientset.Interface, ns stri | ||||
| 			case pod.Status.Phase == v1.PodFailed: | ||||
| 				// ignore failed pods that are controlled by some controller | ||||
| 				if metav1.GetControllerOf(&pod) == nil { | ||||
| 					failedPods = append(failedPods, pod) | ||||
| 					badPods = append(badPods, pod) | ||||
| 				} | ||||
| 			default: | ||||
| 				notReady++ | ||||
| 				otherPods = append(otherPods, pod) | ||||
| 			} | ||||
| 		} | ||||
| 		done := replicaOk == replicas && nOk >= minPods && (len(failedPods)+len(otherPods)) == 0 | ||||
| 		done := replicaOk == replicas && nOk >= minPods && (len(badPods)+len(otherPods)) == 0 | ||||
| 		if done { | ||||
| 			return nil, nil | ||||
| 		} | ||||
| @@ -200,8 +201,8 @@ func WaitForPodsRunningReady(ctx context.Context, c clientset.Interface, ns stri | ||||
| 			if len(succeededPods) > 0 { | ||||
| 				buffer.WriteString(fmt.Sprintf("Pods that completed successfully:\n%s", format.Object(succeededPods, 1))) | ||||
| 			} | ||||
| 			if len(failedPods) > 0 { | ||||
| 				buffer.WriteString(fmt.Sprintf("Pods that failed and were not controlled by some controller:\n%s", format.Object(failedPods, 1))) | ||||
| 			if len(badPods) > 0 { | ||||
| 				buffer.WriteString(fmt.Sprintf("Pods that failed and were not controlled by some controller:\n%s", format.Object(badPods, 1))) | ||||
| 			} | ||||
| 			if len(otherPods) > 0 { | ||||
| 				buffer.WriteString(fmt.Sprintf("Pods that were neither completed nor running:\n%s", format.Object(otherPods, 1))) | ||||
| @@ -211,13 +212,79 @@ func WaitForPodsRunningReady(ctx context.Context, c clientset.Interface, ns stri | ||||
| 	})) | ||||
|  | ||||
| 	// An error might not be fatal. | ||||
| 	if err != nil && notReady >= 0 && notReady <= allowedNotReadyPods { | ||||
| 		framework.Logf("Number of not-ready pods (%d) is below the allowed threshold (%d).", notReady, allowedNotReadyPods) | ||||
| 	if len(otherPods) <= allowedNotReadyPods { | ||||
| 		return nil | ||||
| 	} | ||||
| 	return err | ||||
| } | ||||
|  | ||||
| // WaitForPodsRunningReady waits up to timeout for the following conditions: | ||||
| //  1. At least minPods Pods in Namespace ns are Running and Ready | ||||
| //  2. Every Pod in Namespace ns is Running and Ready, Succeeded, or Failed while owned by a controller | ||||
| // | ||||
| // An error is returned if either of these conditions are not met within the timeout. | ||||
| // | ||||
| // It has separate behavior from other 'wait for' pods functions in | ||||
| // that it requests the list of pods on every iteration. This is useful, for | ||||
| // example, in cluster startup, because the number of pods increases while | ||||
| // waiting. Pods in the Succeeded phase are ignored. | ||||
| func WaitForPodsRunningReady(ctx context.Context, c clientset.Interface, ns string, minPods int, timeout time.Duration) error { | ||||
|  | ||||
| 	return framework.Gomega().Eventually(ctx, framework.HandleRetry(func(ctx context.Context) ([]v1.Pod, error) { | ||||
|  | ||||
| 		podList, err := c.CoreV1().Pods(ns).List(ctx, metav1.ListOptions{}) | ||||
| 		if err != nil { | ||||
| 			return nil, fmt.Errorf("listing pods in namespace %s: %w", ns, err) | ||||
| 		} | ||||
| 		return podList.Items, nil | ||||
| 	})).WithTimeout(timeout).Should(framework.MakeMatcher(func(pods []v1.Pod) (func() string, error) { | ||||
|  | ||||
| 		nOk := 0 | ||||
| 		badPods := []v1.Pod{} | ||||
| 		otherPods := []v1.Pod{} | ||||
| 		succeededPods := []string{} | ||||
|  | ||||
| 		for _, pod := range pods { | ||||
| 			res, err := testutils.PodRunningReady(&pod) | ||||
| 			switch { | ||||
| 			case res && err == nil: | ||||
| 				nOk++ | ||||
| 			case pod.Status.Phase == v1.PodSucceeded: | ||||
| 				// ignore succeeded pods | ||||
| 				succeededPods = append(succeededPods, pod.Name) | ||||
| 			case pod.Status.Phase == v1.PodFailed: | ||||
| 				// ignore failed pods that are controlled by some controller | ||||
| 				if metav1.GetControllerOf(&pod) == nil { | ||||
| 					badPods = append(badPods, pod) | ||||
| 				} | ||||
| 			default: | ||||
| 				otherPods = append(otherPods, pod) | ||||
| 			} | ||||
| 		} | ||||
| 		if nOk >= minPods && len(badPods)+len(otherPods) == 0 { | ||||
| 			return nil, nil | ||||
| 		} | ||||
|  | ||||
| 		// Delayed formatting of a failure message. | ||||
| 		return func() string { | ||||
| 			var buffer strings.Builder | ||||
| 			buffer.WriteString(fmt.Sprintf("Expected all pods (need at least %d) in namespace %q to be running and ready \n", minPods, ns)) | ||||
| 			buffer.WriteString(fmt.Sprintf("%d / %d pods were running and ready.\n", nOk, len(pods))) | ||||
| 			if len(succeededPods) > 0 { | ||||
| 				buffer.WriteString(fmt.Sprintf("Pods that completed successfully:\n%s", format.Object(succeededPods, 1))) | ||||
| 			} | ||||
| 			if len(badPods) > 0 { | ||||
| 				buffer.WriteString(fmt.Sprintf("Pods that failed and were not controlled by some controller:\n%s", format.Object(badPods, 1))) | ||||
| 			} | ||||
| 			if len(otherPods) > 0 { | ||||
| 				buffer.WriteString(fmt.Sprintf("Pods that were neither completed nor running:\n%s", format.Object(otherPods, 1))) | ||||
| 			} | ||||
| 			return buffer.String() | ||||
| 		}, nil | ||||
| 	})) | ||||
|  | ||||
| } | ||||
|  | ||||
| // WaitForPodCondition waits for a pod to match the given condition. | ||||
| // The condition callback may use gomega.StopTrying to abort early. | ||||
| func WaitForPodCondition(ctx context.Context, c clientset.Interface, ns, podName, conditionDesc string, timeout time.Duration, condition podCondition) error { | ||||
|   | ||||
| @@ -109,7 +109,7 @@ var _ = SIGDescribe("SchedulerPriorities", framework.WithSerial(), func() { | ||||
|  | ||||
| 		err = framework.CheckTestingNSDeletedExcept(ctx, cs, ns) | ||||
| 		framework.ExpectNoError(err) | ||||
| 		err = e2epod.WaitForPodsRunningReady(ctx, cs, metav1.NamespaceSystem, int32(systemPodsNo), 0, framework.PodReadyBeforeTimeout) | ||||
| 		err = e2epod.WaitForPodsRunningReady(ctx, cs, metav1.NamespaceSystem, systemPodsNo, framework.PodReadyBeforeTimeout) | ||||
| 		framework.ExpectNoError(err) | ||||
|  | ||||
| 		// skip if the most utilized node has less than the cri-o minMemLimit available | ||||
|   | ||||
| @@ -657,7 +657,8 @@ var _ = sigDescribe(feature.WindowsHostProcessContainers, "[MinimumKubeletVersio | ||||
|  | ||||
| 		ginkgo.By("Waiting for the pod to start running") | ||||
| 		timeout := 3 * time.Minute | ||||
| 		e2epod.WaitForPodsRunningReady(ctx, f.ClientSet, f.Namespace.Name, 1, 0, timeout) | ||||
| 		err = e2epod.WaitForPodsRunningReady(ctx, f.ClientSet, f.Namespace.Name, 1, timeout) | ||||
| 		framework.ExpectNoError(err) | ||||
|  | ||||
| 		ginkgo.By("Getting container stats for pod") | ||||
| 		statsChecked := false | ||||
| @@ -711,7 +712,8 @@ var _ = sigDescribe(feature.WindowsHostProcessContainers, "[MinimumKubeletVersio | ||||
| 		pc.Create(ctx, pod) | ||||
|  | ||||
| 		ginkgo.By("Waiting for pod to run") | ||||
| 		e2epod.WaitForPodsRunningReady(ctx, f.ClientSet, f.Namespace.Name, 1, 0, 3*time.Minute) | ||||
| 		err := e2epod.WaitForPodsRunningReady(ctx, f.ClientSet, f.Namespace.Name, 1, 3*time.Minute) | ||||
| 		framework.ExpectNoError(err) | ||||
|  | ||||
| 		ginkgo.By("Waiting for 60 seconds") | ||||
| 		// We wait an additional 60 seconds after the pod is Running because the | ||||
|   | ||||
| @@ -95,7 +95,8 @@ var _ = sigDescribe(feature.WindowsHyperVContainers, "HyperV containers", skipUn | ||||
| 		pc.Create(ctx, hypervPod) | ||||
| 		ginkgo.By("waiting for the pod to be running") | ||||
| 		timeout := 3 * time.Minute | ||||
| 		e2epod.WaitForPodsRunningReady(ctx, f.ClientSet, f.Namespace.Name, 1, 0, timeout) | ||||
| 		err = e2epod.WaitForPodsRunningReady(ctx, f.ClientSet, f.Namespace.Name, 1, timeout) | ||||
| 		framework.ExpectNoError(err) | ||||
|  | ||||
| 		ginkgo.By("creating a host process container in another pod to verify the pod is running hyperv isolated containers") | ||||
|  | ||||
|   | ||||
| @@ -60,7 +60,7 @@ var _ = sigDescribe(feature.Windows, "Kubelet-Stats", framework.WithSerial(), sk | ||||
|  | ||||
| 				ginkgo.By("Waiting up to 3 minutes for pods to be running") | ||||
| 				timeout := 3 * time.Minute | ||||
| 				err = e2epod.WaitForPodsRunningReady(ctx, f.ClientSet, f.Namespace.Name, 10, 0, timeout) | ||||
| 				err = e2epod.WaitForPodsRunningReady(ctx, f.ClientSet, f.Namespace.Name, 10, timeout) | ||||
| 				framework.ExpectNoError(err) | ||||
|  | ||||
| 				ginkgo.By("Getting kubelet stats 5 times and checking average duration") | ||||
| @@ -152,7 +152,7 @@ var _ = sigDescribe(feature.Windows, "Kubelet-Stats", skipUnlessWindows(func() { | ||||
|  | ||||
| 				ginkgo.By("Waiting up to 3 minutes for pods to be running") | ||||
| 				timeout := 3 * time.Minute | ||||
| 				err = e2epod.WaitForPodsRunningReady(ctx, f.ClientSet, f.Namespace.Name, 3, 0, timeout) | ||||
| 				err = e2epod.WaitForPodsRunningReady(ctx, f.ClientSet, f.Namespace.Name, 3, timeout) | ||||
| 				framework.ExpectNoError(err) | ||||
|  | ||||
| 				ginkgo.By("Getting kubelet stats 1 time") | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 Kubernetes Prow Robot
					Kubernetes Prow Robot