Merge pull request #124503 from neolit123/1.31-convert-create-job-preflight-to-warning
kubeadm: check for available nodes during 'CreateJob' preflight
@@ -64,7 +64,7 @@ func (c *healthCheck) Name() string {
 }
 
 // CheckClusterHealth makes sure:
-// - the API /healthz endpoint is healthy
+// - the cluster can accept a workload
 // - all control-plane Nodes are Ready
 // - (if static pod-hosted) that all required Static Pod manifests exist on disk
 func CheckClusterHealth(client clientset.Interface, cfg *kubeadmapi.ClusterConfiguration, ignoreChecksErrors sets.Set[string], printer output.Printer) error {
@@ -92,11 +92,18 @@ func CheckClusterHealth(client clientset.Interface, cfg *kubeadmapi.ClusterConfi
 }
 
 // createJob is a check that verifies that a Job can be created in the cluster
-func createJob(client clientset.Interface, cfg *kubeadmapi.ClusterConfiguration) (lastError error) {
+func createJob(client clientset.Interface, cfg *kubeadmapi.ClusterConfiguration) error {
 	const (
-		prefix  = "upgrade-health-check"
-		ns      = metav1.NamespaceSystem
-		timeout = 15 * time.Second
+		prefix        = "upgrade-health-check"
+		fieldSelector = "spec.unschedulable=false"
+		ns            = metav1.NamespaceSystem
+		timeout       = 15 * time.Second
 	)
+	var (
+		err, lastError error
+		ctx            = context.Background()
+		nodes          *v1.NodeList
+		listOptions    = metav1.ListOptions{Limit: 1, FieldSelector: fieldSelector}
+	)
 
 	// If client.Discovery().RESTClient() is nil, the fake client is used.
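
The new fieldSelector/listOptions pair keeps the added preflight cheap: the API server filters out cordoned Nodes (spec.unschedulable=true) and returns at most one item, since the check only needs to know whether any schedulable Node exists. A minimal standalone sketch of the same query, outside the diff, assuming a kubeconfig at a placeholder path:

package main

import (
	"context"
	"fmt"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/tools/clientcmd"
)

func main() {
	// Build a client from a local kubeconfig (placeholder path).
	config, err := clientcmd.BuildConfigFromFlags("", "/path/to/kubeconfig")
	if err != nil {
		panic(err)
	}
	client := kubernetes.NewForConfigOrDie(config)

	// Ask the API server for at most one schedulable Node; the field
	// selector excludes cordoned Nodes (spec.unschedulable=true).
	nodes, err := client.CoreV1().Nodes().List(context.Background(), metav1.ListOptions{
		Limit:         1,
		FieldSelector: "spec.unschedulable=false",
	})
	if err != nil {
		panic(err)
	}
	fmt.Printf("schedulable Nodes found: %d\n", len(nodes.Items))
}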
@@ -106,6 +113,25 @@ func createJob(client clientset.Interface, cfg *kubeadmapi.ClusterConfiguration)
 		return nil
 	}
 
+	// Check if there is at least one Node where a Job's Pod can schedule. If not, skip this preflight check.
+	err = wait.PollUntilContextTimeout(ctx, time.Second*1, timeout, true, func(_ context.Context) (bool, error) {
+		nodes, err = client.CoreV1().Nodes().List(context.Background(), listOptions)
+		if err != nil {
+			klog.V(2).Infof("Could not list Nodes with field selector %q: %v", fieldSelector, err)
+			lastError = err
+			return false, nil
+		}
+		return true, nil
+	})
+	if err != nil {
+		return errors.Wrap(lastError, "could not check if there is at least one Node that can schedule a test Pod")
+	}
+
+	if len(nodes.Items) == 0 {
+		klog.Warning("The preflight check \"CreateJob\" was skipped because there are no schedulable Nodes in the cluster.")
+		return nil
+	}
+
 	// Prepare Job
 	job := &batchv1.Job{
 		ObjectMeta: metav1.ObjectMeta{
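
Note the error handling in the new block: once the deadline passes, wait.PollUntilContextTimeout reports only a generic timeout error, so the loop stashes the most recent API failure in lastError and wraps that instead, which yields a far more useful message. A self-contained sketch of the pattern, with the hypothetical flakyCall standing in for the Nodes().List call:

package main

import (
	"context"
	"fmt"
	"time"

	"github.com/pkg/errors"
	"k8s.io/apimachinery/pkg/util/wait"
)

// flakyCall stands in for any API call that may fail transiently.
func flakyCall() error { return fmt.Errorf("server not ready") }

func main() {
	var lastError error
	// Poll once per second for up to 5 seconds; on timeout the poll
	// returns only a generic timeout error, so keep the real cause around.
	err := wait.PollUntilContextTimeout(context.Background(), time.Second, 5*time.Second, true,
		func(_ context.Context) (bool, error) {
			if err := flakyCall(); err != nil {
				lastError = err   // remember the most recent failure
				return false, nil // keep polling
			}
			return true, nil // done
		})
	if err != nil {
		// Wrap the informative error, not the generic timeout error.
		fmt.Println(errors.Wrap(lastError, "polling failed"))
	}
}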
@@ -114,7 +140,7 @@ func createJob(client clientset.Interface, cfg *kubeadmapi.ClusterConfiguration)
 		},
 		Spec: batchv1.JobSpec{
 			BackoffLimit:            ptr.To[int32](0),
-			TTLSecondsAfterFinished: ptr.To[int32](2),
+			TTLSecondsAfterFinished: ptr.To[int32](int32(timeout.Seconds()) + 5), // Make sure it's more than 'timeout'.
 			Template: v1.PodTemplateSpec{
 				Spec: v1.PodSpec{
 					RestartPolicy: v1.RestartPolicyNever,
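
With timeout = 15 * time.Second, the new expression evaluates to int32(15) + 5 = 20 seconds of TTL, so a finished Job now outlives the 15-second completion poll below, instead of being garbage-collected two seconds after finishing, which could race with the final Get.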
@@ -141,13 +167,11 @@ func createJob(client clientset.Interface, cfg *kubeadmapi.ClusterConfiguration)
 		},
 	}
 
-	ctx := context.Background()
-
 	// Create the Job, but retry if it fails
 	klog.V(2).Infof("Creating a Job with the prefix %q in the namespace %q", prefix, ns)
 	var jobName string
-	err := wait.PollUntilContextTimeout(ctx, time.Second*1, timeout, true, func(ctx context.Context) (bool, error) {
-		createdJob, err := client.BatchV1().Jobs(ns).Create(ctx, job, metav1.CreateOptions{})
+	err = wait.PollUntilContextTimeout(ctx, time.Second*1, timeout, true, func(_ context.Context) (bool, error) {
+		createdJob, err := client.BatchV1().Jobs(ns).Create(context.Background(), job, metav1.CreateOptions{})
 		if err != nil {
 			klog.V(2).Infof("Could not create a Job with the prefix %q in the namespace %q, retrying: %v", prefix, ns, err)
 			lastError = err
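
The prefix constant and the createdJob name capture suggest the Job's name is generated server-side; in the Kubernetes API that is done with metav1.ObjectMeta.GenerateName. A hedged sketch of that mechanism only (the actual kubeadm ObjectMeta is elided from this hunk and may differ):

package main

import (
	"fmt"

	batchv1 "k8s.io/api/batch/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

func main() {
	// Sketch only: with GenerateName set (note the trailing hyphen), the API
	// server appends a random suffix on Create, e.g. "upgrade-health-check-x7k2q".
	// That is why the preflight code must read the name back from the created
	// object before it can Get the Job again in the completion loop.
	job := &batchv1.Job{
		ObjectMeta: metav1.ObjectMeta{
			GenerateName: "upgrade-health-check-",
			Namespace:    metav1.NamespaceSystem,
		},
	}
	fmt.Println(job.GenerateName)
}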
@@ -162,8 +186,8 @@ func createJob(client clientset.Interface, cfg *kubeadmapi.ClusterConfiguration)
 	}
 
 	// Wait for the Job to complete
-	err = wait.PollUntilContextTimeout(ctx, time.Second*1, timeout, true, func(ctx context.Context) (bool, error) {
-		job, err := client.BatchV1().Jobs(ns).Get(ctx, jobName, metav1.GetOptions{})
+	err = wait.PollUntilContextTimeout(ctx, time.Second*1, timeout, true, func(_ context.Context) (bool, error) {
+		job, err := client.BatchV1().Jobs(ns).Get(context.Background(), jobName, metav1.GetOptions{})
 		if err != nil {
 			lastError = err
 			klog.V(2).Infof("could not get Job %q in the namespace %q, retrying: %v", jobName, ns, err)
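
The hunk cuts off before the completion test, but the conventional way to decide that a Job has finished is to scan job.Status.Conditions for JobComplete or JobFailed. A sketch of that standard pattern, assuming the elided code does something similar:

package main

import (
	"fmt"

	batchv1 "k8s.io/api/batch/v1"
	v1 "k8s.io/api/core/v1"
)

// jobDone reports whether a Job has reached a terminal condition.
// A sketch of the common pattern; the elided kubeadm code may differ.
func jobDone(job *batchv1.Job) (done bool, failed bool) {
	for _, cond := range job.Status.Conditions {
		if cond.Status != v1.ConditionTrue {
			continue
		}
		switch cond.Type {
		case batchv1.JobComplete:
			return true, false
		case batchv1.JobFailed:
			return true, true
		}
	}
	return false, false
}

func main() {
	job := &batchv1.Job{
		Status: batchv1.JobStatus{
			Conditions: []batchv1.JobCondition{
				{Type: batchv1.JobComplete, Status: v1.ConditionTrue},
			},
		},
	}
	done, failed := jobDone(job)
	fmt.Printf("done=%v failed=%v\n", done, failed)
}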
@@ -192,7 +216,7 @@ func controlPlaneNodesReady(client clientset.Interface, _ *kubeadmapi.ClusterCon
 	selectorControlPlane := labels.SelectorFromSet(map[string]string{
 		constants.LabelNodeRoleControlPlane: "",
 	})
-	nodes, err := client.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{
+	nodes, err := client.CoreV1().Nodes().List(context.Background(), metav1.ListOptions{
 		LabelSelector: selectorControlPlane.String(),
 	})
 	if err != nil {
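
For reference, constants.LabelNodeRoleControlPlane is the well-known node-role.kubernetes.io/control-plane label (set with an empty value), and labels.SelectorFromSet serializes the one-entry map into a string selector the API server can evaluate. A small sketch of that serialization:

package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/labels"
)

func main() {
	// The kubeadm constant LabelNodeRoleControlPlane is the well-known
	// "node-role.kubernetes.io/control-plane" label with an empty value.
	sel := labels.SelectorFromSet(map[string]string{
		"node-role.kubernetes.io/control-plane": "",
	})
	// Prints: node-role.kubernetes.io/control-plane=
	fmt.Println(sel.String())
}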