Merge pull request #124503 from neolit123/1.31-convert-create-job-preflight-to-warning
kubeadm: check for available nodes during 'CreateJob' preflight
This commit is contained in:
		| @@ -64,7 +64,7 @@ func (c *healthCheck) Name() string { | |||||||
| } | } | ||||||
|  |  | ||||||
| // CheckClusterHealth makes sure: | // CheckClusterHealth makes sure: | ||||||
| // - the API /healthz endpoint is healthy | // - the cluster can accept a workload | ||||||
| // - all control-plane Nodes are Ready | // - all control-plane Nodes are Ready | ||||||
| // - (if static pod-hosted) that all required Static Pod manifests exist on disk | // - (if static pod-hosted) that all required Static Pod manifests exist on disk | ||||||
| func CheckClusterHealth(client clientset.Interface, cfg *kubeadmapi.ClusterConfiguration, ignoreChecksErrors sets.Set[string], printer output.Printer) error { | func CheckClusterHealth(client clientset.Interface, cfg *kubeadmapi.ClusterConfiguration, ignoreChecksErrors sets.Set[string], printer output.Printer) error { | ||||||
| @@ -92,11 +92,18 @@ func CheckClusterHealth(client clientset.Interface, cfg *kubeadmapi.ClusterConfi | |||||||
| } | } | ||||||
|  |  | ||||||
| // createJob is a check that verifies that a Job can be created in the cluster | // createJob is a check that verifies that a Job can be created in the cluster | ||||||
| func createJob(client clientset.Interface, cfg *kubeadmapi.ClusterConfiguration) (lastError error) { | func createJob(client clientset.Interface, cfg *kubeadmapi.ClusterConfiguration) error { | ||||||
| 	const ( | 	const ( | ||||||
| 		prefix  = "upgrade-health-check" | 		prefix        = "upgrade-health-check" | ||||||
| 		ns      = metav1.NamespaceSystem | 		fieldSelector = "spec.unschedulable=false" | ||||||
| 		timeout = 15 * time.Second | 		ns            = metav1.NamespaceSystem | ||||||
|  | 		timeout       = 15 * time.Second | ||||||
|  | 	) | ||||||
|  | 	var ( | ||||||
|  | 		err, lastError error | ||||||
|  | 		ctx            = context.Background() | ||||||
|  | 		nodes          *v1.NodeList | ||||||
|  | 		listOptions    = metav1.ListOptions{Limit: 1, FieldSelector: fieldSelector} | ||||||
| 	) | 	) | ||||||
|  |  | ||||||
| 	// If client.Discovery().RESTClient() is nil, the fake client is used. | 	// If client.Discovery().RESTClient() is nil, the fake client is used. | ||||||
| @@ -106,6 +113,25 @@ func createJob(client clientset.Interface, cfg *kubeadmapi.ClusterConfiguration) | |||||||
| 		return nil | 		return nil | ||||||
| 	} | 	} | ||||||
|  |  | ||||||
|  | 	// Check if there is at least one Node where a Job's Pod can be scheduled. If not, skip this preflight check. | ||||||
|  | 	err = wait.PollUntilContextTimeout(ctx, time.Second*1, timeout, true, func(_ context.Context) (bool, error) { | ||||||
|  | 		nodes, err = client.CoreV1().Nodes().List(context.Background(), listOptions) | ||||||
|  | 		if err != nil { | ||||||
|  | 			klog.V(2).Infof("Could not list Nodes with field selector %q: %v", fieldSelector, err) | ||||||
|  | 			lastError = err | ||||||
|  | 			return false, nil | ||||||
|  | 		} | ||||||
|  | 		return true, nil | ||||||
|  | 	}) | ||||||
|  | 	if err != nil { | ||||||
|  | 		return errors.Wrap(lastError, "could not check if there is at least one Node that can schedule a test Pod") | ||||||
|  | 	} | ||||||
|  |  | ||||||
|  | 	if len(nodes.Items) == 0 { | ||||||
|  | 		klog.Warning("The preflight check \"CreateJob\" was skipped because there are no schedulable Nodes in the cluster.") | ||||||
|  | 		return nil | ||||||
|  | 	} | ||||||
|  |  | ||||||
| 	// Prepare Job | 	// Prepare Job | ||||||
| 	job := &batchv1.Job{ | 	job := &batchv1.Job{ | ||||||
| 		ObjectMeta: metav1.ObjectMeta{ | 		ObjectMeta: metav1.ObjectMeta{ | ||||||
| @@ -114,7 +140,7 @@ func createJob(client clientset.Interface, cfg *kubeadmapi.ClusterConfiguration) | |||||||
| 		}, | 		}, | ||||||
| 		Spec: batchv1.JobSpec{ | 		Spec: batchv1.JobSpec{ | ||||||
| 			BackoffLimit:            ptr.To[int32](0), | 			BackoffLimit:            ptr.To[int32](0), | ||||||
| 			TTLSecondsAfterFinished: ptr.To[int32](2), | 			TTLSecondsAfterFinished: ptr.To[int32](int32(timeout.Seconds()) + 5), // Make sure it's more than 'timeout'. | ||||||
| 			Template: v1.PodTemplateSpec{ | 			Template: v1.PodTemplateSpec{ | ||||||
| 				Spec: v1.PodSpec{ | 				Spec: v1.PodSpec{ | ||||||
| 					RestartPolicy: v1.RestartPolicyNever, | 					RestartPolicy: v1.RestartPolicyNever, | ||||||
| @@ -141,13 +167,11 @@ func createJob(client clientset.Interface, cfg *kubeadmapi.ClusterConfiguration) | |||||||
| 		}, | 		}, | ||||||
| 	} | 	} | ||||||
|  |  | ||||||
| 	ctx := context.Background() |  | ||||||
|  |  | ||||||
| 	// Create the Job, but retry if it fails | 	// Create the Job, but retry if it fails | ||||||
| 	klog.V(2).Infof("Creating a Job with the prefix %q in the namespace %q", prefix, ns) | 	klog.V(2).Infof("Creating a Job with the prefix %q in the namespace %q", prefix, ns) | ||||||
| 	var jobName string | 	var jobName string | ||||||
| 	err := wait.PollUntilContextTimeout(ctx, time.Second*1, timeout, true, func(ctx context.Context) (bool, error) { | 	err = wait.PollUntilContextTimeout(ctx, time.Second*1, timeout, true, func(_ context.Context) (bool, error) { | ||||||
| 		createdJob, err := client.BatchV1().Jobs(ns).Create(ctx, job, metav1.CreateOptions{}) | 		createdJob, err := client.BatchV1().Jobs(ns).Create(context.Background(), job, metav1.CreateOptions{}) | ||||||
| 		if err != nil { | 		if err != nil { | ||||||
| 			klog.V(2).Infof("Could not create a Job with the prefix %q in the namespace %q, retrying: %v", prefix, ns, err) | 			klog.V(2).Infof("Could not create a Job with the prefix %q in the namespace %q, retrying: %v", prefix, ns, err) | ||||||
| 			lastError = err | 			lastError = err | ||||||
| @@ -162,8 +186,8 @@ func createJob(client clientset.Interface, cfg *kubeadmapi.ClusterConfiguration) | |||||||
| 	} | 	} | ||||||
|  |  | ||||||
| 	// Wait for the Job to complete | 	// Wait for the Job to complete | ||||||
| 	err = wait.PollUntilContextTimeout(ctx, time.Second*1, timeout, true, func(ctx context.Context) (bool, error) { | 	err = wait.PollUntilContextTimeout(ctx, time.Second*1, timeout, true, func(_ context.Context) (bool, error) { | ||||||
| 		job, err := client.BatchV1().Jobs(ns).Get(ctx, jobName, metav1.GetOptions{}) | 		job, err := client.BatchV1().Jobs(ns).Get(context.Background(), jobName, metav1.GetOptions{}) | ||||||
| 		if err != nil { | 		if err != nil { | ||||||
| 			lastError = err | 			lastError = err | ||||||
| 			klog.V(2).Infof("could not get Job %q in the namespace %q, retrying: %v", jobName, ns, err) | 			klog.V(2).Infof("could not get Job %q in the namespace %q, retrying: %v", jobName, ns, err) | ||||||
| @@ -192,7 +216,7 @@ func controlPlaneNodesReady(client clientset.Interface, _ *kubeadmapi.ClusterCon | |||||||
| 	selectorControlPlane := labels.SelectorFromSet(map[string]string{ | 	selectorControlPlane := labels.SelectorFromSet(map[string]string{ | ||||||
| 		constants.LabelNodeRoleControlPlane: "", | 		constants.LabelNodeRoleControlPlane: "", | ||||||
| 	}) | 	}) | ||||||
| 	nodes, err := client.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{ | 	nodes, err := client.CoreV1().Nodes().List(context.Background(), metav1.ListOptions{ | ||||||
| 		LabelSelector: selectorControlPlane.String(), | 		LabelSelector: selectorControlPlane.String(), | ||||||
| 	}) | 	}) | ||||||
| 	if err != nil { | 	if err != nil { | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user
	 Kubernetes Prow Robot
					Kubernetes Prow Robot