Merge pull request #124503 from neolit123/1.31-convert-create-job-preflight-to-warning

kubeadm: check for available nodes during 'CreateJob' preflight
This commit is contained in:
Kubernetes Prow Robot 2024-04-26 08:49:26 -07:00 committed by GitHub
commit bae83009d3
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -64,7 +64,7 @@ func (c *healthCheck) Name() string {
}
// CheckClusterHealth makes sure:
// - the API /healthz endpoint is healthy
// - the cluster can accept a workload
// - all control-plane Nodes are Ready
// - (if static pod-hosted) that all required Static Pod manifests exist on disk
func CheckClusterHealth(client clientset.Interface, cfg *kubeadmapi.ClusterConfiguration, ignoreChecksErrors sets.Set[string], printer output.Printer) error {
@ -92,11 +92,18 @@ func CheckClusterHealth(client clientset.Interface, cfg *kubeadmapi.ClusterConfi
}
// createJob is a check that verifies that a Job can be created in the cluster
func createJob(client clientset.Interface, cfg *kubeadmapi.ClusterConfiguration) (lastError error) {
func createJob(client clientset.Interface, cfg *kubeadmapi.ClusterConfiguration) error {
const (
prefix = "upgrade-health-check"
ns = metav1.NamespaceSystem
timeout = 15 * time.Second
prefix = "upgrade-health-check"
fieldSelector = "spec.unschedulable=false"
ns = metav1.NamespaceSystem
timeout = 15 * time.Second
)
var (
err, lastError error
ctx = context.Background()
nodes *v1.NodeList
listOptions = metav1.ListOptions{Limit: 1, FieldSelector: fieldSelector}
)
// If client.Discovery().RESTClient() is nil, the fake client is used.
@ -106,6 +113,25 @@ func createJob(client clientset.Interface, cfg *kubeadmapi.ClusterConfiguration)
return nil
}
// Check if there is at least one Node where a Job's Pod can schedule. If not, skip this preflight check.
err = wait.PollUntilContextTimeout(ctx, time.Second*1, timeout, true, func(_ context.Context) (bool, error) {
nodes, err = client.CoreV1().Nodes().List(context.Background(), listOptions)
if err != nil {
klog.V(2).Infof("Could not list Nodes with field selector %q: %v", fieldSelector, err)
lastError = err
return false, nil
}
return true, nil
})
if err != nil {
return errors.Wrap(lastError, "could not check if there is at least one Node that can schedule a test Pod")
}
if len(nodes.Items) == 0 {
klog.Warning("The preflight check \"CreateJob\" was skipped because there are no schedulable Nodes in the cluster.")
return nil
}
// Prepare Job
job := &batchv1.Job{
ObjectMeta: metav1.ObjectMeta{
@ -114,7 +140,7 @@ func createJob(client clientset.Interface, cfg *kubeadmapi.ClusterConfiguration)
},
Spec: batchv1.JobSpec{
BackoffLimit: ptr.To[int32](0),
TTLSecondsAfterFinished: ptr.To[int32](2),
TTLSecondsAfterFinished: ptr.To[int32](int32(timeout.Seconds()) + 5), // Make sure it's more than 'timeout'.
Template: v1.PodTemplateSpec{
Spec: v1.PodSpec{
RestartPolicy: v1.RestartPolicyNever,
@ -141,13 +167,11 @@ func createJob(client clientset.Interface, cfg *kubeadmapi.ClusterConfiguration)
},
}
ctx := context.Background()
// Create the Job, but retry if it fails
klog.V(2).Infof("Creating a Job with the prefix %q in the namespace %q", prefix, ns)
var jobName string
err := wait.PollUntilContextTimeout(ctx, time.Second*1, timeout, true, func(ctx context.Context) (bool, error) {
createdJob, err := client.BatchV1().Jobs(ns).Create(ctx, job, metav1.CreateOptions{})
err = wait.PollUntilContextTimeout(ctx, time.Second*1, timeout, true, func(_ context.Context) (bool, error) {
createdJob, err := client.BatchV1().Jobs(ns).Create(context.Background(), job, metav1.CreateOptions{})
if err != nil {
klog.V(2).Infof("Could not create a Job with the prefix %q in the namespace %q, retrying: %v", prefix, ns, err)
lastError = err
@ -162,8 +186,8 @@ func createJob(client clientset.Interface, cfg *kubeadmapi.ClusterConfiguration)
}
// Wait for the Job to complete
err = wait.PollUntilContextTimeout(ctx, time.Second*1, timeout, true, func(ctx context.Context) (bool, error) {
job, err := client.BatchV1().Jobs(ns).Get(ctx, jobName, metav1.GetOptions{})
err = wait.PollUntilContextTimeout(ctx, time.Second*1, timeout, true, func(_ context.Context) (bool, error) {
job, err := client.BatchV1().Jobs(ns).Get(context.Background(), jobName, metav1.GetOptions{})
if err != nil {
lastError = err
klog.V(2).Infof("could not get Job %q in the namespace %q, retrying: %v", jobName, ns, err)
@ -192,7 +216,7 @@ func controlPlaneNodesReady(client clientset.Interface, _ *kubeadmapi.ClusterCon
selectorControlPlane := labels.SelectorFromSet(map[string]string{
constants.LabelNodeRoleControlPlane: "",
})
nodes, err := client.CoreV1().Nodes().List(context.TODO(), metav1.ListOptions{
nodes, err := client.CoreV1().Nodes().List(context.Background(), metav1.ListOptions{
LabelSelector: selectorControlPlane.String(),
})
if err != nil {