Merge pull request #119159 from alculquicondor/fix-job-uncounted

Only declare job as finished after removing all finalizers
2023-07-08 01:55:03 -07:00
parent 63e6d81314 f7a1fb76f4
commit 19a25bac05
2 changed files with 7 additions and 7 deletions
--- a/pkg/controller/job/job_controller.go
+++ b/pkg/controller/job/job_controller.go
@@ -792,12 +792,7 @@ func (jm *Controller) syncJob(ctx context.Context, key string) (rErr error) {
 	var manageJobErr error
 	var finishedCondition *batch.JobCondition

-	jobHasNewFailure := failed > job.Status.Failed
-	// new failures happen when status does not reflect the failures and active
-	// is different than parallelism, otherwise the previous controller loop
-	// failed updating status so even if we pick up failure it is not a new one
-	exceedsBackoffLimit := jobHasNewFailure && (active != *job.Spec.Parallelism) &&
-		(failed > *job.Spec.BackoffLimit)
+	exceedsBackoffLimit := failed > *job.Spec.BackoffLimit

 	if feature.DefaultFeatureGate.Enabled(features.JobPodFailurePolicy) {
 		if failureTargetCondition := findConditionByType(job.Status.Conditions, batch.JobFailureTarget); failureTargetCondition != nil {
@@ -999,6 +994,7 @@ func (jm *Controller) trackJobStatusAndRemoveFinalizers(ctx context.Context, job
 		needsFlush = true
 	}
 	podFailureCountByPolicyAction := map[string]int{}
+	reachedMaxUncountedPods := false
 	for _, pod := range pods {
 		if !hasJobTrackingFinalizer(pod) || expectedRmFinalizers.Has(string(pod.UID)) {
 			// This pod was processed in a previous sync.
@@ -1049,6 +1045,7 @@ func (jm *Controller) trackJobStatusAndRemoveFinalizers(ctx context.Context, job
 			//
 			// The job will be synced again because the Job status and Pod updates
 			// will put the Job back to the work queue.
+			reachedMaxUncountedPods = true
 			break
 		}
 	}
@@ -1077,7 +1074,7 @@ func (jm *Controller) trackJobStatusAndRemoveFinalizers(ctx context.Context, job
 	if job, needsFlush, err = jm.flushUncountedAndRemoveFinalizers(ctx, job, podsToRemoveFinalizer, uidsWithFinalizer, &oldCounters, podFailureCountByPolicyAction, needsFlush, newBackoffRecord); err != nil {
 		return err
 	}
-	jobFinished := jm.enactJobFinished(job, finishedCond)
+	jobFinished := !reachedMaxUncountedPods && jm.enactJobFinished(job, finishedCond)
 	if jobFinished {
 		needsFlush = true
 	}
--- a/test/integration/job/job_test.go
+++ b/test/integration/job/job_test.go
@@ -1341,6 +1341,9 @@ func TestOrphanPodsFinalizersClearedWithGC(t *testing.T) {
 }

 func TestFinalizersClearedWhenBackoffLimitExceeded(t *testing.T) {
+	// Set a maximum number of uncounted pods below parallelism, to ensure it
+	// doesn't affect the termination of pods.
+	t.Cleanup(setDuringTest(&jobcontroller.MaxUncountedPods, 50))
 	closeFn, restConfig, clientSet, ns := setup(t, "simple")
 	defer closeFn()
 	ctx, cancel := startJobControllerAndWaitForCaches(restConfig)