Merge pull request #123242 from mimowo/fast-backoff-for-replacment-policy-tests
Improve accuracy of the PodsCreationTotal metric and use fast pod failure backoff for ReplacementPolicy integration tests
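The controller change threads a freshly computed failed-pod count through syncJobCtx instead of reading job.Status.Failed, which only reflects the last written status. Below is a toy walkthrough of the arithmetic from the syncJob hunk that follows, with made-up numbers; the variable names mirror the diff, but this is a standalone sketch, not controller code:

package main

import "fmt"

func main() {
	statusFailed := int32(1)    // failures already recorded in job.Status.Failed
	newlyFailed := int32(2)     // pods found failed during this sync (nonIgnoredFailedPodsCount)
	uncountedFailed := int32(1) // failures tracked via finalizers, not yet written to status

	// Mirrors: jobCtx.failed = job.Status.Failed + nonIgnoredFailedPodsCount(...) + len(jobCtx.uncounted.failed)
	failed := statusFailed + newlyFailed + uncountedFailed
	fmt.Println(failed) // 4; job.Status.Failed alone would report 1
}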
pkg/controller/job/job_controller.go:

@@ -130,6 +130,7 @@ type syncJobCtx struct {
 	finishedCondition               *batch.JobCondition
 	activePods                      []*v1.Pod
 	succeeded                       int32
+	failed                          int32
 	prevSucceededIndexes            orderedIntervals
 	succeededIndexes                orderedIntervals
 	failedIndexes                   *orderedIntervals
@@ -790,7 +791,7 @@ func (jm *Controller) syncJob(ctx context.Context, key string) (rErr error) {
 	active := int32(len(jobCtx.activePods))
 	newSucceededPods, newFailedPods := getNewFinishedPods(jobCtx)
 	jobCtx.succeeded = job.Status.Succeeded + int32(len(newSucceededPods)) + int32(len(jobCtx.uncounted.succeeded))
-	failed := job.Status.Failed + int32(nonIgnoredFailedPodsCount(jobCtx, newFailedPods)) + int32(len(jobCtx.uncounted.failed))
+	jobCtx.failed = job.Status.Failed + int32(nonIgnoredFailedPodsCount(jobCtx, newFailedPods)) + int32(len(jobCtx.uncounted.failed))
 	var ready *int32
 	if feature.DefaultFeatureGate.Enabled(features.JobReadyPods) {
 		ready = ptr.To(countReadyPods(jobCtx.activePods))
@@ -806,7 +807,7 @@ func (jm *Controller) syncJob(ctx context.Context, key string) (rErr error) {
 
 	var manageJobErr error
 
-	exceedsBackoffLimit := failed > *job.Spec.BackoffLimit
+	exceedsBackoffLimit := jobCtx.failed > *job.Spec.BackoffLimit
 
 	if feature.DefaultFeatureGate.Enabled(features.JobPodFailurePolicy) {
 		if failureTargetCondition := findConditionByType(job.Status.Conditions, batch.JobFailureTarget); failureTargetCondition != nil {
@@ -1618,7 +1619,7 @@ func (jm *Controller) manageJob(ctx context.Context, job *batch.Job, jobCtx *syncJobCtx
 			}
 			diff -= batchSize
 		}
-		recordJobPodsCreationTotal(job, creationsSucceeded, creationsFailed)
+		recordJobPodsCreationTotal(job, jobCtx, creationsSucceeded, creationsFailed)
 		return active, metrics.JobSyncActionPodsCreated, errorFromChannel(errCh)
 	}
 
@@ -1917,16 +1918,13 @@ func (jm *Controller) cleanupPodFinalizers(job *batch.Job) {
 	}
 }
 
-func recordJobPodsCreationTotal(job *batch.Job, succeeded, failed int32) {
+func recordJobPodsCreationTotal(job *batch.Job, jobCtx *syncJobCtx, succeeded, failed int32) {
 	reason := metrics.PodCreateNew
 	if feature.DefaultFeatureGate.Enabled(features.JobPodReplacementPolicy) {
-		podsTerminating := job.Status.Terminating != nil && *job.Status.Terminating > 0
-		isRecreateAction := podsTerminating || job.Status.Failed > 0
-		if isRecreateAction {
+		if ptr.Deref(job.Spec.PodReplacementPolicy, batch.TerminatingOrFailed) == batch.Failed && jobCtx.failed > 0 {
+			reason = metrics.PodRecreateFailed
+		} else if jobCtx.failed > 0 || ptr.Deref(jobCtx.terminating, 0) > 0 {
 			reason = metrics.PodRecreateTerminatingOrFailed
-			if *job.Spec.PodReplacementPolicy == batch.Failed {
-				reason = metrics.PodRecreateFailed
-			}
 		}
 	}
 	if succeeded > 0 {
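For readers following the recordJobPodsCreationTotal hunk above, here is a minimal, self-contained model of the new reason selection. The replacementPolicy type, the deref helper, and the string values are local stand-ins for batch.PodReplacementPolicy, ptr.Deref, and the metrics.* constants; treat this as a sketch of the branch logic, not the controller's API:

package main

import "fmt"

type replacementPolicy string

const (
	terminatingOrFailed replacementPolicy = "TerminatingOrFailed"
	failedOnly          replacementPolicy = "Failed"
)

// deref mimics ptr.Deref: dereference p, or fall back to def when p is nil.
func deref[T any](p *T, def T) T {
	if p == nil {
		return def
	}
	return *p
}

// creationReason mirrors the updated branch structure: classify from the
// freshly computed failed and terminating counts carried in syncJobCtx,
// rather than from the possibly stale job.Status fields the old code read.
func creationReason(policy *replacementPolicy, failed int32, terminating *int32) string {
	if deref(policy, terminatingOrFailed) == failedOnly && failed > 0 {
		return "PodRecreateFailed"
	}
	if failed > 0 || deref(terminating, 0) > 0 {
		return "PodRecreateTerminatingOrFailed"
	}
	return "PodCreateNew"
}

func main() {
	p := failedOnly
	term := int32(1)
	// Failed policy with terminating-but-not-failed pods: the old code
	// labeled this a failure-driven recreate; the new code does not.
	fmt.Println(creationReason(&p, 0, &term)) // PodRecreateTerminatingOrFailed
	fmt.Println(creationReason(&p, 2, &term)) // PodRecreateFailed
	fmt.Println(creationReason(nil, 0, nil))  // PodCreateNew
}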
test/integration/job/job_test.go:

@@ -1587,6 +1587,7 @@ func TestIndexedJob(t *testing.T) {
 }
 
 func TestJobPodReplacementPolicy(t *testing.T) {
+	t.Cleanup(setDurationDuringTest(&jobcontroller.DefaultJobPodFailureBackOff, fastPodFailureBackoff))
 	indexedCompletion := batchv1.IndexedCompletion
 	nonIndexedCompletion := batchv1.NonIndexedCompletion
 	var podReplacementPolicy = func(obj batchv1.PodReplacementPolicy) *batchv1.PodReplacementPolicy {
@@ -1632,7 +1633,7 @@ func TestJobPodReplacementPolicy(t *testing.T) {
 				new: 4,
 			},
 		},
-		"feature flag true, TerminatingOrFailed policy, delete & fail pods, recreate terminating pods, and verify job status counters": {
+		"feature flag true with IndexedJob, TerminatingOrFailed policy, delete & fail pods, recreate terminating pods, and verify job status counters": {
 			podReplacementPolicyEnabled: true,
 			jobSpec: &batchv1.JobSpec{
 				Parallelism:          ptr.To[int32](2),
@@ -1665,7 +1666,7 @@ func TestJobPodReplacementPolicy(t *testing.T) {
 			jobSpec: &batchv1.JobSpec{
 				Parallelism:          ptr.To[int32](2),
 				Completions:          ptr.To[int32](2),
-				CompletionMode:       &indexedCompletion,
+				CompletionMode:       &nonIndexedCompletion,
 				PodReplacementPolicy: podReplacementPolicy(batchv1.TerminatingOrFailed),
 				Template: v1.PodTemplateSpec{
 					ObjectMeta: metav1.ObjectMeta{
@@ -1846,6 +1847,7 @@ func TestJobPodReplacementPolicy(t *testing.T) {
 // Disable reverts to previous behavior.
 // Enabling will then match the original failed case.
 func TestJobPodReplacementPolicyFeatureToggling(t *testing.T) {
+	t.Cleanup(setDurationDuringTest(&jobcontroller.DefaultJobPodFailureBackOff, fastPodFailureBackoff))
 	const podCount int32 = 2
 	jobSpec := batchv1.JobSpec{
 		Parallelism:          ptr.To(podCount),
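Both tests now shorten the job controller's default pod failure backoff so that replacement pods are created quickly enough for integration-test timeouts. Judging from the call sites above (t.Cleanup(setDurationDuringTest(...))), the helper presumably swaps a package-level duration and returns a restore function; a plausible sketch, assuming that shape rather than quoting the actual test utility:

package job

import "time"

// setDurationDuringTest overrides a package-level duration (here,
// jobcontroller.DefaultJobPodFailureBackOff) for the test's lifetime and
// returns a function that restores the original value, suitable for t.Cleanup.
func setDurationDuringTest(val *time.Duration, newVal time.Duration) func() {
	origVal := *val
	*val = newVal
	return func() {
		*val = origVal
	}
}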