Merge pull request #60202 from clamoriniere1A/feature/JobBackoffWithParallelism
Automatic merge from submit-queue (batch tested with PRs 60054, 60202, 60219, 58090, 60275). If you want to cherry-pick this change to another branch, please follow the instructions <a href="https://github.com/kubernetes/community/blob/master/contributors/devel/cherry-picks.md">here</a>. Improves backoff policy in JobController **What this PR does / why we need it**: This PR fixes issue #56853. It improves the "Job backoff policy" when a Job is configured to allow parallelism and a few of the Job's pods fail while others succeed. Now, it checks whether the number of succeeded pods has increased since the last check. If so, the backoff delay is cleared. **Which issue(s) this PR fixes**: Fixes #56853 **Special notes for your reviewer**: **Release note**: ```release-note NONE ```
This commit is contained in:
		| @@ -553,6 +553,14 @@ func (jm *JobController) syncJob(key string) (bool, error) { | ||||
| 	} | ||||
|  | ||||
| 	forget := false | ||||
| 	// Check whether the number of succeeded pods has increased since the last check. If so, "forget" should be true. | ||||
| 	// This logic is linked to the issue https://github.com/kubernetes/kubernetes/issues/56853, which aims to | ||||
| 	// improve the Job backoff policy when parallelism > 1 and a few of the Job's pods fail while others succeed. | ||||
| 	// In this case, we should clear the backoff delay. | ||||
| 	if job.Status.Succeeded < succeeded { | ||||
| 		forget = true | ||||
| 	} | ||||
|  | ||||
| 	// no need to update the job if the status hasn't changed since last time | ||||
| 	if job.Status.Active != active || job.Status.Succeeded != succeeded || job.Status.Failed != failed || len(job.Status.Conditions) != conditions { | ||||
| 		job.Status.Active = active | ||||
| @@ -560,12 +568,12 @@ func (jm *JobController) syncJob(key string) (bool, error) { | ||||
| 		job.Status.Failed = failed | ||||
|  | ||||
| 		if err := jm.updateHandler(&job); err != nil { | ||||
| 			return false, err | ||||
| 			return forget, err | ||||
| 		} | ||||
|  | ||||
| 		if jobHaveNewFailure && !IsJobFinished(&job) { | ||||
| 			// returning an error will re-enqueue Job after the backoff period | ||||
| 			return false, fmt.Errorf("failed pod(s) detected for job key %q", key) | ||||
| 			return forget, fmt.Errorf("failed pod(s) detected for job key %q", key) | ||||
| 		} | ||||
|  | ||||
| 		forget = true | ||||
|   | ||||
| @@ -218,11 +218,16 @@ func TestControllerSyncJob(t *testing.T) { | ||||
| 			fmt.Errorf("Fake error"), true, 0, 3, 0, 0, | ||||
| 			0, 1, 3, 0, 0, nil, "", | ||||
| 		}, | ||||
| 		"failed pod": { | ||||
| 		"failed + succeed pods: reset backoff delay": { | ||||
| 			2, 5, 6, false, 0, | ||||
| 			fmt.Errorf("Fake error"), false, 0, 1, 1, 1, | ||||
| 			fmt.Errorf("Fake error"), true, 0, 1, 1, 1, | ||||
| 			1, 0, 1, 1, 1, nil, "", | ||||
| 		}, | ||||
| 		"only new failed pod": { | ||||
| 			2, 5, 6, false, 0, | ||||
| 			fmt.Errorf("Fake error"), false, 0, 1, 0, 1, | ||||
| 			1, 0, 1, 0, 1, nil, "", | ||||
| 		}, | ||||
| 		"job finish": { | ||||
| 			2, 5, 6, false, 0, | ||||
| 			nil, true, 0, 0, 5, 0, | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 Kubernetes Submit Queue
					Kubernetes Submit Queue