Merge pull request #60202 from clamoriniere1A/feature/JobBackoffWithParallelism

Automatic merge from submit-queue (batch tested with PRs 60054, 60202, 60219, 58090, 60275). If you want to cherry-pick this change to another branch, please follow the instructions <a href="https://github.com/kubernetes/community/blob/master/contributors/devel/cherry-picks.md">here</a>.

Improves backoff policy in JobController

**What this PR does / why we need it**:
This PR fixes issue #56853. It improves the Job backoff policy when a Job is configured to allow parallelism and some of its pods fail while others succeed.
The controller now checks whether the number of succeeded pods has increased since the last check; if it has, the backoff delay is cleared.

**Which issue(s) this PR fixes**:
Fixes #56853

**Special notes for your reviewer**:

**Release note**:
```release-note
NONE
```
This commit is contained in:
Kubernetes Submit Queue
2018-02-23 23:15:37 -08:00
committed by GitHub
2 changed files with 17 additions and 4 deletions

View File

@@ -553,6 +553,14 @@ func (jm *JobController) syncJob(key string) (bool, error) {
}
forget := false
// Check whether the number of succeeded pods has increased since the last check. If it has, "forget" should be true.
// This logic is linked to the issue https://github.com/kubernetes/kubernetes/issues/56853, which aims to
// improve the Job backoff policy when parallelism > 1 and some pods fail while others succeed.
// In this case, we should clear the backoff delay.
if job.Status.Succeeded < succeeded {
forget = true
}
// no need to update the job if the status hasn't changed since last time
if job.Status.Active != active || job.Status.Succeeded != succeeded || job.Status.Failed != failed || len(job.Status.Conditions) != conditions {
job.Status.Active = active
@@ -560,12 +568,12 @@ func (jm *JobController) syncJob(key string) (bool, error) {
job.Status.Failed = failed
if err := jm.updateHandler(&job); err != nil {
return false, err
return forget, err
}
if jobHaveNewFailure && !IsJobFinished(&job) {
// returning an error will re-enqueue Job after the backoff period
return false, fmt.Errorf("failed pod(s) detected for job key %q", key)
return forget, fmt.Errorf("failed pod(s) detected for job key %q", key)
}
forget = true

View File

@@ -218,11 +218,16 @@ func TestControllerSyncJob(t *testing.T) {
fmt.Errorf("Fake error"), true, 0, 3, 0, 0,
0, 1, 3, 0, 0, nil, "",
},
"failed pod": {
"failed + succeed pods: reset backoff delay": {
2, 5, 6, false, 0,
fmt.Errorf("Fake error"), false, 0, 1, 1, 1,
fmt.Errorf("Fake error"), true, 0, 1, 1, 1,
1, 0, 1, 1, 1, nil, "",
},
"only new failed pod": {
2, 5, 6, false, 0,
fmt.Errorf("Fake error"), false, 0, 1, 0, 1,
1, 0, 1, 0, 1, nil, "",
},
"job finish": {
2, 5, 6, false, 0,
nil, true, 0, 0, 5, 0,