Remove GET job and retries for status updates.

Doing a GET right before retrying has two problems:
- It can mask conflicts
- It adds an additional delay

As for retries, we are better off going through the sync backoff.

In the case of a conflict, we know there was a concurrent Job update that will trigger another sync, so there is no need for a rate-limited requeue.
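This retry policy is small enough to sketch. A minimal sketch, assuming a workqueue-based controller loop; the controller type and the handleSyncError helper are illustrative stand-ins, not the Job controller's actual fields:

package sketch

import (
	apierrors "k8s.io/apimachinery/pkg/api/errors"
	"k8s.io/client-go/util/workqueue"
)

// controller stands in for the Job controller; only what is needed to
// show the requeue decision is included.
type controller struct {
	queue workqueue.RateLimitingInterface
}

// handleSyncError applies the policy above: retry through the regular
// sync backoff, except on conflicts, where the concurrent Job update
// will enqueue another sync on its own.
func (c *controller) handleSyncError(key string, err error) {
	if err == nil || apierrors.IsConflict(err) {
		// Either the sync succeeded, or a Job update is already on
		// its way to trigger another sync; no requeue needed.
		c.queue.Forget(key)
		return
	}
	// Generic errors go through the rate-limited sync backoff.
	c.queue.AddRateLimited(key)
}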
Aldo Culquicondor, 2021-09-23 11:42:30 -04:00
parent 2541fcf256
commit eebd678cda
2 changed files with 91 additions and 73 deletions
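
Throughout the diff below, the test seam manager.updateStatusHandler changes shape: instead of returning only an error, it now also returns the updated Job, mirroring a real UpdateStatus call whose response already carries the fresh ResourceVersion. That returned object is what lets the controller drop the GET before retrying. A minimal sketch of the underlying call, assuming a typed clientset (updateJobStatus is an illustrative name):

package sketch

import (
	"context"

	batch "k8s.io/api/batch/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	clientset "k8s.io/client-go/kubernetes"
)

// updateJobStatus pushes job.Status and returns the server's version of
// the Job; its ResourceVersion reflects the write, so callers can keep
// using the returned object instead of re-fetching before a retry.
func updateJobStatus(ctx context.Context, c clientset.Interface, job *batch.Job) (*batch.Job, error) {
	return c.BatchV1().Jobs(job.Namespace).UpdateStatus(ctx, job, metav1.UpdateOptions{})
}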


@@ -29,6 +29,7 @@ import (
 	batch "k8s.io/api/batch/v1"
 	"k8s.io/api/core/v1"
 	apiequality "k8s.io/apimachinery/pkg/api/equality"
+	apierrors "k8s.io/apimachinery/pkg/api/errors"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/runtime/schema"
 	"k8s.io/apimachinery/pkg/types"
@@ -745,9 +746,9 @@ func TestControllerSyncJob(t *testing.T) {
 			setPodsStatusesWithIndexes(podIndexer, job, tc.podsWithIndexes)
 			actual := job
-			manager.updateStatusHandler = func(job *batch.Job) error {
+			manager.updateStatusHandler = func(job *batch.Job) (*batch.Job, error) {
 				actual = job
-				return nil
+				return job, nil
 			}
 			// run
@@ -976,9 +977,9 @@ func TestSyncJobLegacyTracking(t *testing.T) {
 			sharedInformerFactory.Batch().V1().Jobs().Informer().GetIndexer().Add(&tc.job)
 			var actual *batch.Job
-			manager.updateStatusHandler = func(job *batch.Job) error {
+			manager.updateStatusHandler = func(job *batch.Job) (*batch.Job, error) {
 				actual = job
-				return nil
+				return job, nil
 			}
 			// Run.
@@ -1527,9 +1528,9 @@ func TestTrackJobStatusAndRemoveFinalizers(t *testing.T) {
 			fakePodControl := controller.FakePodControl{Err: tc.podControlErr}
 			manager.podControl = &fakePodControl
 			var statusUpdates []batch.JobStatus
-			manager.updateStatusHandler = func(job *batch.Job) error {
+			manager.updateStatusHandler = func(job *batch.Job) (*batch.Job, error) {
 				statusUpdates = append(statusUpdates, *job.Status.DeepCopy())
-				return tc.statusUpdateErr
+				return job, tc.statusUpdateErr
 			}
 			if tc.job.Status.UncountedTerminatedPods == nil {
@@ -1654,9 +1655,9 @@ func TestSyncJobPastDeadline(t *testing.T) {
 			manager.podStoreSynced = alwaysReady
 			manager.jobStoreSynced = alwaysReady
 			var actual *batch.Job
-			manager.updateStatusHandler = func(job *batch.Job) error {
+			manager.updateStatusHandler = func(job *batch.Job) (*batch.Job, error) {
 				actual = job
-				return nil
+				return job, nil
 			}
 			// job & pods setup
@@ -1731,9 +1732,9 @@ func TestSyncPastDeadlineJobFinished(t *testing.T) {
 	manager.podStoreSynced = alwaysReady
 	manager.jobStoreSynced = alwaysReady
 	var actual *batch.Job
-	manager.updateStatusHandler = func(job *batch.Job) error {
+	manager.updateStatusHandler = func(job *batch.Job) (*batch.Job, error) {
 		actual = job
-		return nil
+		return job, nil
 	}
 	job := newJob(1, 1, 6, batch.NonIndexedCompletion)
@@ -1796,7 +1797,9 @@ func TestSyncJobDeleted(t *testing.T) {
 	manager.podControl = &fakePodControl
 	manager.podStoreSynced = alwaysReady
 	manager.jobStoreSynced = alwaysReady
-	manager.updateStatusHandler = func(job *batch.Job) error { return nil }
+	manager.updateStatusHandler = func(job *batch.Job) (*batch.Job, error) {
+		return job, nil
+	}
 	job := newJob(2, 2, 6, batch.NonIndexedCompletion)
 	forget, err := manager.syncJob(testutil.GetKey(job, t))
 	if err != nil {
@@ -1816,30 +1819,52 @@
 func TestSyncJobUpdateRequeue(t *testing.T) {
 	clientset := clientset.NewForConfigOrDie(&restclient.Config{Host: "", ContentConfig: restclient.ContentConfig{GroupVersion: &schema.GroupVersion{Group: "", Version: "v1"}}})
 	DefaultJobBackOff = time.Duration(0) // overwrite the default value for testing
-	manager, sharedInformerFactory := newControllerFromClient(clientset, controller.NoResyncPeriodFunc)
-	fakePodControl := controller.FakePodControl{}
-	manager.podControl = &fakePodControl
-	manager.podStoreSynced = alwaysReady
-	manager.jobStoreSynced = alwaysReady
-	updateError := fmt.Errorf("update error")
-	manager.updateStatusHandler = func(job *batch.Job) error {
-		manager.queue.AddRateLimited(testutil.GetKey(job, t))
-		return updateError
+	cases := map[string]struct {
+		updateErr      error
+		wantRequeue    bool
+		withFinalizers bool
+	}{
+		"no error": {},
+		"generic error": {
+			updateErr:   fmt.Errorf("update error"),
+			wantRequeue: true,
+		},
+		"conflict error": {
+			updateErr: apierrors.NewConflict(schema.GroupResource{}, "", nil),
+		},
+		"conflict error, with finalizers": {
+			withFinalizers: true,
+			updateErr:      apierrors.NewConflict(schema.GroupResource{}, "", nil),
+		},
 	}
-	job := newJob(2, 2, 6, batch.NonIndexedCompletion)
-	sharedInformerFactory.Batch().V1().Jobs().Informer().GetIndexer().Add(job)
-	forget, err := manager.syncJob(testutil.GetKey(job, t))
-	if err == nil || err != updateError {
-		t.Errorf("Expected error %v when syncing jobs, got %v", updateError, err)
-	}
-	if forget != false {
-		t.Errorf("Unexpected forget value. Expected %v, saw %v\n", false, forget)
-	}
-	t.Log("Waiting for a job in the queue")
-	key, _ := manager.queue.Get()
-	expectedKey := testutil.GetKey(job, t)
-	if key != expectedKey {
-		t.Errorf("Expected requeue of job with key %s got %s", expectedKey, key)
+	for name, tc := range cases {
+		t.Run(name, func(t *testing.T) {
+			defer featuregatetesting.SetFeatureGateDuringTest(t, feature.DefaultFeatureGate, features.JobTrackingWithFinalizers, tc.withFinalizers)()
+			manager, sharedInformerFactory := newControllerFromClient(clientset, controller.NoResyncPeriodFunc)
+			fakePodControl := controller.FakePodControl{}
+			manager.podControl = &fakePodControl
+			manager.podStoreSynced = alwaysReady
+			manager.jobStoreSynced = alwaysReady
+			manager.updateStatusHandler = func(job *batch.Job) (*batch.Job, error) {
+				return job, tc.updateErr
+			}
+			job := newJob(2, 2, 6, batch.NonIndexedCompletion)
+			sharedInformerFactory.Batch().V1().Jobs().Informer().GetIndexer().Add(job)
+			manager.queue.Add(testutil.GetKey(job, t))
+			manager.processNextWorkItem()
+			// With DefaultJobBackOff=0, the queueing is synchronous.
+			requeued := manager.queue.Len() > 0
+			if requeued != tc.wantRequeue {
+				t.Errorf("Unexpected requeue, got %t, want %t", requeued, tc.wantRequeue)
+			}
+			if requeued {
+				key, _ := manager.queue.Get()
+				expectedKey := testutil.GetKey(job, t)
+				if key != expectedKey {
+					t.Errorf("Expected requeue of job with key %s got %s", expectedKey, key)
+				}
+			}
+		})
 	}
 }
@@ -2323,7 +2348,9 @@ func TestSyncJobExpectations(t *testing.T) {
 	manager.podControl = &fakePodControl
 	manager.podStoreSynced = alwaysReady
 	manager.jobStoreSynced = alwaysReady
-	manager.updateStatusHandler = func(job *batch.Job) error { return nil }
+	manager.updateStatusHandler = func(job *batch.Job) (*batch.Job, error) {
+		return job, nil
+	}
 	job := newJob(2, 2, 6, batch.NonIndexedCompletion)
 	sharedInformerFactory.Batch().V1().Jobs().Informer().GetIndexer().Add(job)
@@ -2527,9 +2554,9 @@ func TestJobBackoffReset(t *testing.T) {
 		manager.podStoreSynced = alwaysReady
 		manager.jobStoreSynced = alwaysReady
 		var actual *batch.Job
-		manager.updateStatusHandler = func(job *batch.Job) error {
+		manager.updateStatusHandler = func(job *batch.Job) (*batch.Job, error) {
 			actual = job
-			return nil
+			return job, nil
 		}
 		// job & pods setup
@@ -2707,9 +2734,9 @@ func TestJobBackoffForOnFailure(t *testing.T) {
 			manager.podStoreSynced = alwaysReady
 			manager.jobStoreSynced = alwaysReady
 			var actual *batch.Job
-			manager.updateStatusHandler = func(job *batch.Job) error {
+			manager.updateStatusHandler = func(job *batch.Job) (*batch.Job, error) {
 				actual = job
-				return nil
+				return job, nil
 			}
 			// job & pods setup
@@ -2809,9 +2836,9 @@ func TestJobBackoffOnRestartPolicyNever(t *testing.T) {
 			manager.podStoreSynced = alwaysReady
 			manager.jobStoreSynced = alwaysReady
 			var actual *batch.Job
-			manager.updateStatusHandler = func(job *batch.Job) error {
+			manager.updateStatusHandler = func(job *batch.Job) (*batch.Job, error) {
 				actual = job
-				return nil
+				return job, nil
 			}
 			// job & pods setup