From 8835f6bb00d3519dd14c8570f47c5f3f6fadbf4e Mon Sep 17 00:00:00 2001 From: Jonathan Basseri Date: Thu, 29 Aug 2019 16:19:45 -0700 Subject: [PATCH] Fix retry logic in DisruptionController This changes the retry logic in DisruptionController so that it reconciles update conflicts. In the old behavior, any pdb status update failure was retried with the same status, regardless of error. Now there is no retry logic with the status update. The error is passed up the stack where the PDB can be requeued for processing. If the PDB status update error is a conflict error, there are some new special cases: - failSafe is not triggered, since this is considered a retryable error - the PDB is requeued immediately (ignoring the rate limiter) because we assume that conflict can be resolved by getting the latest version --- pkg/controller/disruption/BUILD | 1 - pkg/controller/disruption/disruption.go | 39 +++++++------------------ 2 files changed, 11 insertions(+), 29 deletions(-) diff --git a/pkg/controller/disruption/BUILD b/pkg/controller/disruption/BUILD index 01658479321..b7f0600881c 100644 --- a/pkg/controller/disruption/BUILD +++ b/pkg/controller/disruption/BUILD @@ -32,7 +32,6 @@ go_library( "//staging/src/k8s.io/client-go/kubernetes:go_default_library", "//staging/src/k8s.io/client-go/kubernetes/scheme:go_default_library", "//staging/src/k8s.io/client-go/kubernetes/typed/core/v1:go_default_library", - "//staging/src/k8s.io/client-go/kubernetes/typed/policy/v1beta1:go_default_library", "//staging/src/k8s.io/client-go/listers/apps/v1:go_default_library", "//staging/src/k8s.io/client-go/listers/core/v1:go_default_library", "//staging/src/k8s.io/client-go/listers/policy/v1beta1:go_default_library", diff --git a/pkg/controller/disruption/disruption.go b/pkg/controller/disruption/disruption.go index 6fe7d16b222..e1c5ec09f81 100644 --- a/pkg/controller/disruption/disruption.go +++ b/pkg/controller/disruption/disruption.go @@ -21,7 +21,7 @@ import ( "time" apps "k8s.io/api/apps/v1beta1" - "k8s.io/api/core/v1" + v1 "k8s.io/api/core/v1" "k8s.io/api/extensions/v1beta1" policy "k8s.io/api/policy/v1beta1" apiequality "k8s.io/apimachinery/pkg/api/equality" @@ -39,7 +39,6 @@ import ( clientset "k8s.io/client-go/kubernetes" "k8s.io/client-go/kubernetes/scheme" v1core "k8s.io/client-go/kubernetes/typed/core/v1" - policyclientset "k8s.io/client-go/kubernetes/typed/policy/v1beta1" appsv1listers "k8s.io/client-go/listers/apps/v1" corelisters "k8s.io/client-go/listers/core/v1" policylisters "k8s.io/client-go/listers/policy/v1beta1" @@ -53,8 +52,6 @@ import ( "k8s.io/klog" ) -const statusUpdateRetries = 2 - // DeletionTimeout sets maximum time from the moment a pod is added to DisruptedPods in PDB.Status // to the time when the pod is expected to be seen by PDB controller as having been marked for deletion. // If the pod was not marked for deletion during that time it is assumed that it won't be deleted at @@ -532,7 +529,13 @@ func (dc *DisruptionController) sync(key string) error { return err } - if err := dc.trySync(pdb); err != nil { + err = dc.trySync(pdb) + // If the reason for failure was a conflict, then allow this PDB update to be + // requeued without triggering the failSafe logic. + if errors.IsConflict(err) { + return err + } + if err != nil { klog.Errorf("Failed to sync pdb %s/%s: %v", pdb.Namespace, pdb.Name, err) return dc.failSafe(pdb) } @@ -773,29 +776,9 @@ func (dc *DisruptionController) updatePdbStatus(pdb *policy.PodDisruptionBudget, return dc.getUpdater()(newPdb) } -// refresh tries to re-GET the given PDB. If there are any errors, it just -// returns the old PDB. Intended to be used in a retry loop where it runs a -// bounded number of times. -func refresh(pdbClient policyclientset.PodDisruptionBudgetInterface, pdb *policy.PodDisruptionBudget) *policy.PodDisruptionBudget { - newPdb, err := pdbClient.Get(pdb.Name, metav1.GetOptions{}) - if err == nil { - return newPdb - } - return pdb - -} - func (dc *DisruptionController) writePdbStatus(pdb *policy.PodDisruptionBudget) error { - pdbClient := dc.kubeClient.PolicyV1beta1().PodDisruptionBudgets(pdb.Namespace) - st := pdb.Status - - var err error - for i, pdb := 0, pdb; i < statusUpdateRetries; i, pdb = i+1, refresh(pdbClient, pdb) { - pdb.Status = st - if _, err = pdbClient.UpdateStatus(pdb); err == nil { - break - } - } - + // If this update fails, don't retry it. Allow the failure to get handled & + // retried in `processNextWorkItem()`. + _, err := dc.kubeClient.PolicyV1beta1().PodDisruptionBudgets(pdb.Namespace).UpdateStatus(pdb) return err }