Merge pull request #42938 from enisoc/orphan-race

Automatic merge from submit-queue

GC: Fix re-adoption race when orphaning dependents.

**What this PR does / why we need it**:

The GC expects that once it sees a controller with a non-nil
DeletionTimestamp, that controller will not attempt any adoption.
There was a known race condition that could cause a controller to
re-adopt something orphaned by the GC, because the controller was acting
on a cached copy of its own spec from before DeletionTimestamp was set.

This fixes that race by doing an uncached quorum read of the controller
spec just before the first adoption attempt. It's important that this
read occurs after listing potential orphans: if the fresh read still
shows a nil DeletionTimestamp at that point, the GC cannot yet have
begun orphaning anything in the list, so adoption is safe. Note that
this uncached read is skipped if no adoptions are attempted (i.e. at
steady state).
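
A minimal sketch of the recheck pattern follows (the helper name and exact
signature here are illustrative assumptions, not necessarily what this PR
adds): a canAdopt() hook that does a live GET against the API server,
bypassing the informer cache, and refuses adoption once DeletionTimestamp
is set.

```go
package sketch

import (
	"fmt"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// recheckDeletionTimestamp (illustrative name) returns a canAdopt() func
// that does an uncached quorum read via getObject and refuses adoption if
// the controller is already being deleted. getObject must hit the API
// server directly (e.g. a clientset Get), not the lister cache.
func recheckDeletionTimestamp(getObject func() (metav1.Object, error)) func() error {
	return func() error {
		obj, err := getObject()
		if err != nil {
			return fmt.Errorf("can't recheck DeletionTimestamp: %v", err)
		}
		if obj.GetDeletionTimestamp() != nil {
			return fmt.Errorf("%v/%v has just been deleted at %v",
				obj.GetNamespace(), obj.GetName(), obj.GetDeletionTimestamp())
		}
		return nil
	}
}
```

Because the returned func would only be invoked lazily, just before the first
adoption attempt and after the candidate list is taken, the steady state pays
no extra API reads; returning an error aborts the sync so the workqueue
retries once the cache catches up.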

**Which issue this PR fixes**:

Fixes #42639

**Special notes for your reviewer**:

**Release note**:
```release-note
```

cc @kubernetes/sig-apps-pr-reviews
Kubernetes Submit Queue committed on 2017-03-20 01:30:11 -07:00 (via GitHub)
15 changed files with 485 additions and 269 deletions

@@ -148,6 +148,11 @@ type fixture struct {
 	objects []runtime.Object
 }
 
+func (f *fixture) expectGetDeploymentAction(d *extensions.Deployment) {
+	action := core.NewGetAction(schema.GroupVersionResource{Resource: "deployments"}, d.Namespace, d.Name)
+	f.actions = append(f.actions, action)
+}
+
 func (f *fixture) expectUpdateDeploymentStatusAction(d *extensions.Deployment) {
 	action := core.NewUpdateAction(schema.GroupVersionResource{Resource: "deployments"}, d.Namespace, d)
 	action.Subresource = "status"
@@ -190,15 +195,25 @@ func (f *fixture) newController() (*DeploymentController, informers.SharedInformerFactory)
 	return c, informers
 }
 
+func (f *fixture) runExpectError(deploymentName string) {
+	f.run_(deploymentName, true)
+}
+
 func (f *fixture) run(deploymentName string) {
+	f.run_(deploymentName, false)
+}
+
+func (f *fixture) run_(deploymentName string, expectError bool) {
 	c, informers := f.newController()
 	stopCh := make(chan struct{})
 	defer close(stopCh)
 	informers.Start(stopCh)
 	err := c.syncDeployment(deploymentName)
-	if err != nil {
+	if !expectError && err != nil {
 		f.t.Errorf("error syncing deployment: %v", err)
+	} else if expectError && err == nil {
+		f.t.Error("expected error syncing deployment, got nil")
 	}
 
 	actions := filterInformerActions(f.client.Actions())
@@ -293,6 +308,30 @@ func TestSyncDeploymentDontDoAnythingDuringDeletion(t *testing.T) {
 	f.run(getKey(d, t))
 }
 
+func TestSyncDeploymentDeletionRace(t *testing.T) {
+	f := newFixture(t)
+
+	d := newDeployment("foo", 1, nil, nil, nil, map[string]string{"foo": "bar"})
+	d2 := *d
+	// Lister (cache) says NOT deleted.
+	f.dLister = append(f.dLister, d)
+	// Bare client says it IS deleted. This should be presumed more up-to-date.
+	now := metav1.Now()
+	d2.DeletionTimestamp = &now
+	f.objects = append(f.objects, &d2)
+
+	// The recheck is only triggered if a matching orphan exists.
+	rs := newReplicaSet(d, "rs1", 1)
+	rs.OwnerReferences = nil
+	f.objects = append(f.objects, rs)
+	f.rsLister = append(f.rsLister, rs)
+
+	// Expect to only recheck DeletionTimestamp.
+	f.expectGetDeploymentAction(d)
+	// Sync should fail and requeue to let cache catch up.
+	f.runExpectError(getKey(d, t))
+}
+
 // issue: https://github.com/kubernetes/kubernetes/issues/23218
 func TestDontSyncDeploymentsWithEmptyPodSelector(t *testing.T) {
 	f := newFixture(t)