RC/RS: Use ControllerRef to route watch events.

This is part of the completion of ControllerRef, as described here: https://github.com/kubernetes/community/blob/master/contributors/design-proposals/controller-ref.md#watches

This also removes the need for the Pod->Controller mapping cache in RC and RS. This mapping is now persisted in the Pod's ControllerRef instead.
2017-02-23 08:58:28 -08:00
parent 298db3a0c3
commit ca13b9e532
13 changed files with 486 additions and 416 deletions
--- a/pkg/controller/replication/replication_controller.go
+++ b/pkg/controller/replication/replication_controller.go
@@ -29,7 +29,6 @@ import (
 	"k8s.io/apimachinery/pkg/api/errors"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/labels"
-	"k8s.io/apimachinery/pkg/runtime/schema"
 	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
 	"k8s.io/apimachinery/pkg/util/wait"
 	utiltrace "k8s.io/apiserver/pkg/util/trace"
@@ -56,9 +55,8 @@ const (
 	statusUpdateRetries = 1
 )

-func getRCKind() schema.GroupVersionKind {
-	return v1.SchemeGroupVersion.WithKind("ReplicationController")
-}
+// controllerKind contains the schema.GroupVersionKind for this controller type.
+var controllerKind = v1.SchemeGroupVersion.WithKind("ReplicationController")

 // ReplicationManager is responsible for synchronizing ReplicationController objects stored
 // in the system with actual running pods.
@@ -85,14 +83,12 @@ type ReplicationManager struct {
 	// Added as a member to the struct to allow injection for testing.
 	podListerSynced cache.InformerSynced

-	lookupCache *controller.MatchingCache
-
 	// Controllers that need to be synced
 	queue workqueue.RateLimitingInterface
 }

 // NewReplicationManager configures a replication manager with the specified event recorder
-func NewReplicationManager(podInformer coreinformers.PodInformer, rcInformer coreinformers.ReplicationControllerInformer, kubeClient clientset.Interface, burstReplicas int, lookupCacheSize int) *ReplicationManager {
+func NewReplicationManager(podInformer coreinformers.PodInformer, rcInformer coreinformers.ReplicationControllerInformer, kubeClient clientset.Interface, burstReplicas int) *ReplicationManager {
 	if kubeClient != nil && kubeClient.Core().RESTClient().GetRateLimiter() != nil {
 		metrics.RegisterMetricAndTrackRateLimiterUsage("replication_controller", kubeClient.Core().RESTClient().GetRateLimiter())
 	}
@@ -135,7 +131,6 @@ func NewReplicationManager(podInformer coreinformers.PodInformer, rcInformer cor
 	rm.podListerSynced = podInformer.Informer().HasSynced

 	rm.syncHandler = rm.syncReplicationController
-	rm.lookupCache = controller.NewMatchingCache(lookupCacheSize)
 	return rm
 }

@@ -167,71 +162,19 @@ func (rm *ReplicationManager) Run(workers int, stopCh <-chan struct{}) {
 	glog.Infof("Shutting down RC Manager")
 }

-// getPodController returns the controller managing the given pod.
-// TODO: Surface that we are ignoring multiple controllers for a single pod.
-// TODO: use ownerReference.Controller to determine if the rc controls the pod.
-func (rm *ReplicationManager) getPodController(pod *v1.Pod) *v1.ReplicationController {
-	// look up in the cache, if cached and the cache is valid, just return cached value
-	if obj, cached := rm.lookupCache.GetMatchingObject(pod); cached {
-		controller, ok := obj.(*v1.ReplicationController)
-		if !ok {
-			// This should not happen
-			utilruntime.HandleError(fmt.Errorf("lookup cache does not return a ReplicationController object"))
-			return nil
-		}
-		if cached && rm.isCacheValid(pod, controller) {
-			return controller
-		}
-	}
-
-	// if not cached or cached value is invalid, search all the rc to find the matching one, and update cache
-	controllers, err := rm.rcLister.GetPodControllers(pod)
+// getPodControllers returns a list of ReplicationControllers matching the given pod.
+func (rm *ReplicationManager) getPodControllers(pod *v1.Pod) []*v1.ReplicationController {
+	rcs, err := rm.rcLister.GetPodControllers(pod)
 	if err != nil {
-		glog.V(4).Infof("No controllers found for pod %v, replication manager will avoid syncing", pod.Name)
+		glog.V(4).Infof("No ReplicationControllers found for pod %v, controller will avoid syncing", pod.Name)
 		return nil
 	}
-	// In theory, overlapping controllers is user error. This sorting will not prevent
-	// oscillation of replicas in all cases, eg:
-	// rc1 (older rc): [(k1=v1)], replicas=1 rc2: [(k2=v2)], replicas=2
-	// pod: [(k1:v1), (k2:v2)] will wake both rc1 and rc2, and we will sync rc1.
-	// pod: [(k2:v2)] will wake rc2 which creates a new replica.
-	if len(controllers) > 1 {
-		// More than two items in this list indicates user error. If two replication-controller
-		// overlap, sort by creation timestamp, subsort by name, then pick
-		// the first.
-		utilruntime.HandleError(fmt.Errorf("user error! more than one replication controller is selecting pods with labels: %+v", pod.Labels))
-		sort.Sort(OverlappingControllers(controllers))
+	if len(rcs) > 1 {
+		// ControllerRef will ensure we don't do anything crazy, but more than one
+		// item in this list nevertheless constitutes user error.
+		utilruntime.HandleError(fmt.Errorf("user error! more than one ReplicationController is selecting pods with labels: %+v", pod.Labels))
 	}
-
-	// update lookup cache
-	rm.lookupCache.Update(pod, controllers[0])
-
-	return controllers[0]
-}
-
-// isCacheValid check if the cache is valid
-func (rm *ReplicationManager) isCacheValid(pod *v1.Pod, cachedRC *v1.ReplicationController) bool {
-	_, err := rm.rcLister.ReplicationControllers(cachedRC.Namespace).Get(cachedRC.Name)
-	// rc has been deleted or updated, cache is invalid
-	if err != nil || !isControllerMatch(pod, cachedRC) {
-		return false
-	}
-	return true
-}
-
-// isControllerMatch take a Pod and ReplicationController, return whether the Pod and ReplicationController are matching
-// TODO(mqliang): This logic is a copy from GetPodControllers(), remove the duplication
-func isControllerMatch(pod *v1.Pod, rc *v1.ReplicationController) bool {
-	if rc.Namespace != pod.Namespace {
-		return false
-	}
-	selector := labels.Set(rc.Spec.Selector).AsSelectorPreValidated()
-
-	// If an rc with a nil or empty selector creeps in, it should match nothing, not everything.
-	if selector.Empty() || !selector.Matches(labels.Set(pod.Labels)) {
-		return false
-	}
-	return true
+	return rcs
 }

 // callback when RC is updated
@@ -239,20 +182,6 @@ func (rm *ReplicationManager) updateRC(old, cur interface{}) {
 	oldRC := old.(*v1.ReplicationController)
 	curRC := cur.(*v1.ReplicationController)

-	// We should invalidate the whole lookup cache if a RC's selector has been updated.
-	//
-	// Imagine that you have two RCs:
-	// * old RC1
-	// * new RC2
-	// You also have a pod that is attached to RC2 (because it doesn't match RC1 selector).
-	// Now imagine that you are changing RC1 selector so that it is now matching that pod,
-	// in such case, we must invalidate the whole cache so that pod could be adopted by RC1
-	//
-	// This makes the lookup cache less helpful, but selector update does not happen often,
-	// so it's not a big problem
-	if !reflect.DeepEqual(oldRC.Spec.Selector, curRC.Spec.Selector) {
-		rm.lookupCache.InvalidateAll()
-	}
 	// TODO: Remove when #31981 is resolved!
 	glog.Infof("Observed updated replication controller %v. Desired pod count change: %d->%d", curRC.Name, *(oldRC.Spec.Replicas), *(curRC.Spec.Replicas))

@@ -275,19 +204,10 @@ func (rm *ReplicationManager) updateRC(old, cur interface{}) {
 	rm.enqueueController(cur)
 }

-// When a pod is created, enqueue the controller that manages it and update it's expectations.
+// When a pod is created, enqueue the ReplicationController that manages it and update its expectations.
 func (rm *ReplicationManager) addPod(obj interface{}) {
 	pod := obj.(*v1.Pod)
-
-	rc := rm.getPodController(pod)
-	if rc == nil {
-		return
-	}
-	rcKey, err := controller.KeyFunc(rc)
-	if err != nil {
-		utilruntime.HandleError(fmt.Errorf("Couldn't get key for replication controller %#v: %v", rc, err))
-		return
-	}
+	glog.V(4).Infof("Pod %s created: %#v.", pod.Name, pod)

 	if pod.DeletionTimestamp != nil {
 		// on a restart of the controller manager, it's possible a new pod shows up in a state that
@@ -295,13 +215,38 @@ func (rm *ReplicationManager) addPod(obj interface{}) {
 		rm.deletePod(pod)
 		return
 	}
-	rm.expectations.CreationObserved(rcKey)
-	rm.enqueueController(rc)
+
+	// If it has a ControllerRef, that's all that matters.
+	if controllerRef := controller.GetControllerOf(pod); controllerRef != nil {
+		if controllerRef.Kind != controllerKind.Kind {
+			// It's controlled by a different type of controller.
+			return
+		}
+		rc, err := rm.rcLister.ReplicationControllers(pod.Namespace).Get(controllerRef.Name)
+		if err != nil {
+			return
+		}
+		rsKey, err := controller.KeyFunc(rc)
+		if err != nil {
+			return
+		}
+		rm.expectations.CreationObserved(rsKey)
+		rm.enqueueController(rc)
+		return
+	}
+
+	// Otherwise, it's an orphan. Get a list of all matching ReplicationControllers and sync
+	// them to see if anyone wants to adopt it.
+	// DO NOT observe creation because no controller should be waiting for an
+	// orphan.
+	for _, rc := range rm.getPodControllers(pod) {
+		rm.enqueueController(rc)
+	}
 }

-// When a pod is updated, figure out what controller/s manage it and wake them
+// When a pod is updated, figure out what ReplicationController/s manage it and wake them
 // up. If the labels of the pod have changed we need to awaken both the old
-// and new controller. old and cur must be *v1.Pod types.
+// and new ReplicationController. old and cur must be *v1.Pod types.
 func (rm *ReplicationManager) updatePod(old, cur interface{}) {
 	curPod := cur.(*v1.Pod)
 	oldPod := old.(*v1.Pod)
@@ -311,6 +256,7 @@ func (rm *ReplicationManager) updatePod(old, cur interface{}) {
 		return
 	}
 	glog.V(4).Infof("Pod %s updated, objectMeta %+v -> %+v.", curPod.Name, oldPod.ObjectMeta, curPod.ObjectMeta)
+
 	labelChanged := !reflect.DeepEqual(curPod.Labels, oldPod.Labels)
 	if curPod.DeletionTimestamp != nil {
 		// when a pod is deleted gracefully it's deletion timestamp is first modified to reflect a grace period,
@@ -326,34 +272,53 @@ func (rm *ReplicationManager) updatePod(old, cur interface{}) {
 		return
 	}

-	// Only need to get the old controller if the labels changed.
-	// Enqueue the oldRC before the curRC to give curRC a chance to adopt the oldPod.
-	if labelChanged {
-		// If the old and new rc are the same, the first one that syncs
-		// will set expectations preventing any damage from the second.
-		if oldRC := rm.getPodController(oldPod); oldRC != nil {
-			rm.enqueueController(oldRC)
+	curControllerRef := controller.GetControllerOf(curPod)
+	oldControllerRef := controller.GetControllerOf(oldPod)
+	controllerRefChanged := !reflect.DeepEqual(curControllerRef, oldControllerRef)
+	if controllerRefChanged &&
+		oldControllerRef != nil && oldControllerRef.Kind == controllerKind.Kind {
+		// The ControllerRef was changed. Sync the old controller, if any.
+		rc, err := rm.rcLister.ReplicationControllers(oldPod.Namespace).Get(oldControllerRef.Name)
+		if err == nil {
+			rm.enqueueController(rc)
 		}
 	}

-	changedToReady := !v1.IsPodReady(oldPod) && v1.IsPodReady(curPod)
-	if curRC := rm.getPodController(curPod); curRC != nil {
-		rm.enqueueController(curRC)
+	// If it has a ControllerRef, that's all that matters.
+	if curControllerRef != nil {
+		if curControllerRef.Kind != controllerKind.Kind {
+			// It's controlled by a different type of controller.
+			return
+		}
+		rc, err := rm.rcLister.ReplicationControllers(curPod.Namespace).Get(curControllerRef.Name)
+		if err != nil {
+			return
+		}
+		rm.enqueueController(rc)
 		// TODO: MinReadySeconds in the Pod will generate an Available condition to be added in
-		// the Pod status which in turn will trigger a requeue of the owning replication controller
-		// thus having its status updated with the newly available replica. For now, we can fake the
-		// update by resyncing the controller MinReadySeconds after the it is requeued because a Pod
-		// transitioned to Ready.
+		// the Pod status which in turn will trigger a requeue of the owning ReplicationController thus
+		// having its status updated with the newly available replica. For now, we can fake the
+		// update by resyncing the controller MinReadySeconds after it is requeued because
+		// a Pod transitioned to Ready.
 		// Note that this still suffers from #29229, we are just moving the problem one level
-		// "closer" to kubelet (from the deployment to the replication controller manager).
-		if changedToReady && curRC.Spec.MinReadySeconds > 0 {
-			glog.V(2).Infof("ReplicationController %q will be enqueued after %ds for availability check", curRC.Name, curRC.Spec.MinReadySeconds)
-			rm.enqueueControllerAfter(curRC, time.Duration(curRC.Spec.MinReadySeconds)*time.Second)
+		// "closer" to kubelet (from the deployment to the ReplicationController controller).
+		if !v1.IsPodReady(oldPod) && v1.IsPodReady(curPod) && rc.Spec.MinReadySeconds > 0 {
+			glog.V(2).Infof("ReplicationController %q will be enqueued after %ds for availability check", rc.Name, rc.Spec.MinReadySeconds)
+			rm.enqueueControllerAfter(rc, time.Duration(rc.Spec.MinReadySeconds)*time.Second)
+		}
+		return
+	}
+
+	// Otherwise, it's an orphan. If anything changed, sync matching controllers
+	// to see if anyone wants to adopt it now.
+	if labelChanged || controllerRefChanged {
+		for _, rc := range rm.getPodControllers(curPod) {
+			rm.enqueueController(rc)
 		}
 	}
 }

-// When a pod is deleted, enqueue the controller that manages the pod and update its expectations.
+// When a pod is deleted, enqueue the ReplicationController that manages the pod and update its expectations.
 // obj could be an *v1.Pod, or a DeletionFinalStateUnknown marker item.
 func (rm *ReplicationManager) deletePod(obj interface{}) {
 	pod, ok := obj.(*v1.Pod)
@@ -361,45 +326,50 @@ func (rm *ReplicationManager) deletePod(obj interface{}) {
 	// When a delete is dropped, the relist will notice a pod in the store not
 	// in the list, leading to the insertion of a tombstone object which contains
 	// the deleted key/value. Note that this value might be stale. If the pod
-	// changed labels the new rc will not be woken up till the periodic resync.
+	// changed labels the new ReplicationController will not be woken up till the periodic resync.
 	if !ok {
 		tombstone, ok := obj.(cache.DeletedFinalStateUnknown)
 		if !ok {
-			utilruntime.HandleError(fmt.Errorf("Couldn't get object from tombstone %#v", obj))
+			utilruntime.HandleError(fmt.Errorf("couldn't get object from tombstone %+v", obj))
 			return
 		}
 		pod, ok = tombstone.Obj.(*v1.Pod)
 		if !ok {
-			utilruntime.HandleError(fmt.Errorf("Tombstone contained object that is not a pod %#v", obj))
+			utilruntime.HandleError(fmt.Errorf("tombstone contained object that is not a pod %#v", obj))
 			return
 		}
 	}
-	glog.V(4).Infof("Pod %s/%s deleted through %v, timestamp %+v, labels %+v.", pod.Namespace, pod.Name, utilruntime.GetCaller(), pod.DeletionTimestamp, pod.Labels)
-	if rc := rm.getPodController(pod); rc != nil {
-		rcKey, err := controller.KeyFunc(rc)
-		if err != nil {
-			utilruntime.HandleError(fmt.Errorf("Couldn't get key for replication controller %#v: %v", rc, err))
-			return
-		}
-		rm.expectations.DeletionObserved(rcKey, controller.PodKey(pod))
-		rm.enqueueController(rc)
+	glog.V(4).Infof("Pod %s/%s deleted through %v, timestamp %+v: %#v.", pod.Namespace, pod.Name, utilruntime.GetCaller(), pod.DeletionTimestamp, pod)
+
+	controllerRef := controller.GetControllerOf(pod)
+	if controllerRef == nil {
+		// No controller should care about orphans being deleted.
+		return
 	}
+	if controllerRef.Kind != controllerKind.Kind {
+		// It's controlled by a different type of controller.
+		return
+	}
+
+	rc, err := rm.rcLister.ReplicationControllers(pod.Namespace).Get(controllerRef.Name)
+	if err != nil {
+		return
+	}
+	rsKey, err := controller.KeyFunc(rc)
+	if err != nil {
+		return
+	}
+	rm.expectations.DeletionObserved(rsKey, controller.PodKey(pod))
+	rm.enqueueController(rc)
 }

 // obj could be an *v1.ReplicationController, or a DeletionFinalStateUnknown marker item.
 func (rm *ReplicationManager) enqueueController(obj interface{}) {
 	key, err := controller.KeyFunc(obj)
 	if err != nil {
-		utilruntime.HandleError(fmt.Errorf("Couldn't get key for object %+v: %v", obj, err))
+		utilruntime.HandleError(fmt.Errorf("couldn't get key for object %+v: %v", obj, err))
 		return
 	}
-
-	// TODO: Handle overlapping controllers better. Either disallow them at admission time or
-	// deterministically avoid syncing controllers that fight over pods. Currently, we only
-	// ensure that the same controller is synced for a given pod. When we periodically relist
-	// all controllers there will still be some replica instability. One way to handle this is
-	// by querying the store for all controllers that this rc overlaps, as well as all
-	// controllers that overlap this rc, and sorting them.
 	rm.queue.Add(key)
 }

@@ -407,16 +377,9 @@ func (rm *ReplicationManager) enqueueController(obj interface{}) {
 func (rm *ReplicationManager) enqueueControllerAfter(obj interface{}, after time.Duration) {
 	key, err := controller.KeyFunc(obj)
 	if err != nil {
-		utilruntime.HandleError(fmt.Errorf("Couldn't get key for object %+v: %v", obj, err))
+		utilruntime.HandleError(fmt.Errorf("couldn't get key for object %+v: %v", obj, err))
 		return
 	}
-
-	// TODO: Handle overlapping controllers better. Either disallow them at admission time or
-	// deterministically avoid syncing controllers that fight over pods. Currently, we only
-	// ensure that the same controller is synced for a given pod. When we periodically relist
-	// all controllers there will still be some replica instability. One way to handle this is
-	// by querying the store for all controllers that this rc overlaps, as well as all
-	// controllers that overlap this rc, and sorting them.
 	rm.queue.AddAfter(key, after)
 }

@@ -481,8 +444,8 @@ func (rm *ReplicationManager) manageReplicas(filteredPods []*v1.Pod, rc *v1.Repl
 				var err error
 				boolPtr := func(b bool) *bool { return &b }
 				controllerRef := &metav1.OwnerReference{
-					APIVersion:         getRCKind().GroupVersion().String(),
-					Kind:               getRCKind().Kind,
+					APIVersion:         controllerKind.GroupVersion().String(),
+					Kind:               controllerKind.Kind,
 					Name:               rc.Name,
 					UID:                rc.UID,
 					BlockOwnerDeletion: boolPtr(true),
@@ -610,7 +573,7 @@ func (rm *ReplicationManager) syncReplicationController(key string) error {
 		rm.queue.Add(key)
 		return err
 	}
-	cm := controller.NewPodControllerRefManager(rm.podControl, rc, labels.Set(rc.Spec.Selector).AsSelectorPreValidated(), getRCKind())
+	cm := controller.NewPodControllerRefManager(rm.podControl, rc, labels.Set(rc.Spec.Selector).AsSelectorPreValidated(), controllerKind)
 	filteredPods, err = cm.ClaimPods(pods)
 	if err != nil {
 		// Something went wrong with adoption or release.