RC/RS: Use ControllerRef to route watch events.

This is part of the completion of ControllerRef, as described here:

https://github.com/kubernetes/community/blob/master/contributors/design-proposals/controller-ref.md#watches

This also removes the need for the Pod->Controller mapping cache in RC
and RS. This mapping is now persisted in the Pod's ControllerRef
instead.
This commit is contained in:
Anthony Yeh
2017-02-23 08:58:28 -08:00
parent 298db3a0c3
commit ca13b9e532
13 changed files with 486 additions and 416 deletions

View File

@@ -29,7 +29,6 @@ import (
"k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/runtime/schema"
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
"k8s.io/apimachinery/pkg/util/wait"
utiltrace "k8s.io/apiserver/pkg/util/trace"
@@ -56,9 +55,8 @@ const (
statusUpdateRetries = 1
)
func getRCKind() schema.GroupVersionKind {
return v1.SchemeGroupVersion.WithKind("ReplicationController")
}
// controllerKind contains the schema.GroupVersionKind for this controller type.
var controllerKind = v1.SchemeGroupVersion.WithKind("ReplicationController")
// ReplicationManager is responsible for synchronizing ReplicationController objects stored
// in the system with actual running pods.
@@ -85,14 +83,12 @@ type ReplicationManager struct {
// Added as a member to the struct to allow injection for testing.
podListerSynced cache.InformerSynced
lookupCache *controller.MatchingCache
// Controllers that need to be synced
queue workqueue.RateLimitingInterface
}
// NewReplicationManager configures a replication manager with the specified event recorder
func NewReplicationManager(podInformer coreinformers.PodInformer, rcInformer coreinformers.ReplicationControllerInformer, kubeClient clientset.Interface, burstReplicas int, lookupCacheSize int) *ReplicationManager {
func NewReplicationManager(podInformer coreinformers.PodInformer, rcInformer coreinformers.ReplicationControllerInformer, kubeClient clientset.Interface, burstReplicas int) *ReplicationManager {
if kubeClient != nil && kubeClient.Core().RESTClient().GetRateLimiter() != nil {
metrics.RegisterMetricAndTrackRateLimiterUsage("replication_controller", kubeClient.Core().RESTClient().GetRateLimiter())
}
@@ -135,7 +131,6 @@ func NewReplicationManager(podInformer coreinformers.PodInformer, rcInformer cor
rm.podListerSynced = podInformer.Informer().HasSynced
rm.syncHandler = rm.syncReplicationController
rm.lookupCache = controller.NewMatchingCache(lookupCacheSize)
return rm
}
@@ -167,71 +162,19 @@ func (rm *ReplicationManager) Run(workers int, stopCh <-chan struct{}) {
glog.Infof("Shutting down RC Manager")
}
// getPodController returns the controller managing the given pod.
// TODO: Surface that we are ignoring multiple controllers for a single pod.
// TODO: use ownerReference.Controller to determine if the rc controls the pod.
func (rm *ReplicationManager) getPodController(pod *v1.Pod) *v1.ReplicationController {
// look up in the cache, if cached and the cache is valid, just return cached value
if obj, cached := rm.lookupCache.GetMatchingObject(pod); cached {
controller, ok := obj.(*v1.ReplicationController)
if !ok {
// This should not happen
utilruntime.HandleError(fmt.Errorf("lookup cache does not return a ReplicationController object"))
return nil
}
if cached && rm.isCacheValid(pod, controller) {
return controller
}
}
// if not cached or cached value is invalid, search all the rc to find the matching one, and update cache
controllers, err := rm.rcLister.GetPodControllers(pod)
// getPodControllers returns a list of ReplicationControllers matching the given pod.
func (rm *ReplicationManager) getPodControllers(pod *v1.Pod) []*v1.ReplicationController {
rcs, err := rm.rcLister.GetPodControllers(pod)
if err != nil {
glog.V(4).Infof("No controllers found for pod %v, replication manager will avoid syncing", pod.Name)
glog.V(4).Infof("No ReplicationControllers found for pod %v, controller will avoid syncing", pod.Name)
return nil
}
// In theory, overlapping controllers is user error. This sorting will not prevent
// oscillation of replicas in all cases, eg:
// rc1 (older rc): [(k1=v1)], replicas=1 rc2: [(k2=v2)], replicas=2
// pod: [(k1:v1), (k2:v2)] will wake both rc1 and rc2, and we will sync rc1.
// pod: [(k2:v2)] will wake rc2 which creates a new replica.
if len(controllers) > 1 {
// More than two items in this list indicates user error. If two replication-controller
// overlap, sort by creation timestamp, subsort by name, then pick
// the first.
utilruntime.HandleError(fmt.Errorf("user error! more than one replication controller is selecting pods with labels: %+v", pod.Labels))
sort.Sort(OverlappingControllers(controllers))
if len(rcs) > 1 {
// ControllerRef will ensure we don't do anything crazy, but more than one
// item in this list nevertheless constitutes user error.
utilruntime.HandleError(fmt.Errorf("user error! more than one ReplicationController is selecting pods with labels: %+v", pod.Labels))
}
// update lookup cache
rm.lookupCache.Update(pod, controllers[0])
return controllers[0]
}
// isCacheValid check if the cache is valid
func (rm *ReplicationManager) isCacheValid(pod *v1.Pod, cachedRC *v1.ReplicationController) bool {
_, err := rm.rcLister.ReplicationControllers(cachedRC.Namespace).Get(cachedRC.Name)
// rc has been deleted or updated, cache is invalid
if err != nil || !isControllerMatch(pod, cachedRC) {
return false
}
return true
}
// isControllerMatch take a Pod and ReplicationController, return whether the Pod and ReplicationController are matching
// TODO(mqliang): This logic is a copy from GetPodControllers(), remove the duplication
func isControllerMatch(pod *v1.Pod, rc *v1.ReplicationController) bool {
if rc.Namespace != pod.Namespace {
return false
}
selector := labels.Set(rc.Spec.Selector).AsSelectorPreValidated()
// If an rc with a nil or empty selector creeps in, it should match nothing, not everything.
if selector.Empty() || !selector.Matches(labels.Set(pod.Labels)) {
return false
}
return true
return rcs
}
// callback when RC is updated
@@ -239,20 +182,6 @@ func (rm *ReplicationManager) updateRC(old, cur interface{}) {
oldRC := old.(*v1.ReplicationController)
curRC := cur.(*v1.ReplicationController)
// We should invalidate the whole lookup cache if a RC's selector has been updated.
//
// Imagine that you have two RCs:
// * old RC1
// * new RC2
// You also have a pod that is attached to RC2 (because it doesn't match RC1 selector).
// Now imagine that you are changing RC1 selector so that it is now matching that pod,
// in such case, we must invalidate the whole cache so that pod could be adopted by RC1
//
// This makes the lookup cache less helpful, but selector update does not happen often,
// so it's not a big problem
if !reflect.DeepEqual(oldRC.Spec.Selector, curRC.Spec.Selector) {
rm.lookupCache.InvalidateAll()
}
// TODO: Remove when #31981 is resolved!
glog.Infof("Observed updated replication controller %v. Desired pod count change: %d->%d", curRC.Name, *(oldRC.Spec.Replicas), *(curRC.Spec.Replicas))
@@ -275,19 +204,10 @@ func (rm *ReplicationManager) updateRC(old, cur interface{}) {
rm.enqueueController(cur)
}
// When a pod is created, enqueue the controller that manages it and update it's expectations.
// When a pod is created, enqueue the ReplicationController that manages it and update its expectations.
func (rm *ReplicationManager) addPod(obj interface{}) {
pod := obj.(*v1.Pod)
rc := rm.getPodController(pod)
if rc == nil {
return
}
rcKey, err := controller.KeyFunc(rc)
if err != nil {
utilruntime.HandleError(fmt.Errorf("Couldn't get key for replication controller %#v: %v", rc, err))
return
}
glog.V(4).Infof("Pod %s created: %#v.", pod.Name, pod)
if pod.DeletionTimestamp != nil {
// on a restart of the controller manager, it's possible a new pod shows up in a state that
@@ -295,13 +215,38 @@ func (rm *ReplicationManager) addPod(obj interface{}) {
rm.deletePod(pod)
return
}
rm.expectations.CreationObserved(rcKey)
rm.enqueueController(rc)
// If it has a ControllerRef, that's all that matters.
if controllerRef := controller.GetControllerOf(pod); controllerRef != nil {
if controllerRef.Kind != controllerKind.Kind {
// It's controlled by a different type of controller.
return
}
rc, err := rm.rcLister.ReplicationControllers(pod.Namespace).Get(controllerRef.Name)
if err != nil {
return
}
rsKey, err := controller.KeyFunc(rc)
if err != nil {
return
}
rm.expectations.CreationObserved(rsKey)
rm.enqueueController(rc)
return
}
// Otherwise, it's an orphan. Get a list of all matching ReplicationControllers and sync
// them to see if anyone wants to adopt it.
// DO NOT observe creation because no controller should be waiting for an
// orphan.
for _, rc := range rm.getPodControllers(pod) {
rm.enqueueController(rc)
}
}
// When a pod is updated, figure out what controller/s manage it and wake them
// When a pod is updated, figure out what ReplicationController/s manage it and wake them
// up. If the labels of the pod have changed we need to awaken both the old
// and new controller. old and cur must be *v1.Pod types.
// and new ReplicationController. old and cur must be *v1.Pod types.
func (rm *ReplicationManager) updatePod(old, cur interface{}) {
curPod := cur.(*v1.Pod)
oldPod := old.(*v1.Pod)
@@ -311,6 +256,7 @@ func (rm *ReplicationManager) updatePod(old, cur interface{}) {
return
}
glog.V(4).Infof("Pod %s updated, objectMeta %+v -> %+v.", curPod.Name, oldPod.ObjectMeta, curPod.ObjectMeta)
labelChanged := !reflect.DeepEqual(curPod.Labels, oldPod.Labels)
if curPod.DeletionTimestamp != nil {
// when a pod is deleted gracefully it's deletion timestamp is first modified to reflect a grace period,
@@ -326,34 +272,53 @@ func (rm *ReplicationManager) updatePod(old, cur interface{}) {
return
}
// Only need to get the old controller if the labels changed.
// Enqueue the oldRC before the curRC to give curRC a chance to adopt the oldPod.
if labelChanged {
// If the old and new rc are the same, the first one that syncs
// will set expectations preventing any damage from the second.
if oldRC := rm.getPodController(oldPod); oldRC != nil {
rm.enqueueController(oldRC)
curControllerRef := controller.GetControllerOf(curPod)
oldControllerRef := controller.GetControllerOf(oldPod)
controllerRefChanged := !reflect.DeepEqual(curControllerRef, oldControllerRef)
if controllerRefChanged &&
oldControllerRef != nil && oldControllerRef.Kind == controllerKind.Kind {
// The ControllerRef was changed. Sync the old controller, if any.
rc, err := rm.rcLister.ReplicationControllers(oldPod.Namespace).Get(oldControllerRef.Name)
if err == nil {
rm.enqueueController(rc)
}
}
changedToReady := !v1.IsPodReady(oldPod) && v1.IsPodReady(curPod)
if curRC := rm.getPodController(curPod); curRC != nil {
rm.enqueueController(curRC)
// If it has a ControllerRef, that's all that matters.
if curControllerRef != nil {
if curControllerRef.Kind != controllerKind.Kind {
// It's controlled by a different type of controller.
return
}
rc, err := rm.rcLister.ReplicationControllers(curPod.Namespace).Get(curControllerRef.Name)
if err != nil {
return
}
rm.enqueueController(rc)
// TODO: MinReadySeconds in the Pod will generate an Available condition to be added in
// the Pod status which in turn will trigger a requeue of the owning replication controller
// thus having its status updated with the newly available replica. For now, we can fake the
// update by resyncing the controller MinReadySeconds after the it is requeued because a Pod
// transitioned to Ready.
// the Pod status which in turn will trigger a requeue of the owning ReplicationController thus
// having its status updated with the newly available replica. For now, we can fake the
// update by resyncing the controller MinReadySeconds after the it is requeued because
// a Pod transitioned to Ready.
// Note that this still suffers from #29229, we are just moving the problem one level
// "closer" to kubelet (from the deployment to the replication controller manager).
if changedToReady && curRC.Spec.MinReadySeconds > 0 {
glog.V(2).Infof("ReplicationController %q will be enqueued after %ds for availability check", curRC.Name, curRC.Spec.MinReadySeconds)
rm.enqueueControllerAfter(curRC, time.Duration(curRC.Spec.MinReadySeconds)*time.Second)
// "closer" to kubelet (from the deployment to the ReplicationController controller).
if !v1.IsPodReady(oldPod) && v1.IsPodReady(curPod) && rc.Spec.MinReadySeconds > 0 {
glog.V(2).Infof("ReplicationController %q will be enqueued after %ds for availability check", rc.Name, rc.Spec.MinReadySeconds)
rm.enqueueControllerAfter(rc, time.Duration(rc.Spec.MinReadySeconds)*time.Second)
}
return
}
// Otherwise, it's an orphan. If anything changed, sync matching controllers
// to see if anyone wants to adopt it now.
if labelChanged || controllerRefChanged {
for _, rc := range rm.getPodControllers(curPod) {
rm.enqueueController(rc)
}
}
}
// When a pod is deleted, enqueue the controller that manages the pod and update its expectations.
// When a pod is deleted, enqueue the ReplicationController that manages the pod and update its expectations.
// obj could be an *v1.Pod, or a DeletionFinalStateUnknown marker item.
func (rm *ReplicationManager) deletePod(obj interface{}) {
pod, ok := obj.(*v1.Pod)
@@ -361,45 +326,50 @@ func (rm *ReplicationManager) deletePod(obj interface{}) {
// When a delete is dropped, the relist will notice a pod in the store not
// in the list, leading to the insertion of a tombstone object which contains
// the deleted key/value. Note that this value might be stale. If the pod
// changed labels the new rc will not be woken up till the periodic resync.
// changed labels the new ReplicationController will not be woken up till the periodic resync.
if !ok {
tombstone, ok := obj.(cache.DeletedFinalStateUnknown)
if !ok {
utilruntime.HandleError(fmt.Errorf("Couldn't get object from tombstone %#v", obj))
utilruntime.HandleError(fmt.Errorf("couldn't get object from tombstone %+v", obj))
return
}
pod, ok = tombstone.Obj.(*v1.Pod)
if !ok {
utilruntime.HandleError(fmt.Errorf("Tombstone contained object that is not a pod %#v", obj))
utilruntime.HandleError(fmt.Errorf("tombstone contained object that is not a pod %#v", obj))
return
}
}
glog.V(4).Infof("Pod %s/%s deleted through %v, timestamp %+v, labels %+v.", pod.Namespace, pod.Name, utilruntime.GetCaller(), pod.DeletionTimestamp, pod.Labels)
if rc := rm.getPodController(pod); rc != nil {
rcKey, err := controller.KeyFunc(rc)
if err != nil {
utilruntime.HandleError(fmt.Errorf("Couldn't get key for replication controller %#v: %v", rc, err))
return
}
rm.expectations.DeletionObserved(rcKey, controller.PodKey(pod))
rm.enqueueController(rc)
glog.V(4).Infof("Pod %s/%s deleted through %v, timestamp %+v: %#v.", pod.Namespace, pod.Name, utilruntime.GetCaller(), pod.DeletionTimestamp, pod)
controllerRef := controller.GetControllerOf(pod)
if controllerRef == nil {
// No controller should care about orphans being deleted.
return
}
if controllerRef.Kind != controllerKind.Kind {
// It's controlled by a different type of controller.
return
}
rc, err := rm.rcLister.ReplicationControllers(pod.Namespace).Get(controllerRef.Name)
if err != nil {
return
}
rsKey, err := controller.KeyFunc(rc)
if err != nil {
return
}
rm.expectations.DeletionObserved(rsKey, controller.PodKey(pod))
rm.enqueueController(rc)
}
// obj could be an *v1.ReplicationController, or a DeletionFinalStateUnknown marker item.
func (rm *ReplicationManager) enqueueController(obj interface{}) {
key, err := controller.KeyFunc(obj)
if err != nil {
utilruntime.HandleError(fmt.Errorf("Couldn't get key for object %+v: %v", obj, err))
utilruntime.HandleError(fmt.Errorf("couldn't get key for object %+v: %v", obj, err))
return
}
// TODO: Handle overlapping controllers better. Either disallow them at admission time or
// deterministically avoid syncing controllers that fight over pods. Currently, we only
// ensure that the same controller is synced for a given pod. When we periodically relist
// all controllers there will still be some replica instability. One way to handle this is
// by querying the store for all controllers that this rc overlaps, as well as all
// controllers that overlap this rc, and sorting them.
rm.queue.Add(key)
}
@@ -407,16 +377,9 @@ func (rm *ReplicationManager) enqueueController(obj interface{}) {
func (rm *ReplicationManager) enqueueControllerAfter(obj interface{}, after time.Duration) {
key, err := controller.KeyFunc(obj)
if err != nil {
utilruntime.HandleError(fmt.Errorf("Couldn't get key for object %+v: %v", obj, err))
utilruntime.HandleError(fmt.Errorf("couldn't get key for object %+v: %v", obj, err))
return
}
// TODO: Handle overlapping controllers better. Either disallow them at admission time or
// deterministically avoid syncing controllers that fight over pods. Currently, we only
// ensure that the same controller is synced for a given pod. When we periodically relist
// all controllers there will still be some replica instability. One way to handle this is
// by querying the store for all controllers that this rc overlaps, as well as all
// controllers that overlap this rc, and sorting them.
rm.queue.AddAfter(key, after)
}
@@ -481,8 +444,8 @@ func (rm *ReplicationManager) manageReplicas(filteredPods []*v1.Pod, rc *v1.Repl
var err error
boolPtr := func(b bool) *bool { return &b }
controllerRef := &metav1.OwnerReference{
APIVersion: getRCKind().GroupVersion().String(),
Kind: getRCKind().Kind,
APIVersion: controllerKind.GroupVersion().String(),
Kind: controllerKind.Kind,
Name: rc.Name,
UID: rc.UID,
BlockOwnerDeletion: boolPtr(true),
@@ -610,7 +573,7 @@ func (rm *ReplicationManager) syncReplicationController(key string) error {
rm.queue.Add(key)
return err
}
cm := controller.NewPodControllerRefManager(rm.podControl, rc, labels.Set(rc.Spec.Selector).AsSelectorPreValidated(), getRCKind())
cm := controller.NewPodControllerRefManager(rm.podControl, rc, labels.Set(rc.Spec.Selector).AsSelectorPreValidated(), controllerKind)
filteredPods, err = cm.ClaimPods(pods)
if err != nil {
// Something went wrong with adoption or release.