Merge pull request #123238 from MirrorShih/master
Fix infinite loop and replace channel with queue
This commit is contained in:
		| @@ -106,13 +106,6 @@ type CIDRAllocatorParams struct { | ||||
| 	NodeCIDRMaskSizes []int | ||||
| } | ||||
|  | ||||
| // CIDRs are reserved, then node resource is patched with them. | ||||
| // nodeReservedCIDRs holds the reservation info for a node. | ||||
| type nodeReservedCIDRs struct { | ||||
| 	allocatedCIDRs []*net.IPNet | ||||
| 	nodeName       string | ||||
| } | ||||
|  | ||||
| // New creates a new CIDR range allocator. | ||||
| func New(ctx context.Context, kubeClient clientset.Interface, cloud cloudprovider.Interface, nodeInformer informers.NodeInformer, allocatorType CIDRAllocatorType, allocatorParams CIDRAllocatorParams) (CIDRAllocator, error) { | ||||
| 	nodeList, err := listNodes(ctx, kubeClient) | ||||
|   | ||||
| @@ -20,16 +20,16 @@ import ( | ||||
| 	"context" | ||||
| 	"fmt" | ||||
| 	"net" | ||||
| 	"sync" | ||||
| 	"time" | ||||
|  | ||||
| 	v1 "k8s.io/api/core/v1" | ||||
| 	"k8s.io/apimachinery/pkg/util/wait" | ||||
| 	"k8s.io/klog/v2" | ||||
| 	netutils "k8s.io/utils/net" | ||||
|  | ||||
| 	apierrors "k8s.io/apimachinery/pkg/api/errors" | ||||
| 	"k8s.io/apimachinery/pkg/types" | ||||
| 	utilruntime "k8s.io/apimachinery/pkg/util/runtime" | ||||
| 	"k8s.io/apimachinery/pkg/util/sets" | ||||
| 	informers "k8s.io/client-go/informers/core/v1" | ||||
| 	clientset "k8s.io/client-go/kubernetes" | ||||
| 	"k8s.io/client-go/kubernetes/scheme" | ||||
| @@ -37,6 +37,7 @@ import ( | ||||
| 	corelisters "k8s.io/client-go/listers/core/v1" | ||||
| 	"k8s.io/client-go/tools/cache" | ||||
| 	"k8s.io/client-go/tools/record" | ||||
| 	"k8s.io/client-go/util/workqueue" | ||||
| 	nodeutil "k8s.io/component-helpers/node/util" | ||||
| 	"k8s.io/kubernetes/pkg/controller/nodeipam/ipam/cidrset" | ||||
| 	controllerutil "k8s.io/kubernetes/pkg/controller/util/node" | ||||
| @@ -52,14 +53,12 @@ type rangeAllocator struct { | ||||
| 	nodeLister corelisters.NodeLister | ||||
| 	// nodesSynced returns true if the node shared informer has been synced at least once. | ||||
| 	nodesSynced cache.InformerSynced | ||||
| 	// Channel that is used to pass updating Nodes and their reserved CIDRs to the background | ||||
| 	// This increases a throughput of CIDR assignment by not blocking on long operations. | ||||
| 	nodeCIDRUpdateChannel chan nodeReservedCIDRs | ||||
| 	broadcaster record.EventBroadcaster | ||||
| 	recorder    record.EventRecorder | ||||
| 	// Keep a set of nodes that are currently being processed to avoid races in CIDR allocation | ||||
| 	lock              sync.Mutex | ||||
| 	nodesInProcessing sets.String | ||||
|  | ||||
| 	// queues are where incoming work is placed to de-dup and to allow "easy" | ||||
| 	// rate limited requeues on errors | ||||
| 	queue workqueue.RateLimitingInterface | ||||
| } | ||||
|  | ||||
| // NewCIDRRangeAllocator returns a CIDRAllocator to allocate CIDRs for node (one from each of clusterCIDRs) | ||||
| @@ -94,10 +93,9 @@ func NewCIDRRangeAllocator(ctx context.Context, client clientset.Interface, node | ||||
| 		cidrSets:     cidrSets, | ||||
| 		nodeLister:   nodeInformer.Lister(), | ||||
| 		nodesSynced:  nodeInformer.Informer().HasSynced, | ||||
| 		nodeCIDRUpdateChannel: make(chan nodeReservedCIDRs, cidrUpdateQueueSize), | ||||
| 		broadcaster:  eventBroadcaster, | ||||
| 		recorder:     recorder, | ||||
| 		nodesInProcessing:     sets.NewString(), | ||||
| 		queue:        workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "cidrallocator_node"), | ||||
| 	} | ||||
|  | ||||
| 	if allocatorParams.ServiceCIDR != nil { | ||||
| @@ -130,37 +128,33 @@ func NewCIDRRangeAllocator(ctx context.Context, client clientset.Interface, node | ||||
| 	} | ||||
|  | ||||
| 	nodeInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ | ||||
| 		AddFunc: controllerutil.CreateAddNodeHandler(func(node *v1.Node) error { | ||||
| 			return ra.AllocateOrOccupyCIDR(logger, node) | ||||
| 		}), | ||||
| 		UpdateFunc: controllerutil.CreateUpdateNodeHandler(func(_, newNode *v1.Node) error { | ||||
| 			// If the PodCIDRs list is not empty we either: | ||||
| 			// - already processed a Node that already had CIDRs after NC restarted | ||||
| 			//   (cidr is marked as used), | ||||
| 			// - already processed a Node successfully and allocated CIDRs for it | ||||
| 			//   (cidr is marked as used), | ||||
| 			// - already processed a Node but we did saw a "timeout" response and | ||||
| 			//   request eventually got through in this case we haven't released | ||||
| 			//   the allocated CIDRs (cidr is still marked as used). | ||||
| 			// There's a possible error here: | ||||
| 			// - NC sees a new Node and assigns CIDRs X,Y.. to it, | ||||
| 			// - Update Node call fails with a timeout, | ||||
| 			// - Node is updated by some other component, NC sees an update and | ||||
| 			//   assigns CIDRs A,B.. to the Node, | ||||
| 			// - Both CIDR X,Y.. and CIDR A,B.. are marked as used in the local cache, | ||||
| 			//   even though Node sees only CIDR A,B.. | ||||
| 			// The problem here is that in in-memory cache we see CIDR X,Y.. as marked, | ||||
| 			// which prevents it from being assigned to any new node. The cluster | ||||
| 			// state is correct. | ||||
| 			// Restart of NC fixes the issue. | ||||
| 			if len(newNode.Spec.PodCIDRs) == 0 { | ||||
| 				return ra.AllocateOrOccupyCIDR(logger, newNode) | ||||
| 		AddFunc: func(obj interface{}) { | ||||
| 			key, err := cache.MetaNamespaceKeyFunc(obj) | ||||
| 			if err == nil { | ||||
| 				ra.queue.Add(key) | ||||
| 			} | ||||
| 			return nil | ||||
| 		}), | ||||
| 		DeleteFunc: controllerutil.CreateDeleteNodeHandler(logger, func(node *v1.Node) error { | ||||
| 			return ra.ReleaseCIDR(logger, node) | ||||
| 		}), | ||||
| 		}, | ||||
| 		UpdateFunc: func(old, new interface{}) { | ||||
| 			key, err := cache.MetaNamespaceKeyFunc(new) | ||||
| 			if err == nil { | ||||
| 				ra.queue.Add(key) | ||||
| 			} | ||||
| 		}, | ||||
| 		DeleteFunc: func(obj interface{}) { | ||||
| 			// The informer cache no longer has the object, and since Node doesn't have a finalizer, | ||||
| 			// we don't see the Update with DeletionTimestamp != 0. | ||||
| 			// TODO: instead of executing the operation directly in the handler, build a small cache with key node.Name | ||||
| 			// and value PodCIDRs use ReleaseCIDR on the reconcile loop so we can retry on `ReleaseCIDR` failures. | ||||
| 			if err := ra.ReleaseCIDR(logger, obj.(*v1.Node)); err != nil { | ||||
| 				utilruntime.HandleError(fmt.Errorf("error while processing CIDR Release: %w", err)) | ||||
| 			} | ||||
| 			// IndexerInformer uses a delta nodeQueue, therefore for deletes we have to use this | ||||
| 			// key function. | ||||
| 			key, err := cache.DeletionHandlingMetaNamespaceKeyFunc(obj) | ||||
| 			if err == nil { | ||||
| 				ra.queue.Add(key) | ||||
| 			} | ||||
| 		}, | ||||
| 	}) | ||||
|  | ||||
| 	return ra, nil | ||||
| @@ -176,6 +170,8 @@ func (r *rangeAllocator) Run(ctx context.Context) { | ||||
| 	r.broadcaster.StartRecordingToSink(&v1core.EventSinkImpl{Interface: r.client.CoreV1().Events("")}) | ||||
| 	defer r.broadcaster.Shutdown() | ||||
|  | ||||
| 	defer r.queue.ShutDown() | ||||
|  | ||||
| 	logger.Info("Starting range CIDR allocator") | ||||
| 	defer logger.Info("Shutting down range CIDR allocator") | ||||
|  | ||||
| @@ -184,50 +180,100 @@ func (r *rangeAllocator) Run(ctx context.Context) { | ||||
| 	} | ||||
|  | ||||
| 	for i := 0; i < cidrUpdateWorkers; i++ { | ||||
| 		go r.worker(ctx) | ||||
| 		go wait.UntilWithContext(ctx, r.runWorker, time.Second) | ||||
| 	} | ||||
|  | ||||
| 	<-ctx.Done() | ||||
| } | ||||
|  | ||||
| func (r *rangeAllocator) worker(ctx context.Context) { | ||||
| 	logger := klog.FromContext(ctx) | ||||
| 	for { | ||||
| 		select { | ||||
| 		case workItem, ok := <-r.nodeCIDRUpdateChannel: | ||||
| 			if !ok { | ||||
| 				logger.Info("Channel nodeCIDRUpdateChannel was unexpectedly closed") | ||||
| 				return | ||||
| 			} | ||||
| 			if err := r.updateCIDRsAllocation(logger, workItem); err != nil { | ||||
| 				// Requeue the failed node for update again. | ||||
| 				r.nodeCIDRUpdateChannel <- workItem | ||||
| 			} | ||||
| 		case <-ctx.Done(): | ||||
| 			return | ||||
| 		} | ||||
| // runWorker is a long-running function that will continually call the | ||||
| // processNextWorkItem function in order to read and process a message on the | ||||
| // queue. | ||||
| func (r *rangeAllocator) runWorker(ctx context.Context) { | ||||
| 	for r.processNextNodeWorkItem(ctx) { | ||||
| 	} | ||||
| } | ||||
|  | ||||
| func (r *rangeAllocator) insertNodeToProcessing(nodeName string) bool { | ||||
| 	r.lock.Lock() | ||||
| 	defer r.lock.Unlock() | ||||
| 	if r.nodesInProcessing.Has(nodeName) { | ||||
| // processNextWorkItem will read a single work item off the queue and | ||||
| // attempt to process it, by calling the syncHandler. | ||||
| func (r *rangeAllocator) processNextNodeWorkItem(ctx context.Context) bool { | ||||
| 	obj, shutdown := r.queue.Get() | ||||
| 	if shutdown { | ||||
| 		return false | ||||
| 	} | ||||
| 	r.nodesInProcessing.Insert(nodeName) | ||||
|  | ||||
| 	// We wrap this block in a func so we can defer r.queue.Done. | ||||
| 	err := func(logger klog.Logger, obj interface{}) error { | ||||
| 		// We call Done here so the workNodeQueue knows we have finished | ||||
| 		// processing this item. We also must remember to call Forget if we | ||||
| 		// do not want this work item being re-queued. For example, we do | ||||
| 		// not call Forget if a transient error occurs, instead the item is | ||||
| 		// put back on the queue and attempted again after a back-off | ||||
| 		// period. | ||||
| 		defer r.queue.Done(obj) | ||||
| 		var key string | ||||
| 		var ok bool | ||||
| 		// We expect strings to come off the workNodeQueue. These are of the | ||||
| 		// form namespace/name. We do this as the delayed nature of the | ||||
| 		// workNodeQueue means the items in the informer cache may actually be | ||||
| 		// more up to date that when the item was initially put onto the | ||||
| 		// workNodeQueue. | ||||
| 		if key, ok = obj.(string); !ok { | ||||
| 			// As the item in the workNodeQueue is actually invalid, we call | ||||
| 			// Forget here else we'd go into a loop of attempting to | ||||
| 			// process a work item that is invalid. | ||||
| 			r.queue.Forget(obj) | ||||
| 			utilruntime.HandleError(fmt.Errorf("expected string in workNodeQueue but got %#v", obj)) | ||||
| 			return nil | ||||
| 		} | ||||
| 		// Run the syncHandler, passing it the namespace/name string of the | ||||
| 		// Foo resource to be synced. | ||||
| 		if err := r.syncNode(logger, key); err != nil { | ||||
| 			// Put the item back on the queue to handle any transient errors. | ||||
| 			r.queue.AddRateLimited(key) | ||||
| 			return fmt.Errorf("error syncing '%s': %s, requeuing", key, err.Error()) | ||||
| 		} | ||||
| 		// Finally, if no error occurs we Forget this item so it does not | ||||
| 		// get queue again until another change happens. | ||||
| 		r.queue.Forget(obj) | ||||
| 		logger.Info("Successfully synced", "key", key) | ||||
| 		return nil | ||||
| 	}(klog.FromContext(ctx), obj) | ||||
|  | ||||
| 	if err != nil { | ||||
| 		utilruntime.HandleError(err) | ||||
| 		return true | ||||
| 	} | ||||
|  | ||||
| 	return true | ||||
| } | ||||
|  | ||||
| func (r *rangeAllocator) removeNodeFromProcessing(nodeName string) { | ||||
| 	r.lock.Lock() | ||||
| 	defer r.lock.Unlock() | ||||
| 	r.nodesInProcessing.Delete(nodeName) | ||||
| func (r *rangeAllocator) syncNode(logger klog.Logger, key string) error { | ||||
| 	startTime := time.Now() | ||||
| 	defer func() { | ||||
| 		logger.V(4).Info("Finished syncing Node request", "node", key, "elapsed", time.Since(startTime)) | ||||
| 	}() | ||||
|  | ||||
| 	node, err := r.nodeLister.Get(key) | ||||
| 	if apierrors.IsNotFound(err) { | ||||
| 		logger.V(3).Info("node has been deleted", "node", key) | ||||
| 		// TODO: obtain the node object information to call ReleaseCIDR from here | ||||
| 		// and retry if there is an error. | ||||
| 		return nil | ||||
| 	} | ||||
| 	if err != nil { | ||||
| 		return err | ||||
| 	} | ||||
| 	// Check the DeletionTimestamp to determine if object is under deletion. | ||||
| 	if !node.DeletionTimestamp.IsZero() { | ||||
| 		logger.V(3).Info("node is being deleted", "node", key) | ||||
| 		return r.ReleaseCIDR(logger, node) | ||||
| 	} | ||||
| 	return r.AllocateOrOccupyCIDR(logger, node) | ||||
| } | ||||
|  | ||||
| // marks node.PodCIDRs[...] as used in allocator's tracked cidrSet | ||||
| func (r *rangeAllocator) occupyCIDRs(node *v1.Node) error { | ||||
| 	defer r.removeNodeFromProcessing(node.Name) | ||||
| 	if len(node.Spec.PodCIDRs) == 0 { | ||||
| 		return nil | ||||
| 	} | ||||
| @@ -257,34 +303,25 @@ func (r *rangeAllocator) AllocateOrOccupyCIDR(logger klog.Logger, node *v1.Node) | ||||
| 	if node == nil { | ||||
| 		return nil | ||||
| 	} | ||||
| 	if !r.insertNodeToProcessing(node.Name) { | ||||
| 		logger.V(2).Info("Node is already in a process of CIDR assignment", "node", klog.KObj(node)) | ||||
| 		return nil | ||||
| 	} | ||||
|  | ||||
| 	if len(node.Spec.PodCIDRs) > 0 { | ||||
| 		return r.occupyCIDRs(node) | ||||
| 	} | ||||
| 	// allocate and queue the assignment | ||||
| 	allocated := nodeReservedCIDRs{ | ||||
| 		nodeName:       node.Name, | ||||
| 		allocatedCIDRs: make([]*net.IPNet, len(r.cidrSets)), | ||||
| 	} | ||||
|  | ||||
| 	allocatedCIDRs := make([]*net.IPNet, len(r.cidrSets)) | ||||
|  | ||||
| 	for idx := range r.cidrSets { | ||||
| 		podCIDR, err := r.cidrSets[idx].AllocateNext() | ||||
| 		if err != nil { | ||||
| 			r.removeNodeFromProcessing(node.Name) | ||||
| 			controllerutil.RecordNodeStatusChange(logger, r.recorder, node, "CIDRNotAvailable") | ||||
| 			return fmt.Errorf("failed to allocate cidr from cluster cidr at idx:%v: %v", idx, err) | ||||
| 		} | ||||
| 		allocated.allocatedCIDRs[idx] = podCIDR | ||||
| 		allocatedCIDRs[idx] = podCIDR | ||||
| 	} | ||||
|  | ||||
| 	//queue the assignment | ||||
| 	logger.V(4).Info("Putting node with CIDR into the work queue", "node", klog.KObj(node), "CIDRs", allocated.allocatedCIDRs) | ||||
| 	r.nodeCIDRUpdateChannel <- allocated | ||||
| 	return nil | ||||
| 	logger.V(4).Info("Putting node with CIDR into the work queue", "node", klog.KObj(node), "CIDRs", allocatedCIDRs) | ||||
| 	return r.updateCIDRsAllocation(logger, node.Name, allocatedCIDRs) | ||||
| } | ||||
|  | ||||
| // ReleaseCIDR marks node.podCIDRs[...] as unused in our tracked cidrSets | ||||
| @@ -336,21 +373,20 @@ func (r *rangeAllocator) filterOutServiceRange(logger klog.Logger, serviceCIDR * | ||||
| } | ||||
|  | ||||
| // updateCIDRsAllocation assigns CIDR to Node and sends an update to the API server. | ||||
| func (r *rangeAllocator) updateCIDRsAllocation(logger klog.Logger, data nodeReservedCIDRs) error { | ||||
| func (r *rangeAllocator) updateCIDRsAllocation(logger klog.Logger, nodeName string, allocatedCIDRs []*net.IPNet) error { | ||||
| 	var err error | ||||
| 	var node *v1.Node | ||||
| 	defer r.removeNodeFromProcessing(data.nodeName) | ||||
| 	cidrsString := ipnetToStringList(data.allocatedCIDRs) | ||||
| 	node, err = r.nodeLister.Get(data.nodeName) | ||||
| 	cidrsString := ipnetToStringList(allocatedCIDRs) | ||||
| 	node, err = r.nodeLister.Get(nodeName) | ||||
| 	if err != nil { | ||||
| 		logger.Error(err, "Failed while getting node for updating Node.Spec.PodCIDRs", "node", klog.KRef("", data.nodeName)) | ||||
| 		logger.Error(err, "Failed while getting node for updating Node.Spec.PodCIDRs", "node", klog.KRef("", nodeName)) | ||||
| 		return err | ||||
| 	} | ||||
|  | ||||
| 	// if cidr list matches the proposed. | ||||
| 	// then we possibly updated this node | ||||
| 	// and just failed to ack the success. | ||||
| 	if len(node.Spec.PodCIDRs) == len(data.allocatedCIDRs) { | ||||
| 	if len(node.Spec.PodCIDRs) == len(allocatedCIDRs) { | ||||
| 		match := true | ||||
| 		for idx, cidr := range cidrsString { | ||||
| 			if node.Spec.PodCIDRs[idx] != cidr { | ||||
| @@ -359,7 +395,7 @@ func (r *rangeAllocator) updateCIDRsAllocation(logger klog.Logger, data nodeRese | ||||
| 			} | ||||
| 		} | ||||
| 		if match { | ||||
| 			logger.V(4).Info("Node already has allocated CIDR. It matches the proposed one", "node", klog.KObj(node), "CIDRs", data.allocatedCIDRs) | ||||
| 			logger.V(4).Info("Node already has allocated CIDR. It matches the proposed one", "node", klog.KObj(node), "CIDRs", allocatedCIDRs) | ||||
| 			return nil | ||||
| 		} | ||||
| 	} | ||||
| @@ -367,7 +403,7 @@ func (r *rangeAllocator) updateCIDRsAllocation(logger klog.Logger, data nodeRese | ||||
| 	// node has cidrs, release the reserved | ||||
| 	if len(node.Spec.PodCIDRs) != 0 { | ||||
| 		logger.Error(nil, "Node already has a CIDR allocated. Releasing the new one", "node", klog.KObj(node), "podCIDRs", node.Spec.PodCIDRs) | ||||
| 		for idx, cidr := range data.allocatedCIDRs { | ||||
| 		for idx, cidr := range allocatedCIDRs { | ||||
| 			if releaseErr := r.cidrSets[idx].Release(cidr); releaseErr != nil { | ||||
| 				logger.Error(releaseErr, "Error when releasing CIDR", "index", idx, "CIDR", cidr) | ||||
| 			} | ||||
| @@ -390,7 +426,7 @@ func (r *rangeAllocator) updateCIDRsAllocation(logger klog.Logger, data nodeRese | ||||
| 	// NodeController restart will return all falsely allocated CIDRs to the pool. | ||||
| 	if !apierrors.IsServerTimeout(err) { | ||||
| 		logger.Error(err, "CIDR assignment for node failed. Releasing allocated CIDR", "node", klog.KObj(node)) | ||||
| 		for idx, cidr := range data.allocatedCIDRs { | ||||
| 		for idx, cidr := range allocatedCIDRs { | ||||
| 			if releaseErr := r.cidrSets[idx].Release(cidr); releaseErr != nil { | ||||
| 				logger.Error(releaseErr, "Error releasing allocated CIDR for node", "node", klog.KObj(node)) | ||||
| 			} | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 Kubernetes Prow Robot
					Kubernetes Prow Robot