diff --git a/cmd/cloud-controller-manager/nodeipamcontroller.go b/cmd/cloud-controller-manager/nodeipamcontroller.go index 49b5996839a..19b41e4a20a 100644 --- a/cmd/cloud-controller-manager/nodeipamcontroller.go +++ b/cmd/cloud-controller-manager/nodeipamcontroller.go @@ -62,11 +62,11 @@ func (nodeIpamController *nodeIPAMController) StartNodeIpamControllerWrapper(ini nodeIpamController.nodeIPAMControllerOptions.ApplyTo(&nodeIpamController.nodeIPAMControllerConfiguration) return func(ctx context.Context, controllerContext genericcontrollermanager.ControllerContext) (controller.Interface, bool, error) { - return startNodeIpamController(initContext, completedConfig, nodeIpamController.nodeIPAMControllerConfiguration, controllerContext, cloud) + return startNodeIpamController(ctx, initContext, completedConfig, nodeIpamController.nodeIPAMControllerConfiguration, controllerContext, cloud) } } -func startNodeIpamController(initContext app.ControllerInitContext, ccmConfig *cloudcontrollerconfig.CompletedConfig, nodeIPAMConfig nodeipamconfig.NodeIPAMControllerConfiguration, ctx genericcontrollermanager.ControllerContext, cloud cloudprovider.Interface) (controller.Interface, bool, error) { +func startNodeIpamController(ctx context.Context, initContext app.ControllerInitContext, ccmConfig *cloudcontrollerconfig.CompletedConfig, nodeIPAMConfig nodeipamconfig.NodeIPAMControllerConfiguration, controllerCtx genericcontrollermanager.ControllerContext, cloud cloudprovider.Interface) (controller.Interface, bool, error) { var serviceCIDR *net.IPNet var secondaryServiceCIDR *net.IPNet @@ -130,14 +130,14 @@ func startNodeIpamController(initContext app.ControllerInitContext, ccmConfig *c var clusterCIDRInformer v1alpha1.ClusterCIDRInformer if utilfeature.DefaultFeatureGate.Enabled(features.MultiCIDRRangeAllocator) { - clusterCIDRInformer = ctx.InformerFactory.Networking().V1alpha1().ClusterCIDRs() + clusterCIDRInformer = controllerCtx.InformerFactory.Networking().V1alpha1().ClusterCIDRs() } nodeIpamController, err := nodeipamcontroller.NewNodeIpamController( - ctx.InformerFactory.Core().V1().Nodes(), + ctx, + controllerCtx.InformerFactory.Core().V1().Nodes(), clusterCIDRInformer, cloud, - ctx.ClientBuilder.ClientOrDie(initContext.ClientName), + controllerCtx.ClientBuilder.ClientOrDie(initContext.ClientName), clusterCIDRs, serviceCIDR, secondaryServiceCIDR, @@ -147,7 +147,7 @@ func startNodeIpamController(initContext app.ControllerInitContext, ccmConfig *c if err != nil { return nil, true, err } - go nodeIpamController.Run(ctx.Stop) + go nodeIpamController.Run(ctx) return nil, true, nil } diff --git a/cmd/kube-controller-manager/app/core.go b/cmd/kube-controller-manager/app/core.go index ce68226c99a..e665b3f8125 100644 --- a/cmd/kube-controller-manager/app/core.go +++ b/cmd/kube-controller-manager/app/core.go @@ -154,7 +154,9 @@ func startNodeIpamController(ctx context.Context, controllerContext ControllerCo clusterCIDRInformer = controllerContext.InformerFactory.Networking().V1alpha1().ClusterCIDRs() } + ctx = klog.NewContext(ctx, klog.LoggerWithName(klog.FromContext(ctx), "NodeIpamController")) nodeIpamController, err := nodeipamcontroller.NewNodeIpamController( + ctx, controllerContext.InformerFactory.Core().V1().Nodes(), clusterCIDRInformer, controllerContext.Cloud, @@ -168,7 +170,7 @@ func startNodeIpamController(ctx context.Context, controllerContext ControllerCo if err != nil { return nil, true, err } - go nodeIpamController.RunWithMetrics(ctx.Done(), controllerContext.ControllerManagerMetrics) + go 
nodeIpamController.RunWithMetrics(ctx, controllerContext.ControllerManagerMetrics) return nil, true, nil } diff --git a/pkg/controller/nodeipam/ipam/adapter.go b/pkg/controller/nodeipam/ipam/adapter.go index 82de5b24fa0..8995170fe00 100644 --- a/pkg/controller/nodeipam/ipam/adapter.go +++ b/pkg/controller/nodeipam/ipam/adapter.go @@ -61,7 +61,7 @@ func newAdapter(k8s clientset.Interface, cloud *gce.Cloud) *adapter { return ret } -func (a *adapter) Run(stopCh <-chan struct{}) { +func (a *adapter) Run(ctx context.Context) { defer utilruntime.HandleCrash() // Start event processing pipeline. @@ -69,7 +69,7 @@ func (a *adapter) Run(stopCh <-chan struct{}) { a.broadcaster.StartRecordingToSink(&v1core.EventSinkImpl{Interface: a.k8s.CoreV1().Events("")}) defer a.broadcaster.Shutdown() - <-stopCh + <-ctx.Done() } func (a *adapter) Alias(ctx context.Context, node *v1.Node) (*net.IPNet, error) { @@ -88,7 +88,7 @@ func (a *adapter) Alias(ctx context.Context, node *v1.Node) (*net.IPNet, error) case 1: break default: - klog.Warningf("Node %q has more than one alias assigned (%v), defaulting to the first", node.Name, cidrs) + klog.FromContext(ctx).Info("Node has more than one alias assigned, defaulting to the first", "node", klog.KObj(node), "CIDRs", cidrs) } _, cidrRange, err := netutils.ParseCIDRSloppy(cidrs[0]) diff --git a/pkg/controller/nodeipam/ipam/cidr_allocator.go b/pkg/controller/nodeipam/ipam/cidr_allocator.go index 4ca058eefcf..13fc26ddfff 100644 --- a/pkg/controller/nodeipam/ipam/cidr_allocator.go +++ b/pkg/controller/nodeipam/ipam/cidr_allocator.go @@ -91,11 +91,11 @@ type CIDRAllocator interface { // AllocateOrOccupyCIDR looks at the given node, assigns it a valid // CIDR if it doesn't currently have one or mark the CIDR as used if // the node already have one. - AllocateOrOccupyCIDR(node *v1.Node) error + AllocateOrOccupyCIDR(logger klog.Logger, node *v1.Node) error // ReleaseCIDR releases the CIDR of the removed node. - ReleaseCIDR(node *v1.Node) error + ReleaseCIDR(logger klog.Logger, node *v1.Node) error // Run starts all the working logic of the allocator. - Run(stopCh <-chan struct{}) + Run(ctx context.Context) } // CIDRAllocatorParams is parameters that's required for creating new @@ -119,29 +119,30 @@ type nodeReservedCIDRs struct { } // New creates a new CIDR range allocator. 
-func New(kubeClient clientset.Interface, cloud cloudprovider.Interface, nodeInformer informers.NodeInformer, clusterCIDRInformer networkinginformers.ClusterCIDRInformer, allocatorType CIDRAllocatorType, allocatorParams CIDRAllocatorParams) (CIDRAllocator, error) { - nodeList, err := listNodes(kubeClient) +func New(ctx context.Context, kubeClient clientset.Interface, cloud cloudprovider.Interface, nodeInformer informers.NodeInformer, clusterCIDRInformer networkinginformers.ClusterCIDRInformer, allocatorType CIDRAllocatorType, allocatorParams CIDRAllocatorParams) (CIDRAllocator, error) { + logger := klog.FromContext(ctx) + nodeList, err := listNodes(logger, kubeClient) if err != nil { return nil, err } switch allocatorType { case RangeAllocatorType: - return NewCIDRRangeAllocator(kubeClient, nodeInformer, allocatorParams, nodeList) + return NewCIDRRangeAllocator(logger, kubeClient, nodeInformer, allocatorParams, nodeList) case MultiCIDRRangeAllocatorType: if !utilfeature.DefaultFeatureGate.Enabled(features.MultiCIDRRangeAllocator) { return nil, fmt.Errorf("invalid CIDR allocator type: %v, feature gate %v must be enabled", allocatorType, features.MultiCIDRRangeAllocator) } - return NewMultiCIDRRangeAllocator(kubeClient, nodeInformer, clusterCIDRInformer, allocatorParams, nodeList, nil) + return NewMultiCIDRRangeAllocator(ctx, kubeClient, nodeInformer, clusterCIDRInformer, allocatorParams, nodeList, nil) case CloudAllocatorType: - return NewCloudCIDRAllocator(kubeClient, cloud, nodeInformer) + return NewCloudCIDRAllocator(logger, kubeClient, cloud, nodeInformer) default: return nil, fmt.Errorf("invalid CIDR allocator type: %v", allocatorType) } } -func listNodes(kubeClient clientset.Interface) (*v1.NodeList, error) { +func listNodes(logger klog.Logger, kubeClient clientset.Interface) (*v1.NodeList, error) { var nodeList *v1.NodeList // We must poll because apiserver might not be up. This error causes // controller manager to restart. 
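The hunks above show the core of the contextual-logging migration: New now takes a context.Context, derives its logger via klog.FromContext, and hands either the context or the logger down to the allocators it constructs, while the kube-controller-manager caller names the logger once with klog.LoggerWithName and stores it back with klog.NewContext. The following is a minimal, self-contained sketch of that wiring; startExample, runExample, and the main harness are illustrative names, not code from this patch.

package main

import (
	"context"

	"k8s.io/klog/v2"
)

// startExample mirrors the caller side: it attaches a component-specific
// name to the logger once and stores the result in the context.
func startExample(ctx context.Context) {
	logger := klog.LoggerWithName(klog.FromContext(ctx), "NodeIpamController")
	ctx = klog.NewContext(ctx, logger)
	runExample(ctx)
}

// runExample mirrors the callee side: it retrieves the named logger from the
// context instead of calling the global klog functions.
func runExample(ctx context.Context) {
	logger := klog.FromContext(ctx)
	logger.Info("Starting controller")
	defer logger.Info("Shutting down controller")
	<-ctx.Done()
}

func main() {
	ctx, cancel := context.WithCancel(context.Background())
	cancel() // cancel immediately so the sketch terminates
	startExample(ctx)
}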
@@ -152,7 +153,7 @@ func listNodes(kubeClient clientset.Interface) (*v1.NodeList, error) { LabelSelector: labels.Everything().String(), }) if err != nil { - klog.Errorf("Failed to list all nodes: %v", err) + logger.Error(err, "Failed to list all nodes") return false, nil } return true, nil diff --git a/pkg/controller/nodeipam/ipam/cidrset/cidr_set_test.go b/pkg/controller/nodeipam/ipam/cidrset/cidr_set_test.go index 4175695253d..3d8a9b8dbc1 100644 --- a/pkg/controller/nodeipam/ipam/cidrset/cidr_set_test.go +++ b/pkg/controller/nodeipam/ipam/cidrset/cidr_set_test.go @@ -23,7 +23,6 @@ import ( "testing" "k8s.io/component-base/metrics/testutil" - "k8s.io/klog/v2" netutils "k8s.io/utils/net" ) @@ -558,34 +557,35 @@ func TestGetBitforCIDR(t *testing.T) { } for _, tc := range cases { - _, clusterCIDR, err := netutils.ParseCIDRSloppy(tc.clusterCIDRStr) - if err != nil { - t.Fatalf("unexpected error: %v for %v", err, tc.description) - } + t.Run(tc.description, func(t *testing.T) { + _, clusterCIDR, err := netutils.ParseCIDRSloppy(tc.clusterCIDRStr) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } - cs, err := NewCIDRSet(clusterCIDR, tc.subNetMaskSize) - if err != nil { - t.Fatalf("Error allocating CIDRSet for %v", tc.description) - } - _, subnetCIDR, err := netutils.ParseCIDRSloppy(tc.subNetCIDRStr) - if err != nil { - t.Fatalf("unexpected error: %v for %v", err, tc.description) - } + cs, err := NewCIDRSet(clusterCIDR, tc.subNetMaskSize) + if err != nil { + t.Fatalf("Error allocating CIDRSet") + } + _, subnetCIDR, err := netutils.ParseCIDRSloppy(tc.subNetCIDRStr) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + got, err := cs.getIndexForCIDR(subnetCIDR) + if err == nil && tc.expectErr { + t.Errorf("expected error but got null") + return + } - got, err := cs.getIndexForCIDR(subnetCIDR) - if err == nil && tc.expectErr { - klog.Errorf("expected error but got null for %v", tc.description) - continue - } + if err != nil && !tc.expectErr { + t.Errorf("unexpected error: %v", err) + return + } - if err != nil && !tc.expectErr { - klog.Errorf("unexpected error: %v for %v", err, tc.description) - continue - } - - if got != tc.expectedBit { - klog.Errorf("expected %v, but got %v for %v", tc.expectedBit, got, tc.description) - } + if got != tc.expectedBit { + t.Errorf("expected %v, but got %v", tc.expectedBit, got) + } + }) } } diff --git a/pkg/controller/nodeipam/ipam/cloud_cidr_allocator.go b/pkg/controller/nodeipam/ipam/cloud_cidr_allocator.go index 04d72ca5bc7..0102b8b87e0 100644 --- a/pkg/controller/nodeipam/ipam/cloud_cidr_allocator.go +++ b/pkg/controller/nodeipam/ipam/cloud_cidr_allocator.go @@ -20,6 +20,7 @@ limitations under the License. package ipam import ( + "context" "fmt" "math/rand" "net" @@ -27,6 +28,7 @@ import ( "time" "github.com/google/go-cmp/cmp" + "k8s.io/klog/v2" v1 "k8s.io/api/core/v1" @@ -85,9 +87,10 @@ type cloudCIDRAllocator struct { var _ CIDRAllocator = (*cloudCIDRAllocator)(nil) // NewCloudCIDRAllocator creates a new cloud CIDR allocator. 
-func NewCloudCIDRAllocator(client clientset.Interface, cloud cloudprovider.Interface, nodeInformer informers.NodeInformer) (CIDRAllocator, error) { +func NewCloudCIDRAllocator(logger klog.Logger, client clientset.Interface, cloud cloudprovider.Interface, nodeInformer informers.NodeInformer) (CIDRAllocator, error) { if client == nil { - klog.Fatalf("kubeClient is nil when starting NodeController") + logger.Error(nil, "kubeClient is nil when starting cloud CIDR allocator") + klog.FlushAndExit(klog.ExitFlushTimeout, 1) } eventBroadcaster := record.NewBroadcaster() @@ -111,74 +114,80 @@ func NewCloudCIDRAllocator(client clientset.Interface, cloud cloudprovider.Inter } nodeInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ - AddFunc: controllerutil.CreateAddNodeHandler(ca.AllocateOrOccupyCIDR), + AddFunc: controllerutil.CreateAddNodeHandler( + func(node *v1.Node) error { + return ca.AllocateOrOccupyCIDR(logger, node) + }), UpdateFunc: controllerutil.CreateUpdateNodeHandler(func(_, newNode *v1.Node) error { if newNode.Spec.PodCIDR == "" { - return ca.AllocateOrOccupyCIDR(newNode) + return ca.AllocateOrOccupyCIDR(logger, newNode) } // Even if PodCIDR is assigned, but NetworkUnavailable condition is // set to true, we need to process the node to set the condition. networkUnavailableTaint := &v1.Taint{Key: v1.TaintNodeNetworkUnavailable, Effect: v1.TaintEffectNoSchedule} _, cond := controllerutil.GetNodeCondition(&newNode.Status, v1.NodeNetworkUnavailable) if cond == nil || cond.Status != v1.ConditionFalse || utiltaints.TaintExists(newNode.Spec.Taints, networkUnavailableTaint) { - return ca.AllocateOrOccupyCIDR(newNode) + return ca.AllocateOrOccupyCIDR(logger, newNode) } return nil }), - DeleteFunc: controllerutil.CreateDeleteNodeHandler(ca.ReleaseCIDR), + DeleteFunc: controllerutil.CreateDeleteNodeHandler(func(node *v1.Node) error { + return ca.ReleaseCIDR(logger, node) + }), }) - - klog.Infof("Using cloud CIDR allocator (provider: %v)", cloud.ProviderName()) + logger.Info("Using cloud CIDR allocator", "provider", cloud.ProviderName()) return ca, nil } -func (ca *cloudCIDRAllocator) Run(stopCh <-chan struct{}) { +func (ca *cloudCIDRAllocator) Run(ctx context.Context) { defer utilruntime.HandleCrash() // Start event processing pipeline. 
ca.broadcaster.StartStructuredLogging(0) - klog.Infof("Sending events to api server.") + logger := klog.FromContext(ctx) + logger.Info("Sending events to api server") ca.broadcaster.StartRecordingToSink(&v1core.EventSinkImpl{Interface: ca.client.CoreV1().Events("")}) defer ca.broadcaster.Shutdown() - klog.Infof("Starting cloud CIDR allocator") - defer klog.Infof("Shutting down cloud CIDR allocator") + logger.Info("Starting cloud CIDR allocator") + defer logger.Info("Shutting down cloud CIDR allocator") - if !cache.WaitForNamedCacheSync("cidrallocator", stopCh, ca.nodesSynced) { + if !cache.WaitForNamedCacheSync("cidrallocator", ctx.Done(), ca.nodesSynced) { return } for i := 0; i < cidrUpdateWorkers; i++ { - go ca.worker(stopCh) + go ca.worker(ctx) } - <-stopCh + <-ctx.Done() } -func (ca *cloudCIDRAllocator) worker(stopChan <-chan struct{}) { +func (ca *cloudCIDRAllocator) worker(ctx context.Context) { + logger := klog.FromContext(ctx) for { select { case workItem, ok := <-ca.nodeUpdateChannel: if !ok { - klog.Warning("Channel nodeCIDRUpdateChannel was unexpectedly closed") + logger.Info("Channel nodeCIDRUpdateChannel was unexpectedly closed") return } - if err := ca.updateCIDRAllocation(workItem); err == nil { - klog.V(3).Infof("Updated CIDR for %q", workItem) + if err := ca.updateCIDRAllocation(logger, workItem); err == nil { + logger.V(3).Info("Updated CIDR", "workItem", workItem) } else { - klog.Errorf("Error updating CIDR for %q: %v", workItem, err) - if canRetry, timeout := ca.retryParams(workItem); canRetry { - klog.V(2).Infof("Retrying update for %q after %v", workItem, timeout) + logger.Error(err, "Error updating CIDR", "workItem", workItem) + if canRetry, timeout := ca.retryParams(logger, workItem); canRetry { + logger.V(2).Info("Retrying update on next period", "workItem", workItem, "timeout", timeout) time.AfterFunc(timeout, func() { // Requeue the failed node for update again. ca.nodeUpdateChannel <- workItem }) continue } - klog.Errorf("Exceeded retry count for %q, dropping from queue", workItem) + logger.Error(nil, "Exceeded retry count, dropping from queue", "workItem", workItem) } ca.removeNodeFromProcessing(workItem) - case <-stopChan: + case <-ctx.Done(): return } } @@ -194,13 +203,13 @@ func (ca *cloudCIDRAllocator) insertNodeToProcessing(nodeName string) bool { return true } -func (ca *cloudCIDRAllocator) retryParams(nodeName string) (bool, time.Duration) { +func (ca *cloudCIDRAllocator) retryParams(logger klog.Logger, nodeName string) (bool, time.Duration) { ca.lock.Lock() defer ca.lock.Unlock() entry, ok := ca.nodesInProcessing[nodeName] if !ok { - klog.Errorf("Cannot get retryParams for %q as entry does not exist", nodeName) + logger.Error(nil, "Cannot get retryParams for node as entry does not exist", "node", klog.KRef("", nodeName)) return false, 0 } @@ -233,28 +242,28 @@ func (ca *cloudCIDRAllocator) removeNodeFromProcessing(nodeName string) { // WARNING: If you're adding any return calls or defer any more work from this // function you have to make sure to update nodesInProcessing properly with the // disposition of the node when the work is done. 
-func (ca *cloudCIDRAllocator) AllocateOrOccupyCIDR(node *v1.Node) error { +func (ca *cloudCIDRAllocator) AllocateOrOccupyCIDR(logger klog.Logger, node *v1.Node) error { if node == nil { return nil } if !ca.insertNodeToProcessing(node.Name) { - klog.V(2).InfoS("Node is already in a process of CIDR assignment", "node", klog.KObj(node)) + logger.V(2).Info("Node is already in a process of CIDR assignment", "node", klog.KObj(node)) return nil } - klog.V(4).Infof("Putting node %s into the work queue", node.Name) + logger.V(4).Info("Putting node into the work queue", "node", klog.KObj(node)) ca.nodeUpdateChannel <- node.Name return nil } // updateCIDRAllocation assigns CIDR to Node and sends an update to the API server. -func (ca *cloudCIDRAllocator) updateCIDRAllocation(nodeName string) error { +func (ca *cloudCIDRAllocator) updateCIDRAllocation(logger klog.Logger, nodeName string) error { node, err := ca.nodeLister.Get(nodeName) if err != nil { if errors.IsNotFound(err) { return nil // node no longer available, skip processing } - klog.ErrorS(err, "Failed while getting the node for updating Node.Spec.PodCIDR", "nodeName", nodeName) + logger.Error(err, "Failed while getting the node for updating Node.Spec.PodCIDR", "node", klog.KRef("", nodeName)) return err } if node.Spec.ProviderID == "" { @@ -272,7 +281,7 @@ func (ca *cloudCIDRAllocator) updateCIDRAllocation(nodeName string) error { } //Can have at most 2 ips (one for v4 and one for v6) if len(cidrStrings) > 2 { - klog.InfoS("Got more than 2 ips, truncating to 2", "cidrStrings", cidrStrings) + logger.Info("Got more than 2 ips, truncating to 2", "cidrStrings", cidrStrings) cidrStrings = cidrStrings[:2] } @@ -281,13 +290,13 @@ func (ca *cloudCIDRAllocator) updateCIDRAllocation(nodeName string) error { return fmt.Errorf("failed to parse strings %v as CIDRs: %v", cidrStrings, err) } - needUpdate, err := needPodCIDRsUpdate(node, cidrs) + needUpdate, err := needPodCIDRsUpdate(logger, node, cidrs) if err != nil { return fmt.Errorf("err: %v, CIDRS: %v", err, cidrStrings) } if needUpdate { if node.Spec.PodCIDR != "" { - klog.ErrorS(nil, "PodCIDR being reassigned!", "nodeName", node.Name, "podCIDRs", node.Spec.PodCIDRs, "cidrStrings", cidrStrings) + logger.Error(nil, "PodCIDR being reassigned", "node", klog.KObj(node), "podCIDRs", node.Spec.PodCIDRs, "cidrStrings", cidrStrings) // We fall through and set the CIDR despite this error. This // implements the same logic as implemented in the // rangeAllocator. 
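The allocator's Run and worker loops above swap the stop channel for a context: ctx.Done() replaces stopCh both in select statements and in cache.WaitForNamedCacheSync, and the per-call logger is pulled from the context rather than from the global klog functions. A compact sketch of the converted worker pattern follows, assuming illustrative names (worker, process, updates) that do not appear in this patch.

package main

import (
	"context"

	"k8s.io/klog/v2"
)

// worker drains an update channel until the channel is closed or the context
// is cancelled, logging through the contextual logger instead of global klog.
func worker(ctx context.Context, updates <-chan string, process func(klog.Logger, string) error) {
	logger := klog.FromContext(ctx)
	for {
		select {
		case item, ok := <-updates:
			if !ok {
				logger.Info("Update channel was unexpectedly closed")
				return
			}
			if err := process(logger, item); err != nil {
				logger.Error(err, "Error processing item", "workItem", item)
				continue
			}
			logger.V(3).Info("Processed item", "workItem", item)
		case <-ctx.Done():
			return
		}
	}
}

func main() {
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	updates := make(chan string, 1)
	updates <- "node-a"
	close(updates) // worker exits after draining the buffered item

	worker(ctx, updates, func(logger klog.Logger, item string) error {
		logger.Info("Handling work item", "workItem", item)
		return nil
	})
}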
@@ -296,14 +305,14 @@ func (ca *cloudCIDRAllocator) updateCIDRAllocation(nodeName string) error { } for i := 0; i < cidrUpdateRetries; i++ { if err = nodeutil.PatchNodeCIDRs(ca.client, types.NodeName(node.Name), cidrStrings); err == nil { - klog.InfoS("Set the node PodCIDRs", "nodeName", node.Name, "cidrStrings", cidrStrings) + logger.Info("Set the node PodCIDRs", "node", klog.KObj(node), "cidrStrings", cidrStrings) break } } } if err != nil { controllerutil.RecordNodeStatusChange(ca.recorder, node, "CIDRAssignmentFailed") - klog.ErrorS(err, "Failed to update the node PodCIDR after multiple attempts", "nodeName", node.Name, "cidrStrings", cidrStrings) + logger.Error(err, "Failed to update the node PodCIDR after multiple attempts", "node", klog.KObj(node), "cidrStrings", cidrStrings) return err } @@ -315,31 +324,31 @@ func (ca *cloudCIDRAllocator) updateCIDRAllocation(nodeName string) error { LastTransitionTime: metav1.Now(), }) if err != nil { - klog.ErrorS(err, "Error setting route status for the node", "nodeName", node.Name) + logger.Error(err, "Error setting route status for the node", "node", klog.KObj(node)) } return err } -func needPodCIDRsUpdate(node *v1.Node, podCIDRs []*net.IPNet) (bool, error) { +func needPodCIDRsUpdate(logger klog.Logger, node *v1.Node, podCIDRs []*net.IPNet) (bool, error) { if node.Spec.PodCIDR == "" { return true, nil } _, nodePodCIDR, err := netutils.ParseCIDRSloppy(node.Spec.PodCIDR) if err != nil { - klog.ErrorS(err, "Found invalid node.Spec.PodCIDR", "podCIDR", node.Spec.PodCIDR) + logger.Error(err, "Found invalid node.Spec.PodCIDR", "podCIDR", node.Spec.PodCIDR) // We will try to overwrite with new CIDR(s) return true, nil } nodePodCIDRs, err := netutils.ParseCIDRs(node.Spec.PodCIDRs) if err != nil { - klog.ErrorS(err, "Found invalid node.Spec.PodCIDRs", "podCIDRs", node.Spec.PodCIDRs) + logger.Error(err, "Found invalid node.Spec.PodCIDRs", "podCIDRs", node.Spec.PodCIDRs) // We will try to overwrite with new CIDR(s) return true, nil } if len(podCIDRs) == 1 { if cmp.Equal(nodePodCIDR, podCIDRs[0]) { - klog.V(4).InfoS("Node already has allocated CIDR. It matches the proposed one.", "nodeName", node.Name, "podCIDR", podCIDRs[0]) + logger.V(4).Info("Node already has allocated CIDR. It matches the proposed one", "node", klog.KObj(node), "podCIDR", podCIDRs[0]) return false, nil } } else if len(nodePodCIDRs) == len(podCIDRs) { @@ -351,15 +360,15 @@ func needPodCIDRsUpdate(node *v1.Node, podCIDRs []*net.IPNet) (bool, error) { return true, nil } } - klog.V(4).InfoS("Node already has allocated CIDRs. It matches the proposed one.", "nodeName", node.Name, "podCIDRs", podCIDRs) + logger.V(4).Info("Node already has allocated CIDRs. 
It matches the proposed one", "node", klog.KObj(node), "podCIDRs", podCIDRs) return false, nil } return true, nil } -func (ca *cloudCIDRAllocator) ReleaseCIDR(node *v1.Node) error { - klog.V(2).Infof("Node %v PodCIDR (%v) will be released by external cloud provider (not managed by controller)", - node.Name, node.Spec.PodCIDR) +func (ca *cloudCIDRAllocator) ReleaseCIDR(logger klog.Logger, node *v1.Node) error { + logger.V(2).Info("Node's PodCIDR will be released by external cloud provider (not managed by controller)", + "node", klog.KObj(node), "podCIDR", node.Spec.PodCIDR) return nil } diff --git a/pkg/controller/nodeipam/ipam/cloud_cidr_allocator_nolegacyproviders.go b/pkg/controller/nodeipam/ipam/cloud_cidr_allocator_nolegacyproviders.go index 94fdcb5d18a..0cf06b44693 100644 --- a/pkg/controller/nodeipam/ipam/cloud_cidr_allocator_nolegacyproviders.go +++ b/pkg/controller/nodeipam/ipam/cloud_cidr_allocator_nolegacyproviders.go @@ -25,9 +25,10 @@ import ( informers "k8s.io/client-go/informers/core/v1" clientset "k8s.io/client-go/kubernetes" cloudprovider "k8s.io/cloud-provider" + "k8s.io/klog/v2" ) // NewCloudCIDRAllocator creates a new cloud CIDR allocator. -func NewCloudCIDRAllocator(client clientset.Interface, cloud cloudprovider.Interface, nodeInformer informers.NodeInformer) (CIDRAllocator, error) { +func NewCloudCIDRAllocator(logger klog.Logger, client clientset.Interface, cloud cloudprovider.Interface, nodeInformer informers.NodeInformer) (CIDRAllocator, error) { return nil, errors.New("legacy cloud provider support not built") } diff --git a/pkg/controller/nodeipam/ipam/cloud_cidr_allocator_test.go b/pkg/controller/nodeipam/ipam/cloud_cidr_allocator_test.go index 21b65d4539c..35a1b30a559 100644 --- a/pkg/controller/nodeipam/ipam/cloud_cidr_allocator_test.go +++ b/pkg/controller/nodeipam/ipam/cloud_cidr_allocator_test.go @@ -28,6 +28,7 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/informers" "k8s.io/client-go/kubernetes/fake" + "k8s.io/klog/v2/ktesting" netutils "k8s.io/utils/net" ) @@ -42,7 +43,6 @@ func hasNodeInProcessing(ca *cloudCIDRAllocator, name string) bool { func TestBoundedRetries(t *testing.T) { clientSet := fake.NewSimpleClientset() updateChan := make(chan string, 1) // need to buffer as we are using only on go routine - stopChan := make(chan struct{}) sharedInfomer := informers.NewSharedInformerFactory(clientSet, 1*time.Hour) ca := &cloudCIDRAllocator{ client: clientSet, @@ -51,9 +51,10 @@ func TestBoundedRetries(t *testing.T) { nodesSynced: sharedInfomer.Core().V1().Nodes().Informer().HasSynced, nodesInProcessing: map[string]*nodeProcessingInfo{}, } - go ca.worker(stopChan) + logger, ctx := ktesting.NewTestContext(t) + go ca.worker(ctx) nodeName := "testNode" - ca.AllocateOrOccupyCIDR(&v1.Node{ + ca.AllocateOrOccupyCIDR(logger, &v1.Node{ ObjectMeta: metav1.ObjectMeta{ Name: nodeName, }, @@ -176,9 +177,9 @@ func TestNeedPodCIDRsUpdate(t *testing.T) { if err != nil { t.Errorf("failed to parse %v as CIDRs: %v", tc.cidrs, err) } - + logger, _ := ktesting.NewTestContext(t) t.Run(tc.desc, func(t *testing.T) { - got, err := needPodCIDRsUpdate(&node, netCIDRs) + got, err := needPodCIDRsUpdate(logger, &node, netCIDRs) if tc.wantErr == (err == nil) { t.Errorf("err: %v, wantErr: %v", err, tc.wantErr) } diff --git a/pkg/controller/nodeipam/ipam/controller_legacyprovider.go b/pkg/controller/nodeipam/ipam/controller_legacyprovider.go index d672245c478..9f389661263 100644 --- a/pkg/controller/nodeipam/ipam/controller_legacyprovider.go +++ 
b/pkg/controller/nodeipam/ipam/controller_legacyprovider.go @@ -20,6 +20,7 @@ limitations under the License. package ipam import ( + "context" "fmt" "net" "sync" @@ -113,10 +114,10 @@ func NewController( // Start initializes the Controller with the existing list of nodes and // registers the informers for node changes. This will start synchronization // of the node and cloud CIDR range allocations. -func (c *Controller) Start(nodeInformer informers.NodeInformer) error { - klog.InfoS("Starting IPAM controller", "config", c.config) +func (c *Controller) Start(logger klog.Logger, nodeInformer informers.NodeInformer) error { + logger.Info("Starting IPAM controller", "config", c.config) - nodes, err := listNodes(c.adapter.k8s) + nodes, err := listNodes(logger, c.adapter.k8s) if err != nil { return err } @@ -125,9 +126,9 @@ func (c *Controller) Start(nodeInformer informers.NodeInformer) error { _, cidrRange, err := netutils.ParseCIDRSloppy(node.Spec.PodCIDR) if err == nil { c.set.Occupy(cidrRange) - klog.V(3).InfoS("Occupying CIDR for node", "CIDR", node.Spec.PodCIDR, "node", node.Name) + logger.V(3).Info("Occupying CIDR for node", "CIDR", node.Spec.PodCIDR, "node", klog.KObj(&node)) } else { - klog.ErrorS(err, "Node has an invalid CIDR", "node", node.Name, "CIDR", node.Spec.PodCIDR) + logger.Error(err, "Node has an invalid CIDR", "node", klog.KObj(&node), "CIDR", node.Spec.PodCIDR) } } @@ -138,24 +139,30 @@ func (c *Controller) Start(nodeInformer informers.NodeInformer) error { // XXX/bowei -- stagger the start of each sync cycle. syncer := c.newSyncer(node.Name) c.syncers[node.Name] = syncer - go syncer.Loop(nil) + go syncer.Loop(logger, nil) }() } nodeInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ - AddFunc: controllerutil.CreateAddNodeHandler(c.onAdd), - UpdateFunc: controllerutil.CreateUpdateNodeHandler(c.onUpdate), - DeleteFunc: controllerutil.CreateDeleteNodeHandler(c.onDelete), + AddFunc: controllerutil.CreateAddNodeHandler(func(node *v1.Node) error { + return c.onAdd(logger, node) + }), + UpdateFunc: controllerutil.CreateUpdateNodeHandler(func(_, newNode *v1.Node) error { + return c.onUpdate(logger, newNode) + }), + DeleteFunc: controllerutil.CreateDeleteNodeHandler(func(node *v1.Node) error { + return c.onDelete(logger, node) + }), }) return nil } -func (c *Controller) Run(stopCh <-chan struct{}) { +func (c *Controller) Run(ctx context.Context) { defer utilruntime.HandleCrash() - go c.adapter.Run(stopCh) - <-stopCh + go c.adapter.Run(ctx) + <-ctx.Done() } // occupyServiceCIDR removes the service CIDR range from the cluster CIDR if it @@ -192,7 +199,7 @@ func (c *Controller) newSyncer(name string) *nodesync.NodeSync { return nodesync.New(ns, c.adapter, c.adapter, c.config.Mode, name, c.set) } -func (c *Controller) onAdd(node *v1.Node) error { +func (c *Controller) onAdd(logger klog.Logger, node *v1.Node) error { c.lock.Lock() defer c.lock.Unlock() @@ -200,30 +207,30 @@ func (c *Controller) onAdd(node *v1.Node) error { if !ok { syncer = c.newSyncer(node.Name) c.syncers[node.Name] = syncer - go syncer.Loop(nil) + go syncer.Loop(logger, nil) } else { - klog.InfoS("Add for node that already exists", "node", node.Name) + logger.Info("Add for node that already exists", "node", klog.KObj(node)) } syncer.Update(node) return nil } -func (c *Controller) onUpdate(_, node *v1.Node) error { +func (c *Controller) onUpdate(logger klog.Logger, node *v1.Node) error { c.lock.Lock() defer c.lock.Unlock() if sync, ok := c.syncers[node.Name]; ok { sync.Update(node) } else { - 
klog.ErrorS(nil, "Received update for non-existent node", "node", node.Name) + logger.Error(nil, "Received update for non-existent node", "node", klog.KObj(node)) return fmt.Errorf("unknown node %q", node.Name) } return nil } -func (c *Controller) onDelete(node *v1.Node) error { +func (c *Controller) onDelete(logger klog.Logger, node *v1.Node) error { c.lock.Lock() defer c.lock.Unlock() @@ -231,7 +238,7 @@ func (c *Controller) onDelete(node *v1.Node) error { syncer.Delete(node) delete(c.syncers, node.Name) } else { - klog.InfoS("Node was already deleted", "node", node.Name) + logger.Info("Node was already deleted", "node", klog.KObj(node)) } return nil diff --git a/pkg/controller/nodeipam/ipam/multi_cidr_range_allocator.go b/pkg/controller/nodeipam/ipam/multi_cidr_range_allocator.go index 3acae69f3db..256e339e935 100644 --- a/pkg/controller/nodeipam/ipam/multi_cidr_range_allocator.go +++ b/pkg/controller/nodeipam/ipam/multi_cidr_range_allocator.go @@ -102,6 +102,7 @@ type multiCIDRRangeAllocator struct { // Caller must always pass in a list of existing nodes to the new allocator. // NodeList is only nil in testing. func NewMultiCIDRRangeAllocator( + ctx context.Context, client clientset.Interface, nodeInformer informers.NodeInformer, clusterCIDRInformer networkinginformers.ClusterCIDRInformer, @@ -109,8 +110,10 @@ func NewMultiCIDRRangeAllocator( nodeList *v1.NodeList, testCIDRMap map[string][]*cidrset.ClusterCIDR, ) (CIDRAllocator, error) { + logger := klog.FromContext(ctx) if client == nil { - klog.Fatalf("client is nil") + logger.Error(nil, "kubeClient is nil when starting multi CIDRRangeAllocator") + klog.FlushAndExit(klog.ExitFlushTimeout, 1) } eventBroadcaster := record.NewBroadcaster() @@ -137,10 +140,10 @@ func NewMultiCIDRRangeAllocator( // testCIDRMap is only set for testing purposes. if len(testCIDRMap) > 0 { ra.cidrMap = testCIDRMap - klog.Warningf("testCIDRMap should only be set for testing purposes, if this is seen in production logs, it might be a misconfiguration or a bug.") + logger.Info("TestCIDRMap should only be set for testing purposes, if this is seen in production logs, it might be a misconfiguration or a bug") } - ccList, err := listClusterCIDRs(client) + ccList, err := listClusterCIDRs(ctx, client) if err != nil { return nil, err } @@ -148,14 +151,14 @@ func NewMultiCIDRRangeAllocator( if ccList == nil { ccList = &networkingv1alpha1.ClusterCIDRList{} } - createDefaultClusterCIDR(ccList, allocatorParams) + createDefaultClusterCIDR(logger, ccList, allocatorParams) // Regenerate the cidrMaps from the existing ClusterCIDRs. for _, clusterCIDR := range ccList.Items { - klog.Infof("Regenerating existing ClusterCIDR: %v", clusterCIDR) + logger.Info("Regenerating existing ClusterCIDR", "clusterCIDR", clusterCIDR) // Create an event for invalid ClusterCIDRs, do not crash on failures. - if err := ra.reconcileBootstrap(&clusterCIDR); err != nil { - klog.Errorf("Error while regenerating existing ClusterCIDR: %v", err) + if err := ra.reconcileBootstrap(ctx, &clusterCIDR); err != nil { + logger.Error(err, "Error while regenerating existing ClusterCIDR") ra.recorder.Event(&clusterCIDR, "Warning", "InvalidClusterCIDR encountered while regenerating ClusterCIDR during bootstrap.", err.Error()) } } @@ -184,25 +187,25 @@ func NewMultiCIDRRangeAllocator( }) if allocatorParams.ServiceCIDR != nil { - ra.filterOutServiceRange(allocatorParams.ServiceCIDR) + ra.filterOutServiceRange(logger, allocatorParams.ServiceCIDR) } else { - klog.Info("No Service CIDR provided. 
Skipping filtering out service addresses.") + logger.Info("No Service CIDR provided. Skipping filtering out service addresses") } if allocatorParams.SecondaryServiceCIDR != nil { - ra.filterOutServiceRange(allocatorParams.SecondaryServiceCIDR) + ra.filterOutServiceRange(logger, allocatorParams.SecondaryServiceCIDR) } else { - klog.Info("No Secondary Service CIDR provided. Skipping filtering out secondary service addresses.") + logger.Info("No Secondary Service CIDR provided. Skipping filtering out secondary service addresses") } if nodeList != nil { for _, node := range nodeList.Items { if len(node.Spec.PodCIDRs) == 0 { - klog.V(4).Infof("Node %v has no CIDR, ignoring", node.Name) + logger.V(4).Info("Node has no CIDR, ignoring", "node", klog.KObj(&node)) continue } - klog.Infof("Node %v has CIDR %s, occupying it in CIDR map", node.Name, node.Spec.PodCIDRs) - if err := ra.occupyCIDRs(&node); err != nil { + logger.Info("Node has CIDR, occupying it in CIDR map", "node", klog.KObj(&node), "podCIDRs", node.Spec.PodCIDRs) + if err := ra.occupyCIDRs(logger, &node); err != nil { // This will happen if: // 1. We find garbage in the podCIDRs field. Retrying is useless. // 2. CIDR out of range: This means ClusterCIDR is not yet created @@ -231,7 +234,7 @@ func NewMultiCIDRRangeAllocator( // we don't see the Update with DeletionTimestamp != 0. // TODO: instead of executing the operation directly in the handler, build a small cache with key node.Name // and value PodCIDRs use ReleaseCIDR on the reconcile loop so we can retry on `ReleaseCIDR` failures. - ra.ReleaseCIDR(obj.(*v1.Node)) + ra.ReleaseCIDR(logger, obj.(*v1.Node)) // IndexerInformer uses a delta nodeQueue, therefore for deletes we have to use this // key function. key, err := cache.DeletionHandlingMetaNamespaceKeyFunc(obj) @@ -244,51 +247,52 @@ func NewMultiCIDRRangeAllocator( return ra, nil } -func (r *multiCIDRRangeAllocator) Run(stopCh <-chan struct{}) { +func (r *multiCIDRRangeAllocator) Run(ctx context.Context) { defer utilruntime.HandleCrash() // Start event processing pipeline. + logger := klog.FromContext(ctx) r.broadcaster.StartStructuredLogging(0) - klog.Infof("Started sending events to API Server.") + logger.Info("Started sending events to API Server") r.broadcaster.StartRecordingToSink(&v1core.EventSinkImpl{Interface: r.client.CoreV1().Events("")}) defer r.broadcaster.Shutdown() defer r.cidrQueue.ShutDown() defer r.nodeQueue.ShutDown() - klog.Infof("Starting Multi CIDR Range allocator") - defer klog.Infof("Shutting down Multi CIDR Range allocator") + logger.Info("Starting Multi CIDR Range allocator") + defer logger.Info("Shutting down Multi CIDR Range allocator") - if !cache.WaitForNamedCacheSync("multi_cidr_range_allocator", stopCh, r.nodesSynced, r.clusterCIDRSynced) { + if !cache.WaitForNamedCacheSync("multi_cidr_range_allocator", ctx.Done(), r.nodesSynced, r.clusterCIDRSynced) { return } for i := 0; i < cidrUpdateWorkers; i++ { - go wait.Until(r.runCIDRWorker, time.Second, stopCh) - go wait.Until(r.runNodeWorker, time.Second, stopCh) + go wait.UntilWithContext(ctx, r.runCIDRWorker, time.Second) + go wait.UntilWithContext(ctx, r.runNodeWorker, time.Second) } - <-stopCh + <-ctx.Done() } // runWorker is a long-running function that will continually call the // processNextWorkItem function in order to read and process a message on the // cidrQueue. 
-func (r *multiCIDRRangeAllocator) runCIDRWorker() { - for r.processNextCIDRWorkItem() { +func (r *multiCIDRRangeAllocator) runCIDRWorker(ctx context.Context) { + for r.processNextCIDRWorkItem(ctx) { } } // processNextWorkItem will read a single work item off the cidrQueue and // attempt to process it, by calling the syncHandler. -func (r *multiCIDRRangeAllocator) processNextCIDRWorkItem() bool { +func (r *multiCIDRRangeAllocator) processNextCIDRWorkItem(ctx context.Context) bool { obj, shutdown := r.cidrQueue.Get() if shutdown { return false } // We wrap this block in a func so we can defer c.cidrQueue.Done. - err := func(obj interface{}) error { + err := func(ctx context.Context, obj interface{}) error { // We call Done here so the cidrQueue knows we have finished // processing this item. We also must remember to call Forget if we // do not want this work item being re-queued. For example, we do @@ -313,7 +317,7 @@ func (r *multiCIDRRangeAllocator) processNextCIDRWorkItem() bool { } // Run the syncHandler, passing it the namespace/name string of the // Foo resource to be synced. - if err := r.syncClusterCIDR(key); err != nil { + if err := r.syncClusterCIDR(ctx, key); err != nil { // Put the item back on the cidrQueue to handle any transient errors. r.cidrQueue.AddRateLimited(key) return fmt.Errorf("error syncing '%s': %s, requeuing", key, err.Error()) @@ -323,7 +327,7 @@ func (r *multiCIDRRangeAllocator) processNextCIDRWorkItem() bool { r.cidrQueue.Forget(obj) klog.Infof("Successfully synced '%s'", key) return nil - }(obj) + }(ctx, obj) if err != nil { utilruntime.HandleError(err) @@ -333,21 +337,21 @@ func (r *multiCIDRRangeAllocator) processNextCIDRWorkItem() bool { return true } -func (r *multiCIDRRangeAllocator) runNodeWorker() { - for r.processNextNodeWorkItem() { +func (r *multiCIDRRangeAllocator) runNodeWorker(ctx context.Context) { + for r.processNextNodeWorkItem(ctx) { } } // processNextWorkItem will read a single work item off the cidrQueue and // attempt to process it, by calling the syncHandler. -func (r *multiCIDRRangeAllocator) processNextNodeWorkItem() bool { +func (r *multiCIDRRangeAllocator) processNextNodeWorkItem(ctx context.Context) bool { obj, shutdown := r.nodeQueue.Get() if shutdown { return false } // We wrap this block in a func so we can defer c.cidrQueue.Done. - err := func(obj interface{}) error { + err := func(logger klog.Logger, obj interface{}) error { // We call Done here so the workNodeQueue knows we have finished // processing this item. We also must remember to call Forget if we // do not want this work item being re-queued. For example, we do @@ -372,7 +376,7 @@ func (r *multiCIDRRangeAllocator) processNextNodeWorkItem() bool { } // Run the syncHandler, passing it the namespace/name string of the // Foo resource to be synced. - if err := r.syncNode(key); err != nil { + if err := r.syncNode(logger, key); err != nil { // Put the item back on the cidrQueue to handle any transient errors. 
r.nodeQueue.AddRateLimited(key) return fmt.Errorf("error syncing '%s': %s, requeuing", key, err.Error()) @@ -382,7 +386,7 @@ func (r *multiCIDRRangeAllocator) processNextNodeWorkItem() bool { r.nodeQueue.Forget(obj) klog.Infof("Successfully synced '%s'", key) return nil - }(obj) + }(klog.FromContext(ctx), obj) if err != nil { utilruntime.HandleError(err) @@ -392,7 +396,7 @@ func (r *multiCIDRRangeAllocator) processNextNodeWorkItem() bool { return true } -func (r *multiCIDRRangeAllocator) syncNode(key string) error { +func (r *multiCIDRRangeAllocator) syncNode(logger klog.Logger, key string) error { startTime := time.Now() defer func() { klog.V(4).Infof("Finished syncing Node request %q (%v)", key, time.Since(startTime)) @@ -411,9 +415,9 @@ func (r *multiCIDRRangeAllocator) syncNode(key string) error { // Check the DeletionTimestamp to determine if object is under deletion. if !node.DeletionTimestamp.IsZero() { klog.V(3).Infof("node is being deleted: %v", key) - return r.ReleaseCIDR(node) + return r.ReleaseCIDR(logger, node) } - return r.AllocateOrOccupyCIDR(node) + return r.AllocateOrOccupyCIDR(logger, node) } // needToAddFinalizer checks if a finalizer should be added to the object. @@ -422,15 +426,16 @@ func needToAddFinalizer(obj metav1.Object, finalizer string) bool { finalizer, nil) } -func (r *multiCIDRRangeAllocator) syncClusterCIDR(key string) error { +func (r *multiCIDRRangeAllocator) syncClusterCIDR(ctx context.Context, key string) error { startTime := time.Now() + logger := klog.FromContext(ctx) defer func() { - klog.V(4).Infof("Finished syncing clusterCIDR request %q (%v)", key, time.Since(startTime)) + logger.V(4).Info("Finished syncing clusterCIDR request", "key", key, "latency", time.Since(startTime)) }() clusterCIDR, err := r.clusterCIDRLister.Get(key) if apierrors.IsNotFound(err) { - klog.V(3).Infof("clusterCIDR has been deleted: %v", key) + logger.V(3).Info("clusterCIDR has been deleted", "key", key) return nil } @@ -440,21 +445,20 @@ func (r *multiCIDRRangeAllocator) syncClusterCIDR(key string) error { // Check the DeletionTimestamp to determine if object is under deletion. if !clusterCIDR.DeletionTimestamp.IsZero() { - return r.reconcileDelete(clusterCIDR) + return r.reconcileDelete(ctx, clusterCIDR) } - return r.reconcileCreate(clusterCIDR) + return r.reconcileCreate(ctx, clusterCIDR) } // occupyCIDRs marks node.PodCIDRs[...] as used in allocator's tracked cidrSet. 
-func (r *multiCIDRRangeAllocator) occupyCIDRs(node *v1.Node) error { +func (r *multiCIDRRangeAllocator) occupyCIDRs(logger klog.Logger, node *v1.Node) error { err := func(node *v1.Node) error { if len(node.Spec.PodCIDRs) == 0 { return nil } - - clusterCIDRList, err := r.orderedMatchingClusterCIDRs(node, true) + clusterCIDRList, err := r.orderedMatchingClusterCIDRs(logger, node, true) if err != nil { return err } @@ -468,10 +472,10 @@ func (r *multiCIDRRangeAllocator) occupyCIDRs(node *v1.Node) error { return fmt.Errorf("failed to parse CIDR %s on Node %v: %w", cidr, node.Name, err) } - klog.Infof("occupy CIDR %s for node: %s", cidr, node.Name) + logger.Info("occupy CIDR for node", "CIDR", cidr, "node", klog.KObj(node)) if err := r.Occupy(clusterCIDR, podCIDR); err != nil { - klog.V(3).Infof("Could not occupy cidr: %v, trying next range: %w", node.Spec.PodCIDRs, err) + logger.V(3).Info("Could not occupy cidr, trying next range", "podCIDRs", node.Spec.PodCIDRs, "err", err) break } @@ -519,14 +523,14 @@ func (r *multiCIDRRangeAllocator) Occupy(clusterCIDR *cidrset.ClusterCIDR, cidr // Release marks the CIDR as free in the cidrSet used bitmap, // Also removes the CIDR from the allocatedCIDRSet. -func (r *multiCIDRRangeAllocator) Release(clusterCIDR *cidrset.ClusterCIDR, cidr *net.IPNet) error { +func (r *multiCIDRRangeAllocator) Release(logger klog.Logger, clusterCIDR *cidrset.ClusterCIDR, cidr *net.IPNet) error { currCIDRSet, err := r.associatedCIDRSet(clusterCIDR, cidr) if err != nil { return err } if err := currCIDRSet.Release(cidr); err != nil { - klog.Infof("Unable to release cidr %v in cidrSet", cidr) + logger.Info("Unable to release cidr in cidrSet", "CIDR", cidr) return err } @@ -539,7 +543,7 @@ func (r *multiCIDRRangeAllocator) Release(clusterCIDR *cidrset.ClusterCIDR, cidr // WARNING: If you're adding any return calls or defer any more work from this // function you have to make sure to update nodesInProcessing properly with the // disposition of the node when the work is done. -func (r *multiCIDRRangeAllocator) AllocateOrOccupyCIDR(node *v1.Node) error { +func (r *multiCIDRRangeAllocator) AllocateOrOccupyCIDR(logger klog.Logger, node *v1.Node) error { r.lock.Lock() defer r.lock.Unlock() @@ -548,10 +552,10 @@ func (r *multiCIDRRangeAllocator) AllocateOrOccupyCIDR(node *v1.Node) error { } if len(node.Spec.PodCIDRs) > 0 { - return r.occupyCIDRs(node) + return r.occupyCIDRs(logger, node) } - cidrs, clusterCIDR, err := r.prioritizedCIDRs(node) + cidrs, clusterCIDR, err := r.prioritizedCIDRs(logger, node) if err != nil { controllerutil.RecordNodeStatusChange(r.recorder, node, "CIDRNotAvailable") return fmt.Errorf("failed to get cidrs for node %s", node.Name) @@ -571,11 +575,11 @@ func (r *multiCIDRRangeAllocator) AllocateOrOccupyCIDR(node *v1.Node) error { clusterCIDR: clusterCIDR, } - return r.updateCIDRsAllocation(allocated) + return r.updateCIDRsAllocation(logger, allocated) } // ReleaseCIDR marks node.podCIDRs[...] as unused in our tracked cidrSets. 
-func (r *multiCIDRRangeAllocator) ReleaseCIDR(node *v1.Node) error { +func (r *multiCIDRRangeAllocator) ReleaseCIDR(logger klog.Logger, node *v1.Node) error { r.lock.Lock() defer r.lock.Unlock() @@ -583,7 +587,7 @@ func (r *multiCIDRRangeAllocator) ReleaseCIDR(node *v1.Node) error { return nil } - clusterCIDR, err := r.allocatedClusterCIDR(node) + clusterCIDR, err := r.allocatedClusterCIDR(logger, node) if err != nil { return err } @@ -594,8 +598,8 @@ func (r *multiCIDRRangeAllocator) ReleaseCIDR(node *v1.Node) error { return fmt.Errorf("failed to parse CIDR %q on Node %q: %w", cidr, node.Name, err) } - klog.Infof("release CIDR %s for node: %s", cidr, node.Name) - if err := r.Release(clusterCIDR, podCIDR); err != nil { + logger.Info("release CIDR for node", "CIDR", cidr, "node", klog.KObj(node)) + if err := r.Release(logger, clusterCIDR, podCIDR); err != nil { return fmt.Errorf("failed to release cidr %q from clusterCIDR %q for node %q: %w", cidr, clusterCIDR.Name, node.Name, err) } } @@ -608,7 +612,7 @@ func (r *multiCIDRRangeAllocator) ReleaseCIDR(node *v1.Node) error { // Marks all CIDRs with subNetMaskSize that belongs to serviceCIDR as used across all cidrs // so that they won't be assignable. -func (r *multiCIDRRangeAllocator) filterOutServiceRange(serviceCIDR *net.IPNet) { +func (r *multiCIDRRangeAllocator) filterOutServiceRange(logger klog.Logger, serviceCIDR *net.IPNet) { // Checks if service CIDR has a nonempty intersection with cluster // CIDR. It is the case if either clusterCIDR contains serviceCIDR with // clusterCIDR's Mask applied (this means that clusterCIDR contains @@ -617,7 +621,7 @@ func (r *multiCIDRRangeAllocator) filterOutServiceRange(serviceCIDR *net.IPNet) for _, clusterCIDRList := range r.cidrMap { for _, clusterCIDR := range clusterCIDRList { if err := r.occupyServiceCIDR(clusterCIDR, serviceCIDR); err != nil { - klog.Errorf("unable to occupy service CIDR: %w", err) + logger.Error(err, "Unable to occupy service CIDR") } } } @@ -645,12 +649,12 @@ func (r *multiCIDRRangeAllocator) occupyServiceCIDR(clusterCIDR *cidrset.Cluster } // updateCIDRsAllocation assigns CIDR to Node and sends an update to the API server. -func (r *multiCIDRRangeAllocator) updateCIDRsAllocation(data multiCIDRNodeReservedCIDRs) error { +func (r *multiCIDRRangeAllocator) updateCIDRsAllocation(logger klog.Logger, data multiCIDRNodeReservedCIDRs) error { err := func(data multiCIDRNodeReservedCIDRs) error { cidrsString := ipnetToStringList(data.allocatedCIDRs) node, err := r.nodeLister.Get(data.nodeName) if err != nil { - klog.Errorf("Failed while getting node %v for updating Node.Spec.PodCIDRs: %v", data.nodeName, err) + logger.Error(err, "Failed while getting node for updating Node.Spec.PodCIDRs", "node", klog.KRef("", data.nodeName)) return err } @@ -666,16 +670,16 @@ func (r *multiCIDRRangeAllocator) updateCIDRsAllocation(data multiCIDRNodeReserv } } if match { - klog.V(4).Infof("Node %q already has allocated CIDR %q. It matches the proposed one.", node.Name, data.allocatedCIDRs) + logger.V(4).Info("Node already has allocated CIDR. It matches the proposed one.", "node", klog.KObj(node), "CIDRs", data.allocatedCIDRs) return nil } } // node has cidrs allocated, release the reserved. if len(node.Spec.PodCIDRs) != 0 { - klog.Errorf("Node %q already has a CIDR allocated %q. Releasing the new one.", node.Name, node.Spec.PodCIDRs) + logger.Error(nil, "Node already has a CIDR allocated. 
Releasing the new one", "node", klog.KObj(node), "podCIDRs", node.Spec.PodCIDRs) for _, cidr := range data.allocatedCIDRs { - if err := r.Release(data.clusterCIDR, cidr); err != nil { + if err := r.Release(logger, data.clusterCIDR, cidr); err != nil { return fmt.Errorf("failed to release cidr %s from clusterCIDR %s for node: %s: %w", cidr, data.clusterCIDR.Name, node.Name, err) } } @@ -686,20 +690,20 @@ func (r *multiCIDRRangeAllocator) updateCIDRsAllocation(data multiCIDRNodeReserv for i := 0; i < cidrUpdateRetries; i++ { if err = nodeutil.PatchNodeCIDRs(r.client, types.NodeName(node.Name), cidrsString); err == nil { data.clusterCIDR.AssociatedNodes[node.Name] = true - klog.Infof("Set node %q PodCIDR to %q", node.Name, cidrsString) + logger.Info("Set node PodCIDR", "node", klog.KObj(node), "podCIDR", cidrsString) return nil } } // failed release back to the pool. - klog.Errorf("Failed to update node %q PodCIDR to %q after %d attempts: %v", node.Name, cidrsString, cidrUpdateRetries, err) + logger.Error(err, "Failed to update node PodCIDR after attempts", "node", klog.KObj(node), "podCIDR", cidrsString, "retries", cidrUpdateRetries) controllerutil.RecordNodeStatusChange(r.recorder, node, "CIDRAssignmentFailed") // We accept the fact that we may leak CIDRs here. This is safer than releasing // them in case when we don't know if request went through. // NodeController restart will return all falsely allocated CIDRs to the pool. if !apierrors.IsServerTimeout(err) { - klog.Errorf("CIDR assignment for node %q failed: %v. Releasing allocated CIDR", node.Name, err) + logger.Error(err, "CIDR assignment for node failed. Releasing allocated CIDR", "node", klog.KObj(node)) for _, cidr := range data.allocatedCIDRs { - if err := r.Release(data.clusterCIDR, cidr); err != nil { + if err := r.Release(logger, data.clusterCIDR, cidr); err != nil { return fmt.Errorf("failed to release cidr %q from clusterCIDR %q for node: %q: %w", cidr, data.clusterCIDR.Name, node.Name, err) } } @@ -732,8 +736,8 @@ func defaultNodeSelector() *v1.NodeSelector { // prioritizedCIDRs returns a list of CIDRs to be allocated to the node. // Returns 1 CIDR if single stack. // Returns 2 CIDRs , 1 from each ip family if dual stack. 
-func (r *multiCIDRRangeAllocator) prioritizedCIDRs(node *v1.Node) ([]*net.IPNet, *cidrset.ClusterCIDR, error) { - clusterCIDRList, err := r.orderedMatchingClusterCIDRs(node, true) +func (r *multiCIDRRangeAllocator) prioritizedCIDRs(logger klog.Logger, node *v1.Node) ([]*net.IPNet, *cidrset.ClusterCIDR, error) { + clusterCIDRList, err := r.orderedMatchingClusterCIDRs(logger, node, true) if err != nil { return nil, nil, fmt.Errorf("unable to get a clusterCIDR for node %s: %w", node.Name, err) } @@ -743,7 +747,7 @@ func (r *multiCIDRRangeAllocator) prioritizedCIDRs(node *v1.Node) ([]*net.IPNet, if clusterCIDR.IPv4CIDRSet != nil { cidr, err := r.allocateCIDR(clusterCIDR, clusterCIDR.IPv4CIDRSet) if err != nil { - klog.V(3).Infof("unable to allocate IPv4 CIDR, trying next range: %w", err) + logger.V(3).Info("Unable to allocate IPv4 CIDR, trying next range", "err", err) continue } cidrs = append(cidrs, cidr) @@ -752,7 +756,7 @@ func (r *multiCIDRRangeAllocator) prioritizedCIDRs(node *v1.Node) ([]*net.IPNet, if clusterCIDR.IPv6CIDRSet != nil { cidr, err := r.allocateCIDR(clusterCIDR, clusterCIDR.IPv6CIDRSet) if err != nil { - klog.V(3).Infof("unable to allocate IPv6 CIDR, trying next range: %w", err) + logger.V(3).Info("Unable to allocate IPv6 CIDR, trying next range", "err", err) continue } cidrs = append(cidrs, cidr) @@ -827,8 +831,8 @@ func (r *multiCIDRRangeAllocator) cidrOverlapWithAllocatedList(cidr *net.IPNet) } // allocatedClusterCIDR returns the ClusterCIDR from which the node CIDRs were allocated. -func (r *multiCIDRRangeAllocator) allocatedClusterCIDR(node *v1.Node) (*cidrset.ClusterCIDR, error) { - clusterCIDRList, err := r.orderedMatchingClusterCIDRs(node, false) +func (r *multiCIDRRangeAllocator) allocatedClusterCIDR(logger klog.Logger, node *v1.Node) (*cidrset.ClusterCIDR, error) { + clusterCIDRList, err := r.orderedMatchingClusterCIDRs(logger, node, false) if err != nil { return nil, fmt.Errorf("unable to get a clusterCIDR for node %s: %w", node.Name, err) } @@ -852,12 +856,12 @@ func (r *multiCIDRRangeAllocator) allocatedClusterCIDR(node *v1.Node) (*cidrset. // orderedMatchingClusterCIDRs takes `occupy` as an argument, it determines whether the function // is called during an occupy or a release operation. For a release operation, a ClusterCIDR must // be added to the matching ClusterCIDRs list, irrespective of whether the ClusterCIDR is terminating. -func (r *multiCIDRRangeAllocator) orderedMatchingClusterCIDRs(node *v1.Node, occupy bool) ([]*cidrset.ClusterCIDR, error) { +func (r *multiCIDRRangeAllocator) orderedMatchingClusterCIDRs(logger klog.Logger, node *v1.Node, occupy bool) ([]*cidrset.ClusterCIDR, error) { matchingCIDRs := make([]*cidrset.ClusterCIDR, 0) pq := make(PriorityQueue, 0) for label, clusterCIDRList := range r.cidrMap { - labelsMatch, matchCnt, err := r.matchCIDRLabels(node, label) + labelsMatch, matchCnt, err := r.matchCIDRLabels(logger, node, label) if err != nil { return nil, err } @@ -902,14 +906,14 @@ func (r *multiCIDRRangeAllocator) orderedMatchingClusterCIDRs(node *v1.Node, occ // matchCIDRLabels Matches the Node labels to CIDR Configs. // Returns true only if all the labels match, also returns the count of matching labels. 
-func (r *multiCIDRRangeAllocator) matchCIDRLabels(node *v1.Node, label string) (bool, int, error) { +func (r *multiCIDRRangeAllocator) matchCIDRLabels(logger klog.Logger, node *v1.Node, label string) (bool, int, error) { var labelSet labels.Set var matchCnt int labelsMatch := false ls, err := labels.Parse(label) if err != nil { - klog.Errorf("Unable to parse label %s to labels.Selector: %v", label, err) + logger.Error(err, "Unable to parse label to labels.Selector", "label", label) return labelsMatch, 0, err } reqs, selectable := ls.Requirements() @@ -934,7 +938,7 @@ func (r *multiCIDRRangeAllocator) matchCIDRLabels(node *v1.Node, label string) ( // createDefaultClusterCIDR creates a default ClusterCIDR if --cluster-cidr has // been configured. It converts the --cluster-cidr and --per-node-mask-size* flags // to appropriate ClusterCIDR fields. -func createDefaultClusterCIDR(existingConfigList *networkingv1alpha1.ClusterCIDRList, +func createDefaultClusterCIDR(logger klog.Logger, existingConfigList *networkingv1alpha1.ClusterCIDRList, allocatorParams CIDRAllocatorParams) { // Create default ClusterCIDR only if --cluster-cidr has been configured if len(allocatorParams.ClusterCIDRs) == 0 { @@ -944,7 +948,7 @@ func createDefaultClusterCIDR(existingConfigList *networkingv1alpha1.ClusterCIDR for _, clusterCIDR := range existingConfigList.Items { if clusterCIDR.Name == defaultClusterCIDRName { // Default ClusterCIDR already exists, no further action required. - klog.V(3).Infof("Default ClusterCIDR %s already exists", defaultClusterCIDRName) + logger.V(3).Info("Default ClusterCIDR already exists", "defaultClusterCIDRName", defaultClusterCIDRName) return } } @@ -1006,14 +1010,15 @@ func createDefaultClusterCIDR(existingConfigList *networkingv1alpha1.ClusterCIDR } // reconcileCreate handles create ClusterCIDR events. -func (r *multiCIDRRangeAllocator) reconcileCreate(clusterCIDR *networkingv1alpha1.ClusterCIDR) error { +func (r *multiCIDRRangeAllocator) reconcileCreate(ctx context.Context, clusterCIDR *networkingv1alpha1.ClusterCIDR) error { r.lock.Lock() defer r.lock.Unlock() + logger := klog.FromContext(ctx) if needToAddFinalizer(clusterCIDR, clusterCIDRFinalizer) { - klog.V(3).Infof("Creating ClusterCIDR %s", clusterCIDR.Name) - if err := r.createClusterCIDR(clusterCIDR, false); err != nil { - klog.Errorf("Unable to create ClusterCIDR %s : %v", clusterCIDR.Name, err) + logger.V(3).Info("Creating ClusterCIDR", "clusterCIDR", clusterCIDR.Name) + if err := r.createClusterCIDR(ctx, clusterCIDR, false); err != nil { + logger.Error(err, "Unable to create ClusterCIDR", "clusterCIDR", clusterCIDR.Name) return err } } @@ -1022,21 +1027,22 @@ func (r *multiCIDRRangeAllocator) reconcileCreate(clusterCIDR *networkingv1alpha // reconcileBootstrap handles creation of existing ClusterCIDRs. // adds a finalizer if not already present. -func (r *multiCIDRRangeAllocator) reconcileBootstrap(clusterCIDR *networkingv1alpha1.ClusterCIDR) error { +func (r *multiCIDRRangeAllocator) reconcileBootstrap(ctx context.Context, clusterCIDR *networkingv1alpha1.ClusterCIDR) error { r.lock.Lock() defer r.lock.Unlock() + logger := klog.FromContext(ctx) terminating := false // Create the ClusterCIDR only if the Spec has not been modified. 
if clusterCIDR.Generation > 1 { terminating = true err := fmt.Errorf("CIDRs from ClusterCIDR %s will not be used for allocation as it was modified", clusterCIDR.Name) - klog.Errorf("ClusterCIDR Modified: %v", err) + logger.Error(err, "ClusterCIDR Modified") } - klog.V(2).Infof("Creating ClusterCIDR %s during bootstrap", clusterCIDR.Name) - if err := r.createClusterCIDR(clusterCIDR, terminating); err != nil { - klog.Errorf("Unable to create ClusterCIDR %s: %v", clusterCIDR.Name, err) + logger.V(2).Info("Creating ClusterCIDR during bootstrap", "clusterCIDR", clusterCIDR.Name) + if err := r.createClusterCIDR(ctx, clusterCIDR, terminating); err != nil { + logger.Error(err, "Unable to create ClusterCIDR", "clusterCIDR", clusterCIDR.Name) return err } @@ -1044,7 +1050,7 @@ func (r *multiCIDRRangeAllocator) reconcileBootstrap(clusterCIDR *networkingv1al } // createClusterCIDR creates and maps the cidrSets in the cidrMap. -func (r *multiCIDRRangeAllocator) createClusterCIDR(clusterCIDR *networkingv1alpha1.ClusterCIDR, terminating bool) error { +func (r *multiCIDRRangeAllocator) createClusterCIDR(ctx context.Context, clusterCIDR *networkingv1alpha1.ClusterCIDR, terminating bool) error { nodeSelector, err := r.nodeSelectorKey(clusterCIDR) if err != nil { return fmt.Errorf("unable to get labelSelector key: %w", err) @@ -1069,16 +1075,17 @@ func (r *multiCIDRRangeAllocator) createClusterCIDR(clusterCIDR *networkingv1alp updatedClusterCIDR.ObjectMeta.Finalizers = append(clusterCIDR.ObjectMeta.Finalizers, clusterCIDRFinalizer) } + logger := klog.FromContext(ctx) if updatedClusterCIDR.ResourceVersion == "" { // Create is only used for creating default ClusterCIDR. - if _, err := r.client.NetworkingV1alpha1().ClusterCIDRs().Create(context.TODO(), updatedClusterCIDR, metav1.CreateOptions{}); err != nil { - klog.V(2).Infof("Error creating ClusterCIDR %s: %v", clusterCIDR.Name, err) + if _, err := r.client.NetworkingV1alpha1().ClusterCIDRs().Create(ctx, updatedClusterCIDR, metav1.CreateOptions{}); err != nil { + logger.V(2).Info("Error creating ClusterCIDR", "clusterCIDR", klog.KObj(clusterCIDR), "err", err) return err } } else { // Update the ClusterCIDR object when called from reconcileCreate. - if _, err := r.client.NetworkingV1alpha1().ClusterCIDRs().Update(context.TODO(), updatedClusterCIDR, metav1.UpdateOptions{}); err != nil { - klog.V(2).Infof("Error creating ClusterCIDR %s: %v", clusterCIDR.Name, err) + if _, err := r.client.NetworkingV1alpha1().ClusterCIDRs().Update(ctx, updatedClusterCIDR, metav1.UpdateOptions{}); err != nil { + logger.V(2).Info("Error creating ClusterCIDR", "clusterCIDR", clusterCIDR.Name, "err", err) return err } } @@ -1136,30 +1143,31 @@ func (r *multiCIDRRangeAllocator) mapClusterCIDRSet(cidrMap map[string][]*cidrse // reconcileDelete releases the assigned ClusterCIDR and removes the finalizer // if the deletion timestamp is set. 
-func (r *multiCIDRRangeAllocator) reconcileDelete(clusterCIDR *networkingv1alpha1.ClusterCIDR) error { +func (r *multiCIDRRangeAllocator) reconcileDelete(ctx context.Context, clusterCIDR *networkingv1alpha1.ClusterCIDR) error { r.lock.Lock() defer r.lock.Unlock() + logger := klog.FromContext(ctx) if slice.ContainsString(clusterCIDR.GetFinalizers(), clusterCIDRFinalizer, nil) { - klog.V(2).Infof("Releasing ClusterCIDR: %s", clusterCIDR.Name) - if err := r.deleteClusterCIDR(clusterCIDR); err != nil { - klog.V(2).Infof("Error while deleting ClusterCIDR: %+v", err) + logger.V(2).Info("Releasing ClusterCIDR", "clusterCIDR", clusterCIDR.Name) + if err := r.deleteClusterCIDR(logger, clusterCIDR); err != nil { + logger.V(2).Info("Error while deleting ClusterCIDR", "err", err) return err } // Remove the finalizer as delete is successful. cccCopy := clusterCIDR.DeepCopy() cccCopy.ObjectMeta.Finalizers = slice.RemoveString(cccCopy.ObjectMeta.Finalizers, clusterCIDRFinalizer, nil) - if _, err := r.client.NetworkingV1alpha1().ClusterCIDRs().Update(context.TODO(), cccCopy, metav1.UpdateOptions{}); err != nil { - klog.V(2).Infof("Error removing finalizer for ClusterCIDR %s: %v", clusterCIDR.Name, err) + if _, err := r.client.NetworkingV1alpha1().ClusterCIDRs().Update(ctx, cccCopy, metav1.UpdateOptions{}); err != nil { + logger.V(2).Info("Error removing finalizer for ClusterCIDR", "clusterCIDR", clusterCIDR.Name, "err", err) return err } - klog.V(2).Infof("Removed finalizer for ClusterCIDR %s", clusterCIDR.Name) + logger.V(2).Info("Removed finalizer for ClusterCIDR", "clusterCIDR", clusterCIDR.Name) } return nil } // deleteClusterCIDR Deletes and unmaps the ClusterCIDRs from the cidrMap. -func (r *multiCIDRRangeAllocator) deleteClusterCIDR(clusterCIDR *networkingv1alpha1.ClusterCIDR) error { +func (r *multiCIDRRangeAllocator) deleteClusterCIDR(logger klog.Logger, clusterCIDR *networkingv1alpha1.ClusterCIDR) error { labelSelector, err := r.nodeSelectorKey(clusterCIDR) if err != nil { @@ -1168,7 +1176,7 @@ func (r *multiCIDRRangeAllocator) deleteClusterCIDR(clusterCIDR *networkingv1alp clusterCIDRSetList, ok := r.cidrMap[labelSelector] if !ok { - klog.Infof("Label %s not found in CIDRMap, proceeding with delete", labelSelector) + logger.Info("Label not found in CIDRMap, proceeding with delete", "labelSelector", labelSelector) return nil } @@ -1196,7 +1204,7 @@ func (r *multiCIDRRangeAllocator) deleteClusterCIDR(clusterCIDR *networkingv1alp r.cidrMap[labelSelector] = clusterCIDRSetList return nil } - klog.V(2).Info("clusterCIDR not found, proceeding with delete", "Name", clusterCIDR.Name, "label", labelSelector) + logger.V(2).Info("clusterCIDR not found, proceeding with delete", "clusterCIDR", clusterCIDR.Name, "label", labelSelector) return nil } @@ -1217,7 +1225,7 @@ func (r *multiCIDRRangeAllocator) nodeSelectorKey(clusterCIDR *networkingv1alpha return nodeSelector.String(), nil } -func listClusterCIDRs(kubeClient clientset.Interface) (*networkingv1alpha1.ClusterCIDRList, error) { +func listClusterCIDRs(ctx context.Context, kubeClient clientset.Interface) (*networkingv1alpha1.ClusterCIDRList, error) { var clusterCIDRList *networkingv1alpha1.ClusterCIDRList // We must poll because apiserver might not be up. This error causes // controller manager to restart.
@@ -1230,19 +1238,20 @@ func listClusterCIDRs(kubeClient clientset.Interface) (*networkingv1alpha1.Clust Steps: 11, } + logger := klog.FromContext(ctx) if pollErr := wait.ExponentialBackoff(backoff, func() (bool, error) { var err error - clusterCIDRList, err = kubeClient.NetworkingV1alpha1().ClusterCIDRs().List(context.TODO(), metav1.ListOptions{ + clusterCIDRList, err = kubeClient.NetworkingV1alpha1().ClusterCIDRs().List(ctx, metav1.ListOptions{ FieldSelector: fields.Everything().String(), LabelSelector: labels.Everything().String(), }) if err != nil { - klog.Errorf("Failed to list all clusterCIDRs: %v", err) + logger.Error(err, "Failed to list all clusterCIDRs") return false, nil } return true, nil }); pollErr != nil { - klog.Errorf("Failed to list clusterCIDRs (after %v)", time.Now().Sub(startTimestamp)) + logger.Error(nil, "Failed to list clusterCIDRs", "latency", time.Now().Sub(startTimestamp)) return nil, fmt.Errorf("failed to list all clusterCIDRs in %v, cannot proceed without updating CIDR map", apiserverStartupGracePeriod) } diff --git a/pkg/controller/nodeipam/ipam/multi_cidr_range_allocator_test.go b/pkg/controller/nodeipam/ipam/multi_cidr_range_allocator_test.go index 72cc1408d6a..e92014d42e2 100644 --- a/pkg/controller/nodeipam/ipam/multi_cidr_range_allocator_test.go +++ b/pkg/controller/nodeipam/ipam/multi_cidr_range_allocator_test.go @@ -34,6 +34,7 @@ import ( "k8s.io/client-go/kubernetes/fake" k8stesting "k8s.io/client-go/testing" "k8s.io/client-go/tools/cache" + "k8s.io/klog/v2/ktesting" v1helper "k8s.io/kubernetes/pkg/apis/core/v1/helper" "k8s.io/kubernetes/pkg/controller" cidrset "k8s.io/kubernetes/pkg/controller/nodeipam/ipam/multicidrset" @@ -473,6 +474,7 @@ func TestMultiCIDROccupyPreExistingCIDR(t *testing.T) { } // test function + _, ctx := ktesting.NewTestContext(t) for _, tc := range testCaseMultiCIDRs { t.Run(tc.description, func(t *testing.T) { // Initialize the range allocator. 
@@ -482,7 +484,7 @@ func TestMultiCIDROccupyPreExistingCIDR(t *testing.T) { fakeClusterCIDRInformer := fakeInformerFactory.Networking().V1alpha1().ClusterCIDRs() nodeList, _ := tc.fakeNodeHandler.List(context.TODO(), metav1.ListOptions{}) - _, err := NewMultiCIDRRangeAllocator(tc.fakeNodeHandler, fakeNodeInformer, fakeClusterCIDRInformer, tc.allocatorParams, nodeList, tc.testCIDRMap) + _, err := NewMultiCIDRRangeAllocator(ctx, tc.fakeNodeHandler, fakeNodeInformer, fakeClusterCIDRInformer, tc.allocatorParams, nodeList, tc.testCIDRMap) if err == nil && tc.ctrlCreateFail { t.Fatalf("creating range allocator was expected to fail, but it did not") } @@ -1177,6 +1179,8 @@ func TestMultiCIDRAllocateOrOccupyCIDRSuccess(t *testing.T) { }, } + logger, ctx := ktesting.NewTestContext(t) + // test function testFunc := func(tc testCaseMultiCIDR) { nodeList, _ := tc.fakeNodeHandler.List(context.TODO(), metav1.ListOptions{}) @@ -1185,7 +1189,7 @@ func TestMultiCIDRAllocateOrOccupyCIDRSuccess(t *testing.T) { fakeClient := &fake.Clientset{} fakeInformerFactory := informers.NewSharedInformerFactory(fakeClient, controller.NoResyncPeriodFunc()) fakeClusterCIDRInformer := fakeInformerFactory.Networking().V1alpha1().ClusterCIDRs() - allocator, err := NewMultiCIDRRangeAllocator(tc.fakeNodeHandler, test.FakeNodeInformer(tc.fakeNodeHandler), fakeClusterCIDRInformer, tc.allocatorParams, nodeList, tc.testCIDRMap) + allocator, err := NewMultiCIDRRangeAllocator(ctx, tc.fakeNodeHandler, test.FakeNodeInformer(tc.fakeNodeHandler), fakeClusterCIDRInformer, tc.allocatorParams, nodeList, tc.testCIDRMap) if err != nil { t.Errorf("%v: failed to create CIDRRangeAllocator with error %v", tc.description, err) return @@ -1230,7 +1234,7 @@ func TestMultiCIDRAllocateOrOccupyCIDRSuccess(t *testing.T) { if node.Spec.PodCIDRs == nil { updateCount++ } - if err := allocator.AllocateOrOccupyCIDR(node); err != nil { + if err := allocator.AllocateOrOccupyCIDR(logger, node); err != nil { t.Errorf("%v: unexpected error in AllocateOrOccupyCIDR: %v", tc.description, err) } } @@ -1308,13 +1312,15 @@ func TestMultiCIDRAllocateOrOccupyCIDRFailure(t *testing.T) { }, } + logger, ctx := ktesting.NewTestContext(t) + testFunc := func(tc testCaseMultiCIDR) { fakeClient := &fake.Clientset{} fakeInformerFactory := informers.NewSharedInformerFactory(fakeClient, controller.NoResyncPeriodFunc()) fakeClusterCIDRInformer := fakeInformerFactory.Networking().V1alpha1().ClusterCIDRs() // Initialize the range allocator. 
- allocator, err := NewMultiCIDRRangeAllocator(tc.fakeNodeHandler, test.FakeNodeInformer(tc.fakeNodeHandler), fakeClusterCIDRInformer, tc.allocatorParams, nil, tc.testCIDRMap) + allocator, err := NewMultiCIDRRangeAllocator(ctx, tc.fakeNodeHandler, test.FakeNodeInformer(tc.fakeNodeHandler), fakeClusterCIDRInformer, tc.allocatorParams, nil, tc.testCIDRMap) if err != nil { t.Logf("%v: failed to create CIDRRangeAllocator with error %v", tc.description, err) } @@ -1353,7 +1359,7 @@ func TestMultiCIDRAllocateOrOccupyCIDRFailure(t *testing.T) { } } - if err := allocator.AllocateOrOccupyCIDR(tc.fakeNodeHandler.Existing[0]); err == nil { + if err := allocator.AllocateOrOccupyCIDR(logger, tc.fakeNodeHandler.Existing[0]); err == nil { t.Errorf("%v: unexpected success in AllocateOrOccupyCIDR: %v", tc.description, err) } // We don't expect any updates, so just sleep for some time @@ -1498,13 +1504,13 @@ func TestMultiCIDRReleaseCIDRSuccess(t *testing.T) { }, }, } - + logger, ctx := ktesting.NewTestContext(t) testFunc := func(tc releasetestCaseMultiCIDR) { fakeClient := &fake.Clientset{} fakeInformerFactory := informers.NewSharedInformerFactory(fakeClient, controller.NoResyncPeriodFunc()) fakeClusterCIDRInformer := fakeInformerFactory.Networking().V1alpha1().ClusterCIDRs() // Initialize the range allocator. - allocator, _ := NewMultiCIDRRangeAllocator(tc.fakeNodeHandler, test.FakeNodeInformer(tc.fakeNodeHandler), fakeClusterCIDRInformer, tc.allocatorParams, nil, tc.testCIDRMap) + allocator, _ := NewMultiCIDRRangeAllocator(ctx, tc.fakeNodeHandler, test.FakeNodeInformer(tc.fakeNodeHandler), fakeClusterCIDRInformer, tc.allocatorParams, nil, tc.testCIDRMap) rangeAllocator, ok := allocator.(*multiCIDRRangeAllocator) if !ok { t.Logf("%v: found non-default implementation of CIDRAllocator, skipping white-box test...", tc.description) @@ -1540,7 +1546,7 @@ func TestMultiCIDRReleaseCIDRSuccess(t *testing.T) { } } - err := allocator.AllocateOrOccupyCIDR(tc.fakeNodeHandler.Existing[0]) + err := allocator.AllocateOrOccupyCIDR(logger, tc.fakeNodeHandler.Existing[0]) if len(tc.expectedAllocatedCIDRFirstRound) != 0 { if err != nil { t.Fatalf("%v: unexpected error in AllocateOrOccupyCIDR: %v", tc.description, err) @@ -1570,12 +1576,12 @@ func TestMultiCIDRReleaseCIDRSuccess(t *testing.T) { }, } nodeToRelease.Spec.PodCIDRs = cidrToRelease - err = allocator.ReleaseCIDR(&nodeToRelease) + err = allocator.ReleaseCIDR(logger, &nodeToRelease) if err != nil { t.Fatalf("%v: unexpected error in ReleaseCIDR: %v", tc.description, err) } } - if err = allocator.AllocateOrOccupyCIDR(tc.fakeNodeHandler.Existing[0]); err != nil { + if err = allocator.AllocateOrOccupyCIDR(logger, tc.fakeNodeHandler.Existing[0]); err != nil { t.Fatalf("%v: unexpected error in AllocateOrOccupyCIDR: %v", tc.description, err) } if err := test.WaitForUpdatedNodeWithTimeout(tc.fakeNodeHandler, 1, wait.ForeverTestTimeout); err != nil { @@ -1614,7 +1620,7 @@ type clusterCIDRController struct { clusterCIDRStore cache.Store } -func newController() (*fake.Clientset, *clusterCIDRController) { +func newController(ctx context.Context) (*fake.Clientset, *clusterCIDRController) { client := fake.NewSimpleClientset() informerFactory := informers.NewSharedInformerFactory(client, controller.NoResyncPeriodFunc()) @@ -1657,7 +1663,7 @@ func newController() (*fake.Clientset, *clusterCIDRController) { testCIDRMap := make(map[string][]*cidrset.ClusterCIDR, 0) // Initialize the range allocator. 
- ra, _ := NewMultiCIDRRangeAllocator(client, nodeInformer, cccInformer, allocatorParams, nil, testCIDRMap) + ra, _ := NewMultiCIDRRangeAllocator(ctx, client, nodeInformer, cccInformer, allocatorParams, nil, testCIDRMap) cccController := ra.(*multiCIDRRangeAllocator) cccController.clusterCIDRSynced = alwaysReady @@ -1671,8 +1677,8 @@ func newController() (*fake.Clientset, *clusterCIDRController) { // Ensure default ClusterCIDR is created during bootstrap. func TestClusterCIDRDefault(t *testing.T) { defaultCCC := makeClusterCIDR(defaultClusterCIDRName, "192.168.0.0/16", "", 8, nil) - - client, _ := newController() + _, ctx := ktesting.NewTestContext(t) + client, _ := newController(ctx) createdCCC, err := client.NetworkingV1alpha1().ClusterCIDRs().Get(context.TODO(), defaultClusterCIDRName, metav1.GetOptions{}) assert.Nil(t, err, "Expected no error getting clustercidr objects") assert.Equal(t, defaultCCC.Spec, createdCCC.Spec) @@ -1752,11 +1758,11 @@ func TestSyncClusterCIDRCreate(t *testing.T) { wantErr: true, }, } - - client, cccController := newController() + _, ctx := ktesting.NewTestContext(t) + client, cccController := newController(ctx) for _, tc := range tests { cccController.clusterCIDRStore.Add(tc.ccc) - err := cccController.syncClusterCIDR(tc.ccc.Name) + err := cccController.syncClusterCIDR(ctx, tc.ccc.Name) if tc.wantErr { assert.Error(t, err) continue @@ -1773,30 +1779,32 @@ func TestSyncClusterCIDRCreate(t *testing.T) { // Ensure syncClusterCIDR for ClusterCIDR delete removes the ClusterCIDR. func TestSyncClusterCIDRDelete(t *testing.T) { - _, cccController := newController() + _, ctx := ktesting.NewTestContext(t) + _, cccController := newController(ctx) testCCC := makeClusterCIDR("testing-1", "10.1.0.0/16", "", 8, makeNodeSelector("foo", v1.NodeSelectorOpIn, []string{"bar"})) cccController.clusterCIDRStore.Add(testCCC) - err := cccController.syncClusterCIDR(testCCC.Name) + err := cccController.syncClusterCIDR(ctx, testCCC.Name) assert.NoError(t, err) deletionTimestamp := metav1.Now() testCCC.DeletionTimestamp = &deletionTimestamp cccController.clusterCIDRStore.Update(testCCC) - err = cccController.syncClusterCIDR(testCCC.Name) + err = cccController.syncClusterCIDR(ctx, testCCC.Name) assert.NoError(t, err) } // Ensure syncClusterCIDR for ClusterCIDR delete does not remove ClusterCIDR // if a node is associated with the ClusterCIDR. func TestSyncClusterCIDRDeleteWithNodesAssociated(t *testing.T) { - client, cccController := newController() + _, ctx := ktesting.NewTestContext(t) + client, cccController := newController(ctx) testCCC := makeClusterCIDR("testing-1", "10.1.0.0/16", "", 8, makeNodeSelector("foo", v1.NodeSelectorOpIn, []string{"bar"})) cccController.clusterCIDRStore.Add(testCCC) - err := cccController.syncClusterCIDR(testCCC.Name) + err := cccController.syncClusterCIDR(ctx, testCCC.Name) assert.NoError(t, err) // Mock the IPAM controller behavior associating node with ClusterCIDR. 
@@ -1810,7 +1818,7 @@ func TestSyncClusterCIDRDeleteWithNodesAssociated(t *testing.T) { deletionTimestamp := metav1.Now() createdCCC.DeletionTimestamp = &deletionTimestamp cccController.clusterCIDRStore.Update(createdCCC) - err = cccController.syncClusterCIDR(createdCCC.Name) + err = cccController.syncClusterCIDR(ctx, createdCCC.Name) assert.Error(t, err, fmt.Sprintf("ClusterCIDR %s marked as terminating, won't be deleted until all associated nodes are deleted", createdCCC.Name)) } diff --git a/pkg/controller/nodeipam/ipam/multicidrset/multi_cidr_set_test.go b/pkg/controller/nodeipam/ipam/multicidrset/multi_cidr_set_test.go index 34cbe6e8481..79af5337fb2 100644 --- a/pkg/controller/nodeipam/ipam/multicidrset/multi_cidr_set_test.go +++ b/pkg/controller/nodeipam/ipam/multicidrset/multi_cidr_set_test.go @@ -22,7 +22,7 @@ import ( "testing" "k8s.io/component-base/metrics/testutil" - "k8s.io/klog/v2" + "k8s.io/klog/v2/ktesting" utilnet "k8s.io/utils/net" ) @@ -568,6 +568,7 @@ func TestGetBitforCIDR(t *testing.T) { }, } + logger, _ := ktesting.NewTestContext(t) for _, tc := range cases { _, clusterCIDR, err := utilnet.ParseCIDRSloppy(tc.clusterCIDRStr) if err != nil { @@ -585,17 +586,17 @@ func TestGetBitforCIDR(t *testing.T) { got, err := cs.getIndexForCIDR(subnetCIDR) if err == nil && tc.expectErr { - klog.Errorf("expected error but got null for %v", tc.description) + logger.Error(nil, "Expected error but got null", "description", tc.description) continue } if err != nil && !tc.expectErr { - klog.Errorf("unexpected error: %v for %v", err, tc.description) + logger.Error(err, "Unexpected error", "description", tc.description) continue } if got != tc.expectedBit { - klog.Errorf("expected %v, but got %v for %v", tc.expectedBit, got, tc.description) + logger.Error(nil, "Unexpected value", "description", tc.description, "expected", tc.expectedBit, "got", got) } } } diff --git a/pkg/controller/nodeipam/ipam/range_allocator.go b/pkg/controller/nodeipam/ipam/range_allocator.go index 0562819949f..47bd392e234 100644 --- a/pkg/controller/nodeipam/ipam/range_allocator.go +++ b/pkg/controller/nodeipam/ipam/range_allocator.go @@ -17,6 +17,7 @@ limitations under the License. package ipam import ( + "context" "fmt" "net" "sync" @@ -66,9 +67,10 @@ type rangeAllocator struct { // Caller must always pass in a list of existing nodes so the new allocator. // Caller must ensure that ClusterCIDRs are semantically correct e.g (1 for non DualStack, 2 for DualStack etc..) // can initialize its CIDR map. NodeList is only nil in testing. -func NewCIDRRangeAllocator(client clientset.Interface, nodeInformer informers.NodeInformer, allocatorParams CIDRAllocatorParams, nodeList *v1.NodeList) (CIDRAllocator, error) { +func NewCIDRRangeAllocator(logger klog.Logger, client clientset.Interface, nodeInformer informers.NodeInformer, allocatorParams CIDRAllocatorParams, nodeList *v1.NodeList) (CIDRAllocator, error) { if client == nil { - klog.Fatalf("kubeClient is nil when starting NodeController") + logger.Error(nil, "kubeClient is nil when starting CIDRRangeAllocator") + klog.FlushAndExit(klog.ExitFlushTimeout, 1) } eventBroadcaster := record.NewBroadcaster() @@ -98,24 +100,24 @@ func NewCIDRRangeAllocator(client clientset.Interface, nodeInformer informers.No } if allocatorParams.ServiceCIDR != nil { - ra.filterOutServiceRange(allocatorParams.ServiceCIDR) + ra.filterOutServiceRange(logger, allocatorParams.ServiceCIDR) } else { - klog.Info("No Service CIDR provided. 
Skipping filtering out service addresses.") + logger.Info("No Service CIDR provided. Skipping filtering out service addresses") } if allocatorParams.SecondaryServiceCIDR != nil { - ra.filterOutServiceRange(allocatorParams.SecondaryServiceCIDR) + ra.filterOutServiceRange(logger, allocatorParams.SecondaryServiceCIDR) } else { - klog.Info("No Secondary Service CIDR provided. Skipping filtering out secondary service addresses.") + logger.Info("No Secondary Service CIDR provided. Skipping filtering out secondary service addresses") } if nodeList != nil { for _, node := range nodeList.Items { if len(node.Spec.PodCIDRs) == 0 { - klog.V(4).Infof("Node %v has no CIDR, ignoring", node.Name) + logger.V(4).Info("Node has no CIDR, ignoring", "node", klog.KObj(&node)) continue } - klog.V(4).Infof("Node %v has CIDR %s, occupying it in CIDR map", node.Name, node.Spec.PodCIDR) + logger.V(4).Info("Node has CIDR, occupying it in CIDR map", "node", klog.KObj(&node), "podCIDR", node.Spec.PodCIDR) if err := ra.occupyCIDRs(&node); err != nil { // This will happen if: // 1. We find garbage in the podCIDRs field. Retrying is useless. @@ -127,7 +129,9 @@ func NewCIDRRangeAllocator(client clientset.Interface, nodeInformer informers.No } nodeInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ - AddFunc: controllerutil.CreateAddNodeHandler(ra.AllocateOrOccupyCIDR), + AddFunc: controllerutil.CreateAddNodeHandler(func(node *v1.Node) error { + return ra.AllocateOrOccupyCIDR(logger, node) + }), UpdateFunc: controllerutil.CreateUpdateNodeHandler(func(_, newNode *v1.Node) error { // If the PodCIDRs list is not empty we either: // - already processed a Node that already had CIDRs after NC restarted @@ -149,52 +153,56 @@ func NewCIDRRangeAllocator(client clientset.Interface, nodeInformer informers.No // state is correct. // Restart of NC fixes the issue. if len(newNode.Spec.PodCIDRs) == 0 { - return ra.AllocateOrOccupyCIDR(newNode) + return ra.AllocateOrOccupyCIDR(logger, newNode) } return nil }), - DeleteFunc: controllerutil.CreateDeleteNodeHandler(ra.ReleaseCIDR), + DeleteFunc: controllerutil.CreateDeleteNodeHandler(func(node *v1.Node) error { + return ra.ReleaseCIDR(logger, node) + }), }) return ra, nil } -func (r *rangeAllocator) Run(stopCh <-chan struct{}) { +func (r *rangeAllocator) Run(ctx context.Context) { defer utilruntime.HandleCrash() // Start event processing pipeline. 
r.broadcaster.StartStructuredLogging(0) - klog.Infof("Sending events to api server.") + logger := klog.FromContext(ctx) + logger.Info("Sending events to api server") r.broadcaster.StartRecordingToSink(&v1core.EventSinkImpl{Interface: r.client.CoreV1().Events("")}) defer r.broadcaster.Shutdown() - klog.Infof("Starting range CIDR allocator") - defer klog.Infof("Shutting down range CIDR allocator") + logger.Info("Starting range CIDR allocator") + defer logger.Info("Shutting down range CIDR allocator") - if !cache.WaitForNamedCacheSync("cidrallocator", stopCh, r.nodesSynced) { + if !cache.WaitForNamedCacheSync("cidrallocator", ctx.Done(), r.nodesSynced) { return } for i := 0; i < cidrUpdateWorkers; i++ { - go r.worker(stopCh) + go r.worker(ctx) } - <-stopCh + <-ctx.Done() } -func (r *rangeAllocator) worker(stopChan <-chan struct{}) { +func (r *rangeAllocator) worker(ctx context.Context) { + logger := klog.FromContext(ctx) for { select { case workItem, ok := <-r.nodeCIDRUpdateChannel: if !ok { - klog.Warning("Channel nodeCIDRUpdateChannel was unexpectedly closed") + logger.Info("Channel nodeCIDRUpdateChannel was unexpectedly closed") return } - if err := r.updateCIDRsAllocation(workItem); err != nil { + if err := r.updateCIDRsAllocation(logger, workItem); err != nil { // Requeue the failed node for update again. r.nodeCIDRUpdateChannel <- workItem } - case <-stopChan: + case <-ctx.Done(): return } } @@ -244,12 +252,12 @@ func (r *rangeAllocator) occupyCIDRs(node *v1.Node) error { // WARNING: If you're adding any return calls or defer any more work from this // function you have to make sure to update nodesInProcessing properly with the // disposition of the node when the work is done. -func (r *rangeAllocator) AllocateOrOccupyCIDR(node *v1.Node) error { +func (r *rangeAllocator) AllocateOrOccupyCIDR(logger klog.Logger, node *v1.Node) error { if node == nil { return nil } if !r.insertNodeToProcessing(node.Name) { - klog.V(2).Infof("Node %v is already in a process of CIDR assignment.", node.Name) + logger.V(2).Info("Node is already in a process of CIDR assignment", "node", klog.KObj(node)) return nil } @@ -273,13 +281,13 @@ func (r *rangeAllocator) AllocateOrOccupyCIDR(node *v1.Node) error { } //queue the assignment - klog.V(4).Infof("Putting node %s with CIDR %v into the work queue", node.Name, allocated.allocatedCIDRs) + logger.V(4).Info("Putting node with CIDR into the work queue", "node", klog.KObj(node), "CIDRs", allocated.allocatedCIDRs) r.nodeCIDRUpdateChannel <- allocated return nil } // ReleaseCIDR marks node.podCIDRs[...] as unused in our tracked cidrSets -func (r *rangeAllocator) ReleaseCIDR(node *v1.Node) error { +func (r *rangeAllocator) ReleaseCIDR(logger klog.Logger, node *v1.Node) error { if node == nil || len(node.Spec.PodCIDRs) == 0 { return nil } @@ -297,7 +305,7 @@ func (r *rangeAllocator) ReleaseCIDR(node *v1.Node) error { return fmt.Errorf("node:%s has an allocated cidr: %v at index:%v that does not exist in cluster cidrs configuration", node.Name, cidr, idx) } - klog.V(4).Infof("release CIDR %s for node:%v", cidr, node.Name) + logger.V(4).Info("Release CIDR for node", "CIDR", cidr, "node", klog.KObj(node)) if err = r.cidrSets[idx].Release(podCIDR); err != nil { return fmt.Errorf("error when releasing CIDR %v: %v", cidr, err) } @@ -307,7 +315,7 @@ func (r *rangeAllocator) ReleaseCIDR(node *v1.Node) error { // Marks all CIDRs with subNetMaskSize that belongs to serviceCIDR as used across all cidrs // so that they won't be assignable. 
-func (r *rangeAllocator) filterOutServiceRange(serviceCIDR *net.IPNet) { +func (r *rangeAllocator) filterOutServiceRange(logger klog.Logger, serviceCIDR *net.IPNet) { // Checks if service CIDR has a nonempty intersection with cluster // CIDR. It is the case if either clusterCIDR contains serviceCIDR with // clusterCIDR's Mask applied (this means that clusterCIDR contains @@ -321,20 +329,20 @@ func (r *rangeAllocator) filterOutServiceRange(serviceCIDR *net.IPNet) { // at this point, len(cidrSet) == len(clusterCidr) if err := r.cidrSets[idx].Occupy(serviceCIDR); err != nil { - klog.Errorf("Error filtering out service cidr out cluster cidr:%v (index:%v) %v: %v", cidr, idx, serviceCIDR, err) + logger.Error(err, "Error filtering out service cidr from cluster cidr", "CIDR", cidr, "index", idx, "serviceCIDR", serviceCIDR) } } } // updateCIDRsAllocation assigns CIDR to Node and sends an update to the API server. -func (r *rangeAllocator) updateCIDRsAllocation(data nodeReservedCIDRs) error { +func (r *rangeAllocator) updateCIDRsAllocation(logger klog.Logger, data nodeReservedCIDRs) error { var err error var node *v1.Node defer r.removeNodeFromProcessing(data.nodeName) cidrsString := ipnetToStringList(data.allocatedCIDRs) node, err = r.nodeLister.Get(data.nodeName) if err != nil { - klog.Errorf("Failed while getting node %v for updating Node.Spec.PodCIDRs: %v", data.nodeName, err) + logger.Error(err, "Failed while getting node for updating Node.Spec.PodCIDRs", "node", klog.KRef("", data.nodeName)) return err } @@ -350,17 +358,17 @@ func (r *rangeAllocator) updateCIDRsAllocation(data nodeReservedCIDRs) error { } } if match { - klog.V(4).Infof("Node %v already has allocated CIDR %v. It matches the proposed one.", node.Name, data.allocatedCIDRs) + logger.V(4).Info("Node already has allocated CIDR. It matches the proposed one", "node", klog.KObj(node), "CIDRs", data.allocatedCIDRs) return nil } } // node has cidrs, release the reserved if len(node.Spec.PodCIDRs) != 0 { - klog.Errorf("Node %v already has a CIDR allocated %v. Releasing the new one.", node.Name, node.Spec.PodCIDRs) + logger.Error(nil, "Node already has a CIDR allocated. Releasing the new one", "node", klog.KObj(node), "podCIDRs", node.Spec.PodCIDRs) for idx, cidr := range data.allocatedCIDRs { if releaseErr := r.cidrSets[idx].Release(cidr); releaseErr != nil { - klog.Errorf("Error when releasing CIDR idx:%v value: %v err:%v", idx, cidr, releaseErr) + logger.Error(releaseErr, "Error when releasing CIDR", "index", idx, "CIDR", cidr) } } return nil @@ -369,21 +377,21 @@ func (r *rangeAllocator) updateCIDRsAllocation(data nodeReservedCIDRs) error { // If we reached here, it means that the node has no CIDR currently assigned. So we set it. for i := 0; i < cidrUpdateRetries; i++ { if err = nodeutil.PatchNodeCIDRs(r.client, types.NodeName(node.Name), cidrsString); err == nil { - klog.Infof("Set node %v PodCIDR to %v", node.Name, cidrsString) + logger.Info("Set node PodCIDR", "node", klog.KObj(node), "podCIDRs", cidrsString) return nil } } // failed release back to the pool - klog.Errorf("Failed to update node %v PodCIDR to %v after multiple attempts: %v", node.Name, cidrsString, err) + logger.Error(err, "Failed to update node PodCIDR after multiple attempts", "node", klog.KObj(node), "podCIDRs", cidrsString) controllerutil.RecordNodeStatusChange(r.recorder, node, "CIDRAssignmentFailed") // We accept the fact that we may leak CIDRs here. This is safer than releasing // them in case when we don't know if request went through.
// NodeController restart will return all falsely allocated CIDRs to the pool. if !apierrors.IsServerTimeout(err) { - klog.Errorf("CIDR assignment for node %v failed: %v. Releasing allocated CIDR", node.Name, err) + logger.Error(err, "CIDR assignment for node failed. Releasing allocated CIDR", "node", klog.KObj(node)) for idx, cidr := range data.allocatedCIDRs { if releaseErr := r.cidrSets[idx].Release(cidr); releaseErr != nil { - klog.Errorf("Error releasing allocated CIDR for node %v: %v", node.Name, releaseErr) + logger.Error(releaseErr, "Error releasing allocated CIDR for node", "node", klog.KObj(node)) } } } diff --git a/pkg/controller/nodeipam/ipam/range_allocator_test.go b/pkg/controller/nodeipam/ipam/range_allocator_test.go index b0dd1c32f47..bc3908781ae 100644 --- a/pkg/controller/nodeipam/ipam/range_allocator_test.go +++ b/pkg/controller/nodeipam/ipam/range_allocator_test.go @@ -26,6 +26,7 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/util/wait" "k8s.io/client-go/kubernetes/fake" + "k8s.io/klog/v2/ktesting" "k8s.io/kubernetes/pkg/controller/nodeipam/ipam/test" "k8s.io/kubernetes/pkg/controller/testutil" netutils "k8s.io/utils/net" @@ -274,12 +275,13 @@ func TestOccupyPreExistingCIDR(t *testing.T) { } // test function + logger, _ := ktesting.NewTestContext(t) for _, tc := range testCases { t.Run(tc.description, func(t *testing.T) { // Initialize the range allocator. fakeNodeInformer := test.FakeNodeInformer(tc.fakeNodeHandler) nodeList, _ := tc.fakeNodeHandler.List(context.TODO(), metav1.ListOptions{}) - _, err := NewCIDRRangeAllocator(tc.fakeNodeHandler, fakeNodeInformer, tc.allocatorParams, nodeList) + _, err := NewCIDRRangeAllocator(logger, tc.fakeNodeHandler, fakeNodeInformer, tc.allocatorParams, nodeList) if err == nil && tc.ctrlCreateFail { t.Fatalf("creating range allocator was expected to fail, but it did not") } @@ -508,11 +510,12 @@ func TestAllocateOrOccupyCIDRSuccess(t *testing.T) { } // test function + logger, ctx := ktesting.NewTestContext(t) testFunc := func(tc testCase) { fakeNodeInformer := test.FakeNodeInformer(tc.fakeNodeHandler) nodeList, _ := tc.fakeNodeHandler.List(context.TODO(), metav1.ListOptions{}) // Initialize the range allocator. - allocator, err := NewCIDRRangeAllocator(tc.fakeNodeHandler, fakeNodeInformer, tc.allocatorParams, nodeList) + allocator, err := NewCIDRRangeAllocator(logger, tc.fakeNodeHandler, fakeNodeInformer, tc.allocatorParams, nodeList) if err != nil { t.Errorf("%v: failed to create CIDRRangeAllocator with error %v", tc.description, err) return @@ -524,7 +527,7 @@ func TestAllocateOrOccupyCIDRSuccess(t *testing.T) { } rangeAllocator.nodesSynced = test.AlwaysReady rangeAllocator.recorder = testutil.NewFakeRecorder() - go allocator.Run(wait.NeverStop) + go allocator.Run(ctx) // this is a bit of white box testing // pre allocate the cidrs as per the test @@ -545,7 +548,7 @@ func TestAllocateOrOccupyCIDRSuccess(t *testing.T) { if node.Spec.PodCIDRs == nil { updateCount++ } - if err := allocator.AllocateOrOccupyCIDR(node); err != nil { + if err := allocator.AllocateOrOccupyCIDR(logger, node); err != nil { t.Errorf("%v: unexpected error in AllocateOrOccupyCIDR: %v", tc.description, err) } } @@ -608,10 +611,10 @@ func TestAllocateOrOccupyCIDRFailure(t *testing.T) { }, }, } - + logger, ctx := ktesting.NewTestContext(t) testFunc := func(tc testCase) { // Initialize the range allocator. 
- allocator, err := NewCIDRRangeAllocator(tc.fakeNodeHandler, test.FakeNodeInformer(tc.fakeNodeHandler), tc.allocatorParams, nil) + allocator, err := NewCIDRRangeAllocator(logger, tc.fakeNodeHandler, test.FakeNodeInformer(tc.fakeNodeHandler), tc.allocatorParams, nil) if err != nil { t.Logf("%v: failed to create CIDRRangeAllocator with error %v", tc.description, err) } @@ -622,7 +625,7 @@ func TestAllocateOrOccupyCIDRFailure(t *testing.T) { } rangeAllocator.nodesSynced = test.AlwaysReady rangeAllocator.recorder = testutil.NewFakeRecorder() - go allocator.Run(wait.NeverStop) + go allocator.Run(ctx) // this is a bit of white box testing for setIdx, allocatedList := range tc.allocatedCIDRs { @@ -637,7 +640,7 @@ func TestAllocateOrOccupyCIDRFailure(t *testing.T) { } } } - if err := allocator.AllocateOrOccupyCIDR(tc.fakeNodeHandler.Existing[0]); err == nil { + if err := allocator.AllocateOrOccupyCIDR(logger, tc.fakeNodeHandler.Existing[0]); err == nil { t.Errorf("%v: unexpected success in AllocateOrOccupyCIDR: %v", tc.description, err) } // We don't expect any updates, so just sleep for some time @@ -753,10 +756,10 @@ func TestReleaseCIDRSuccess(t *testing.T) { }, }, } - + logger, ctx := ktesting.NewTestContext(t) testFunc := func(tc releaseTestCase) { // Initialize the range allocator. - allocator, _ := NewCIDRRangeAllocator(tc.fakeNodeHandler, test.FakeNodeInformer(tc.fakeNodeHandler), tc.allocatorParams, nil) + allocator, _ := NewCIDRRangeAllocator(logger, tc.fakeNodeHandler, test.FakeNodeInformer(tc.fakeNodeHandler), tc.allocatorParams, nil) rangeAllocator, ok := allocator.(*rangeAllocator) if !ok { t.Logf("%v: found non-default implementation of CIDRAllocator, skipping white-box test...", tc.description) @@ -764,7 +767,7 @@ func TestReleaseCIDRSuccess(t *testing.T) { } rangeAllocator.nodesSynced = test.AlwaysReady rangeAllocator.recorder = testutil.NewFakeRecorder() - go allocator.Run(wait.NeverStop) + go allocator.Run(ctx) // this is a bit of white box testing for setIdx, allocatedList := range tc.allocatedCIDRs { @@ -780,7 +783,7 @@ func TestReleaseCIDRSuccess(t *testing.T) { } } - err := allocator.AllocateOrOccupyCIDR(tc.fakeNodeHandler.Existing[0]) + err := allocator.AllocateOrOccupyCIDR(logger, tc.fakeNodeHandler.Existing[0]) if len(tc.expectedAllocatedCIDRFirstRound) != 0 { if err != nil { t.Fatalf("%v: unexpected error in AllocateOrOccupyCIDR: %v", tc.description, err) @@ -805,12 +808,12 @@ func TestReleaseCIDRSuccess(t *testing.T) { }, } nodeToRelease.Spec.PodCIDRs = cidrToRelease - err = allocator.ReleaseCIDR(&nodeToRelease) + err = allocator.ReleaseCIDR(logger, &nodeToRelease) if err != nil { t.Fatalf("%v: unexpected error in ReleaseCIDR: %v", tc.description, err) } } - if err = allocator.AllocateOrOccupyCIDR(tc.fakeNodeHandler.Existing[0]); err != nil { + if err = allocator.AllocateOrOccupyCIDR(logger, tc.fakeNodeHandler.Existing[0]); err != nil { t.Fatalf("%v: unexpected error in AllocateOrOccupyCIDR: %v", tc.description, err) } if err := test.WaitForUpdatedNodeWithTimeout(tc.fakeNodeHandler, 1, wait.ForeverTestTimeout); err != nil { diff --git a/pkg/controller/nodeipam/ipam/sync/sync.go b/pkg/controller/nodeipam/ipam/sync/sync.go index 2e736ab20d0..a8f7f6e370e 100644 --- a/pkg/controller/nodeipam/ipam/sync/sync.go +++ b/pkg/controller/nodeipam/ipam/sync/sync.go @@ -120,8 +120,8 @@ func New(c controller, cloudAlias cloudAlias, kubeAPI kubeAPI, mode NodeSyncMode // Loop runs the sync loop for a given node. done is an optional channel that // is closed when the Loop() returns. 
-func (sync *NodeSync) Loop(done chan struct{}) { - klog.V(2).Infof("Starting sync loop for node %q", sync.nodeName) +func (sync *NodeSync) Loop(logger klog.Logger, done chan struct{}) { + logger.V(2).Info("Starting sync loop", "node", klog.KRef("", sync.nodeName)) defer func() { if done != nil { @@ -131,27 +131,27 @@ func (sync *NodeSync) Loop(done chan struct{}) { timeout := sync.c.ResyncTimeout() delayTimer := time.NewTimer(timeout) - klog.V(4).Infof("Resync node %q in %v", sync.nodeName, timeout) + logger.V(4).Info("Try to resync node later", "node", klog.KRef("", sync.nodeName), "resyncTime", timeout) for { select { case op, more := <-sync.opChan: if !more { - klog.V(2).Infof("Stopping sync loop") + logger.V(2).Info("Stopping sync loop") return } - sync.c.ReportResult(op.run(sync)) + sync.c.ReportResult(op.run(logger, sync)) if !delayTimer.Stop() { <-delayTimer.C } case <-delayTimer.C: - klog.V(4).Infof("Running resync for node %q", sync.nodeName) - sync.c.ReportResult((&updateOp{}).run(sync)) + logger.V(4).Info("Running resync", "node", klog.KRef("", sync.nodeName)) + sync.c.ReportResult((&updateOp{}).run(logger, sync)) } timeout := sync.c.ResyncTimeout() delayTimer.Reset(timeout) - klog.V(4).Infof("Resync node %q in %v", sync.nodeName, timeout) + logger.V(4).Info("Try to resync node later", "node", klog.KRef("", sync.nodeName), "resyncTime", timeout) } } @@ -175,7 +175,7 @@ func (sync *NodeSync) Delete(node *v1.Node) { // syncOp is the interface for generic sync operation. type syncOp interface { // run the requested sync operation. - run(sync *NodeSync) error + run(logger klog.Logger, sync *NodeSync) error } // updateOp handles creation and updates of a node. @@ -190,16 +190,16 @@ func (op *updateOp) String() string { return fmt.Sprintf("updateOp(%q,%v)", op.node.Name, op.node.Spec.PodCIDR) } -func (op *updateOp) run(sync *NodeSync) error { - klog.V(3).Infof("Running updateOp %+v", op) +func (op *updateOp) run(logger klog.Logger, sync *NodeSync) error { + logger.V(3).Info("Running updateOp", "updateOp", op) ctx := context.Background() if op.node == nil { - klog.V(3).Infof("Getting node spec for %q", sync.nodeName) + logger.V(3).Info("Getting node spec", "node", klog.KRef("", sync.nodeName)) node, err := sync.kubeAPI.Node(ctx, sync.nodeName) if err != nil { - klog.Errorf("Error getting node %q spec: %v", sync.nodeName, err) + logger.Error(err, "Error getting node spec", "node", klog.KRef("", sync.nodeName)) return err } op.node = node @@ -207,7 +207,7 @@ func (op *updateOp) run(sync *NodeSync) error { aliasRange, err := sync.cloudAlias.Alias(ctx, op.node) if err != nil { - klog.Errorf("Error getting cloud alias for node %q: %v", sync.nodeName, err) + logger.Error(err, "Error getting cloud alias for node", "node", klog.KRef("", sync.nodeName)) return err } @@ -229,14 +229,13 @@ func (op *updateOp) run(sync *NodeSync) error { // match.
func (op *updateOp) validateRange(ctx context.Context, sync *NodeSync, node *v1.Node, aliasRange *net.IPNet) error { if node.Spec.PodCIDR != aliasRange.String() { - klog.Errorf("Inconsistency detected between node PodCIDR and node alias (%v != %v)", - node.Spec.PodCIDR, aliasRange) + klog.FromContext(ctx).Error(nil, "Inconsistency detected between node PodCIDR and node alias", "podCIDR", node.Spec.PodCIDR, "alias", aliasRange) sync.kubeAPI.EmitNodeWarningEvent(node.Name, MismatchEvent, "Node.Spec.PodCIDR != cloud alias (%v != %v)", node.Spec.PodCIDR, aliasRange) // User intervention is required in this case, as this is most likely due // to the user mucking around with their VM aliases on the side. } else { - klog.V(4).Infof("Node %q CIDR range %v is matches cloud assignment", node.Name, node.Spec.PodCIDR) + klog.FromContext(ctx).V(4).Info("Node CIDR range matches cloud assignment", "node", klog.KObj(node), "podCIDR", node.Spec.PodCIDR) } return nil } @@ -249,27 +248,27 @@ func (op *updateOp) updateNodeFromAlias(ctx context.Context, sync *NodeSync, nod "Cannot sync from cloud in mode %q", sync.mode) return fmt.Errorf("cannot sync from cloud in mode %q", sync.mode) } - - klog.V(2).Infof("Updating node spec with alias range, node.PodCIDR = %v", aliasRange) + logger := klog.FromContext(ctx) + logger.V(2).Info("Updating node spec with alias range", "podCIDR", aliasRange) if err := sync.set.Occupy(aliasRange); err != nil { - klog.Errorf("Error occupying range %v for node %v", aliasRange, sync.nodeName) + logger.Error(nil, "Error occupying range for node", "node", klog.KRef("", sync.nodeName), "alias", aliasRange) return err } if err := sync.kubeAPI.UpdateNodePodCIDR(ctx, node, aliasRange); err != nil { - klog.Errorf("Could not update node %q PodCIDR to %v: %v", node.Name, aliasRange, err) + logger.Error(err, "Could not update node PodCIDR", "node", klog.KObj(node), "podCIDR", aliasRange) return err } - klog.V(2).Infof("Node %q PodCIDR set to %v", node.Name, aliasRange) + logger.V(2).Info("Node PodCIDR updated", "node", klog.KObj(node), "podCIDR", aliasRange) if err := sync.kubeAPI.UpdateNodeNetworkUnavailable(node.Name, false); err != nil { - klog.Errorf("Could not update node NetworkUnavailable status to false: %v", err) + logger.Error(err, "Could not update node NetworkUnavailable status to false") return err } - klog.V(2).Infof("Updated node %q PodCIDR from cloud alias %v", node.Name, aliasRange) + logger.V(2).Info("Updated node PodCIDR from cloud alias", "node", klog.KObj(node), "alias", aliasRange) return nil } @@ -283,29 +282,29 @@ func (op *updateOp) updateAliasFromNode(ctx context.Context, sync *NodeSync, nod } _, aliasRange, err := netutils.ParseCIDRSloppy(node.Spec.PodCIDR) + + logger := klog.FromContext(ctx) if err != nil { - klog.Errorf("Could not parse PodCIDR (%q) for node %q: %v", - node.Spec.PodCIDR, node.Name, err) + logger.Error(err, "Could not parse PodCIDR for node", "node", klog.KObj(node), "podCIDR", node.Spec.PodCIDR) return err } if err := sync.set.Occupy(aliasRange); err != nil { - klog.Errorf("Error occupying range %v for node %v", aliasRange, sync.nodeName) + logger.Error(nil, "Error occupying range for node", "node", klog.KRef("", sync.nodeName), "alias", aliasRange) return err } if err := sync.cloudAlias.AddAlias(ctx, node, aliasRange); err != nil { - klog.Errorf("Could not add alias %v for node %q: %v", aliasRange, node.Name, err) + logger.Error(err, "Could not add alias for node", "node", klog.KObj(node), "alias", aliasRange) return err } if err := 
sync.kubeAPI.UpdateNodeNetworkUnavailable(node.Name, false); err != nil { - klog.Errorf("Could not update node NetworkUnavailable status to false: %v", err) + logger.Error(err, "Could not update node NetworkUnavailable status to false") return err } - klog.V(2).Infof("Updated node %q cloud alias with node spec, node.PodCIDR = %v", - node.Name, node.Spec.PodCIDR) + logger.V(2).Info("Updated node cloud alias with node spec", "node", klog.KObj(node), "podCIDR", node.Spec.PodCIDR) return nil } @@ -326,22 +325,23 @@ func (op *updateOp) allocateRange(ctx context.Context, sync *NodeSync, node *v1. // If addAlias returns a hard error, cidrRange will be leaked as there // is no durable record of the range. The missing space will be // recovered on the next restart of the controller. + logger := klog.FromContext(ctx) if err := sync.cloudAlias.AddAlias(ctx, node, cidrRange); err != nil { - klog.Errorf("Could not add alias %v for node %q: %v", cidrRange, node.Name, err) + logger.Error(err, "Could not add alias for node", "node", klog.KObj(node), "alias", cidrRange) return err } if err := sync.kubeAPI.UpdateNodePodCIDR(ctx, node, cidrRange); err != nil { - klog.Errorf("Could not update node %q PodCIDR to %v: %v", node.Name, cidrRange, err) + logger.Error(err, "Could not update node PodCIDR", "node", klog.KObj(node), "podCIDR", cidrRange) return err } if err := sync.kubeAPI.UpdateNodeNetworkUnavailable(node.Name, false); err != nil { - klog.Errorf("Could not update node NetworkUnavailable status to false: %v", err) + logger.Error(err, "Could not update node NetworkUnavailable status to false") return err } - klog.V(2).Infof("Allocated PodCIDR %v for node %q", cidrRange, node.Name) + logger.V(2).Info("Allocated PodCIDR for node", "node", klog.KObj(node), "podCIDR", cidrRange) return nil } @@ -358,25 +358,23 @@ func (op *deleteOp) String() string { return fmt.Sprintf("deleteOp(%q,%v)", op.node.Name, op.node.Spec.PodCIDR) } -func (op *deleteOp) run(sync *NodeSync) error { - klog.V(3).Infof("Running deleteOp %+v", op) +func (op *deleteOp) run(logger klog.Logger, sync *NodeSync) error { + logger.V(3).Info("Running deleteOp", "deleteOp", op) if op.node.Spec.PodCIDR == "" { - klog.V(2).Infof("Node %q was deleted, node had no PodCIDR range assigned", op.node.Name) + logger.V(2).Info("Node was deleted, node had no PodCIDR range assigned", "node", klog.KObj(op.node)) return nil } _, cidrRange, err := netutils.ParseCIDRSloppy(op.node.Spec.PodCIDR) if err != nil { - klog.Errorf("Deleted node %q has an invalid podCIDR %q: %v", - op.node.Name, op.node.Spec.PodCIDR, err) + logger.Error(err, "Deleted node has an invalid podCIDR", "node", klog.KObj(op.node), "podCIDR", op.node.Spec.PodCIDR) sync.kubeAPI.EmitNodeWarningEvent(op.node.Name, InvalidPodCIDR, "Node %q has an invalid PodCIDR: %q", op.node.Name, op.node.Spec.PodCIDR) return nil } sync.set.Release(cidrRange) - klog.V(2).Infof("Node %q was deleted, releasing CIDR range %v", - op.node.Name, op.node.Spec.PodCIDR) + logger.V(2).Info("Node was deleted, releasing CIDR range", "node", klog.KObj(op.node), "podCIDR", op.node.Spec.PodCIDR) return nil } diff --git a/pkg/controller/nodeipam/ipam/sync/sync_test.go b/pkg/controller/nodeipam/ipam/sync/sync_test.go index df1ddd192fc..c355f95ea96 100644 --- a/pkg/controller/nodeipam/ipam/sync/sync_test.go +++ b/pkg/controller/nodeipam/ipam/sync/sync_test.go @@ -26,6 +26,7 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/klog/v2" + "k8s.io/klog/v2/ktesting" "k8s.io/kubernetes/pkg/controller/nodeipam/ipam/cidrset" 
"k8s.io/kubernetes/pkg/controller/nodeipam/ipam/test" netutils "k8s.io/utils/net" @@ -200,7 +201,8 @@ func TestNodeSyncUpdate(t *testing.T) { doneChan := make(chan struct{}) // Do a single step of the loop. - go sync.Loop(doneChan) + logger, _ := ktesting.NewTestContext(t) + go sync.Loop(logger, doneChan) sync.Update(tc.node) close(sync.opChan) <-doneChan @@ -230,8 +232,8 @@ func TestNodeSyncResync(t *testing.T) { cidr, _ := cidrset.NewCIDRSet(clusterCIDRRange, 24) sync := New(fake, fake, fake, SyncFromCluster, "node1", cidr) doneChan := make(chan struct{}) - - go sync.Loop(doneChan) + logger, _ := ktesting.NewTestContext(t) + go sync.Loop(logger, doneChan) <-fake.reportChan close(sync.opChan) // Unblock loop(). @@ -275,7 +277,8 @@ func TestNodeSyncDelete(t *testing.T) { doneChan := make(chan struct{}) // Do a single step of the loop. - go sync.Loop(doneChan) + logger, _ := ktesting.NewTestContext(t) + go sync.Loop(logger, doneChan) sync.Delete(tc.node) <-doneChan tc.fake.dumpTrace() diff --git a/pkg/controller/nodeipam/legacyprovider.go b/pkg/controller/nodeipam/legacyprovider.go index eb0dd67e458..8ad692bd5a8 100644 --- a/pkg/controller/nodeipam/legacyprovider.go +++ b/pkg/controller/nodeipam/legacyprovider.go @@ -32,6 +32,7 @@ import ( ) func createLegacyIPAM( + logger klog.Logger, ic *Controller, nodeInformer coreinformers.NodeInformer, cloud cloudprovider.Interface, @@ -58,14 +59,16 @@ func createLegacyIPAM( cidr = clusterCIDRs[0] } if len(clusterCIDRs) > 1 { - klog.Warningf("Multiple cidrs were configured with FromCluster or FromCloud. cidrs except first one were discarded") + logger.Info("Multiple cidrs were configured with FromCluster or FromCloud. cidrs except first one were discarded") } ipamc, err := ipam.NewController(cfg, kubeClient, cloud, cidr, serviceCIDR, nodeCIDRMaskSizes[0]) if err != nil { - klog.Fatalf("Error creating ipam controller: %v", err) + logger.Error(err, "Error creating ipam controller") + klog.FlushAndExit(klog.ExitFlushTimeout, 1) } - if err := ipamc.Start(nodeInformer); err != nil { - klog.Fatalf("Error trying to Init(): %v", err) + if err := ipamc.Start(logger, nodeInformer); err != nil { + logger.Error(err, "Error trying to Init()") + klog.FlushAndExit(klog.ExitFlushTimeout, 1) } return ipamc } diff --git a/pkg/controller/nodeipam/node_ipam_controller.go b/pkg/controller/nodeipam/node_ipam_controller.go index 813aeab1a7f..55b91b9d6fd 100644 --- a/pkg/controller/nodeipam/node_ipam_controller.go +++ b/pkg/controller/nodeipam/node_ipam_controller.go @@ -17,6 +17,7 @@ limitations under the License. package nodeipam import ( + "context" "net" "time" @@ -50,7 +51,7 @@ const ( // legacy mode. It is needed to ensure correct building for // both provider-specific and providerless environments. type ipamController interface { - Run(<-chan struct{}) + Run(ctx context.Context) } // Controller is the controller that manages node ipam state. @@ -79,6 +80,7 @@ type Controller struct { // podCIDRs it has already allocated to nodes. Since we don't allow podCIDR changes // currently, this should be handled as a fatal error. 
func NewNodeIpamController( + ctx context.Context, nodeInformer coreinformers.NodeInformer, clusterCIDRInformer networkinginformers.ClusterCIDRInformer, cloud cloudprovider.Interface, @@ -89,20 +91,24 @@ func NewNodeIpamController( nodeCIDRMaskSizes []int, allocatorType ipam.CIDRAllocatorType) (*Controller, error) { + logger := klog.FromContext(ctx) if kubeClient == nil { - klog.Fatalf("kubeClient is nil when starting Controller") + logger.Error(nil, "kubeClient is nil when starting Controller") + klog.FlushAndExit(klog.ExitFlushTimeout, 1) } // Cloud CIDR allocator does not rely on clusterCIDR or nodeCIDRMaskSize for allocation. if allocatorType != ipam.CloudAllocatorType { if len(clusterCIDRs) == 0 { - klog.Fatal("Controller: Must specify --cluster-cidr if --allocate-node-cidrs is set") + logger.Error(nil, "Controller: Must specify --cluster-cidr if --allocate-node-cidrs is set") + klog.FlushAndExit(klog.ExitFlushTimeout, 1) } for idx, cidr := range clusterCIDRs { mask := cidr.Mask if maskSize, _ := mask.Size(); maskSize > nodeCIDRMaskSizes[idx] { - klog.Fatal("Controller: Invalid --cluster-cidr, mask size of cluster CIDR must be less than or equal to --node-cidr-mask-size configured for CIDR family") + logger.Error(nil, "Controller: Invalid --cluster-cidr, mask size of cluster CIDR must be less than or equal to --node-cidr-mask-size configured for CIDR family") + klog.FlushAndExit(klog.ExitFlushTimeout, 1) } } } @@ -120,7 +126,7 @@ func NewNodeIpamController( // TODO: Abstract this check into a generic controller manager should run method. if ic.allocatorType == ipam.IPAMFromClusterAllocatorType || ic.allocatorType == ipam.IPAMFromCloudAllocatorType { - ic.legacyIPAM = createLegacyIPAM(ic, nodeInformer, cloud, kubeClient, clusterCIDRs, serviceCIDR, nodeCIDRMaskSizes) + ic.legacyIPAM = createLegacyIPAM(logger, ic, nodeInformer, cloud, kubeClient, clusterCIDRs, serviceCIDR, nodeCIDRMaskSizes) } else { var err error @@ -131,7 +137,7 @@ func NewNodeIpamController( NodeCIDRMaskSizes: nodeCIDRMaskSizes, } - ic.cidrAllocator, err = ipam.New(kubeClient, cloud, nodeInformer, clusterCIDRInformer, ic.allocatorType, allocatorParams) + ic.cidrAllocator, err = ipam.New(ctx, kubeClient, cloud, nodeInformer, clusterCIDRInformer, ic.allocatorType, allocatorParams) if err != nil { return nil, err } @@ -144,33 +150,32 @@ func NewNodeIpamController( } // Run starts an asynchronous loop that monitors the status of cluster nodes. -func (nc *Controller) Run(stopCh <-chan struct{}) { +func (nc *Controller) Run(ctx context.Context) { defer utilruntime.HandleCrash() // Start event processing pipeline. 
nc.eventBroadcaster.StartStructuredLogging(0) nc.eventBroadcaster.StartRecordingToSink(&v1core.EventSinkImpl{Interface: nc.kubeClient.CoreV1().Events("")}) defer nc.eventBroadcaster.Shutdown() + klog.FromContext(ctx).Info("Starting ipam controller") + defer klog.FromContext(ctx).Info("Shutting down ipam controller") - klog.Infof("Starting ipam controller") - defer klog.Infof("Shutting down ipam controller") - - if !cache.WaitForNamedCacheSync("node", stopCh, nc.nodeInformerSynced) { + if !cache.WaitForNamedCacheSync("node", ctx.Done(), nc.nodeInformerSynced) { return } if nc.allocatorType == ipam.IPAMFromClusterAllocatorType || nc.allocatorType == ipam.IPAMFromCloudAllocatorType { - go nc.legacyIPAM.Run(stopCh) + go nc.legacyIPAM.Run(ctx) } else { - go nc.cidrAllocator.Run(stopCh) + go nc.cidrAllocator.Run(ctx) } - <-stopCh + <-ctx.Done() } // RunWithMetrics is a wrapper for Run that also tracks starting and stopping of the nodeipam controller with additional metric -func (nc *Controller) RunWithMetrics(stopCh <-chan struct{}, controllerManagerMetrics *controllersmetrics.ControllerManagerMetrics) { +func (nc *Controller) RunWithMetrics(ctx context.Context, controllerManagerMetrics *controllersmetrics.ControllerManagerMetrics) { controllerManagerMetrics.ControllerStarted("nodeipam") defer controllerManagerMetrics.ControllerStopped("nodeipam") - nc.Run(stopCh) + nc.Run(ctx) } diff --git a/pkg/controller/nodeipam/node_ipam_controller_test.go b/pkg/controller/nodeipam/node_ipam_controller_test.go index ad4b433789f..4bdeeeee28b 100644 --- a/pkg/controller/nodeipam/node_ipam_controller_test.go +++ b/pkg/controller/nodeipam/node_ipam_controller_test.go @@ -20,6 +20,7 @@ limitations under the License. package nodeipam import ( + "context" "net" "os" "os/exec" @@ -30,6 +31,7 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/informers" "k8s.io/client-go/kubernetes/fake" + "k8s.io/klog/v2/ktesting" "k8s.io/kubernetes/pkg/controller" "k8s.io/kubernetes/pkg/controller/nodeipam/ipam" "k8s.io/kubernetes/pkg/controller/testutil" @@ -37,7 +39,7 @@ import ( netutils "k8s.io/utils/net" ) -func newTestNodeIpamController(clusterCIDR []*net.IPNet, serviceCIDR *net.IPNet, secondaryServiceCIDR *net.IPNet, nodeCIDRMaskSizes []int, allocatorType ipam.CIDRAllocatorType) (*Controller, error) { +func newTestNodeIpamController(ctx context.Context, clusterCIDR []*net.IPNet, serviceCIDR *net.IPNet, secondaryServiceCIDR *net.IPNet, nodeCIDRMaskSizes []int, allocatorType ipam.CIDRAllocatorType) (*Controller, error) { clientSet := fake.NewSimpleClientset() fakeNodeHandler := &testutil.FakeNodeHandler{ Existing: []*v1.Node{ @@ -56,6 +58,7 @@ func newTestNodeIpamController(clusterCIDR []*net.IPNet, serviceCIDR *net.IPNet, fakeGCE := gce.NewFakeGCECloud(gce.DefaultTestClusterValues()) return NewNodeIpamController( + ctx, fakeNodeInformer, fakeClusterCIDRInformer, fakeGCE, clientSet, clusterCIDR, serviceCIDR, secondaryServiceCIDR, nodeCIDRMaskSizes, allocatorType, ) @@ -92,6 +95,7 @@ func TestNewNodeIpamControllerWithCIDRMasks(t *testing.T) { {"invalid_serviceCIDR_contains_clusterCIDR", "10.0.0.0/23", "10.0.0.0/21", emptyServiceCIDR, []int{24}, ipam.IPAMFromClusterAllocatorType, true}, {"invalid_CIDR_mask_size", "10.0.0.0/24,2000::/64", "10.1.0.0/21", emptyServiceCIDR, []int{24, 48}, ipam.IPAMFromClusterAllocatorType, true}, } { + _, ctx := ktesting.NewTestContext(t) t.Run(tc.desc, func(t *testing.T) { clusterCidrs, _ := netutils.ParseCIDRs(strings.Split(tc.clusterCIDR, ",")) _, serviceCIDRIpNet, _ := 
netutils.ParseCIDRSloppy(tc.serviceCIDR) @@ -99,7 +103,7 @@ func TestNewNodeIpamControllerWithCIDRMasks(t *testing.T) { if os.Getenv("EXIT_ON_FATAL") == "1" { // This is the subprocess which runs the actual code. - newTestNodeIpamController(clusterCidrs, serviceCIDRIpNet, secondaryServiceCIDRIpNet, tc.maskSize, tc.allocatorType) + newTestNodeIpamController(ctx, clusterCidrs, serviceCIDRIpNet, secondaryServiceCIDRIpNet, tc.maskSize, tc.allocatorType) return } // This is the host process that monitors the exit code of the subprocess. diff --git a/pkg/controller/nodeipam/nolegacyprovider.go b/pkg/controller/nodeipam/nolegacyprovider.go index 8f15f4d1826..dc71e9b717f 100644 --- a/pkg/controller/nodeipam/nolegacyprovider.go +++ b/pkg/controller/nodeipam/nolegacyprovider.go @@ -20,6 +20,7 @@ limitations under the License. package nodeipam import ( + "context" "net" coreinformers "k8s.io/client-go/informers/core/v1" @@ -31,11 +32,12 @@ import ( type fakeController struct { } -func (f *fakeController) Run(stopCh <-chan struct{}) { - <-stopCh +func (f *fakeController) Run(ctx context.Context) { + <-ctx.Done() } func createLegacyIPAM( + logger klog.Logger, ic *Controller, nodeInformer coreinformers.NodeInformer, cloud cloudprovider.Interface, @@ -44,6 +46,7 @@ func createLegacyIPAM( serviceCIDR *net.IPNet, nodeCIDRMaskSizes []int, ) *fakeController { - klog.Fatal("Error trying to Init(): legacy cloud provider support disabled at build time") + logger.Error(nil, "Error trying to Init(): legacy cloud provider support disabled at build time") + klog.FlushAndExit(klog.ExitFlushTimeout, 1) return &fakeController{} } diff --git a/pkg/controller/nodelifecycle/node_lifecycle_controller.go b/pkg/controller/nodelifecycle/node_lifecycle_controller.go index df9d52b8926..f32e6b1255e 100644 --- a/pkg/controller/nodelifecycle/node_lifecycle_controller.go +++ b/pkg/controller/nodelifecycle/node_lifecycle_controller.go @@ -368,9 +368,10 @@ func NewNodeLifecycleController( unhealthyZoneThreshold float32, runTaintManager bool, ) (*Controller, error) { - + logger := klog.LoggerWithName(klog.FromContext(ctx), "NodeLifecycleController") if kubeClient == nil { - klog.Fatalf("kubeClient is nil when starting Controller") + logger.Error(nil, "kubeClient is nil when starting nodelifecycle Controller") + klog.FlushAndExit(klog.ExitFlushTimeout, 1) } eventBroadcaster := record.NewBroadcaster() @@ -428,12 +429,12 @@ func NewNodeLifecycleController( if !isPod { deletedState, ok := obj.(cache.DeletedFinalStateUnknown) if !ok { - klog.Errorf("Received unexpected object: %v", obj) + logger.Error(nil, "Received unexpected object", "object", obj) return } pod, ok = deletedState.Obj.(*v1.Pod) if !ok { - klog.Errorf("DeletedFinalStateUnknown contained non-Pod object: %v", deletedState.Obj) + logger.Error(nil, "DeletedFinalStateUnknown contained non-Pod object", "object", deletedState.Obj) return } } @@ -494,7 +495,7 @@ func NewNodeLifecycleController( }) } - klog.Infof("Controller will reconcile labels.") + logger.Info("Controller will reconcile labels") nodeInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ AddFunc: controllerutil.CreateAddNodeHandler(func(node *v1.Node) error { nc.nodeUpdateQueue.Add(node.Name) @@ -529,7 +530,8 @@ func (nc *Controller) Run(ctx context.Context) { // Start events processing pipeline. 
nc.broadcaster.StartStructuredLogging(0) - klog.Infof("Sending events to api server.") + logger := klog.FromContext(ctx) + logger.Info("Sending events to api server") nc.broadcaster.StartRecordingToSink( &v1core.EventSinkImpl{ Interface: v1core.New(nc.kubeClient.CoreV1().RESTClient()).Events(""), @@ -540,8 +542,8 @@ func (nc *Controller) Run(ctx context.Context) { defer nc.nodeUpdateQueue.ShutDown() defer nc.podUpdateQueue.ShutDown() - klog.Infof("Starting node controller") - defer klog.Infof("Shutting down node controller") + logger.Info("Starting node controller") + defer logger.Info("Shutting down node controller") if !cache.WaitForNamedCacheSync("taint", ctx.Done(), nc.leaseInformerSynced, nc.nodeInformerSynced, nc.podInformerSynced, nc.daemonSetInformerSynced) { return @@ -578,7 +580,7 @@ func (nc *Controller) Run(ctx context.Context) { // Incorporate the results of node health signal pushed from kubelet to master. go wait.UntilWithContext(ctx, func(ctx context.Context) { if err := nc.monitorNodeHealth(ctx); err != nil { - klog.Errorf("Error monitoring node health: %v", err) + logger.Error(err, "Error monitoring node health") } }, nc.nodeMonitorPeriod) @@ -586,6 +588,7 @@ func (nc *Controller) Run(ctx context.Context) { } func (nc *Controller) doNodeProcessingPassWorker(ctx context.Context) { + logger := klog.FromContext(ctx) for { obj, shutdown := nc.nodeUpdateQueue.Get() // "nodeUpdateQueue" will be shutdown when "stopCh" closed; @@ -595,13 +598,13 @@ func (nc *Controller) doNodeProcessingPassWorker(ctx context.Context) { } nodeName := obj.(string) if err := nc.doNoScheduleTaintingPass(ctx, nodeName); err != nil { - klog.Errorf("Failed to taint NoSchedule on node <%s>, requeue it: %v", nodeName, err) + logger.Error(err, "Failed to taint NoSchedule on node, requeue it", "node", klog.KRef("", nodeName)) // TODO(k82cn): Add nodeName back to the queue } // TODO: re-evaluate whether there are any labels that need to be // reconcile in 1.19. Remove this function if it's no longer necessary. if err := nc.reconcileNodeLabels(nodeName); err != nil { - klog.Errorf("Failed to reconcile labels for node <%s>, requeue it: %v", nodeName, err) + logger.Error(err, "Failed to reconcile labels for node, requeue it", "node", klog.KRef("", nodeName)) // TODO(yujuhong): Add nodeName back to the queue } nc.nodeUpdateQueue.Done(nodeName) @@ -677,6 +680,7 @@ func (nc *Controller) doNoExecuteTaintingPass(ctx context.Context) { zoneNoExecuteTainterKeys = append(zoneNoExecuteTainterKeys, k) } }() + logger := klog.FromContext(ctx) for _, k := range zoneNoExecuteTainterKeys { var zoneNoExecuteTainterWorker *scheduler.RateLimitedTimedQueue func() { @@ -689,13 +693,13 @@ func (nc *Controller) doNoExecuteTaintingPass(ctx context.Context) { zoneNoExecuteTainterWorker = nc.zoneNoExecuteTainter[k] }() // Function should return 'false' and a time after which it should be retried, or 'true' if it shouldn't (it succeeded). 
- zoneNoExecuteTainterWorker.Try(func(value scheduler.TimedValue) (bool, time.Duration) { + zoneNoExecuteTainterWorker.Try(logger, func(value scheduler.TimedValue) (bool, time.Duration) { node, err := nc.nodeLister.Get(value.Value) if apierrors.IsNotFound(err) { - klog.Warningf("Node %v no longer present in nodeLister!", value.Value) + logger.Info("Node no longer present in nodeLister", "node", klog.KRef("", value.Value)) return true, 0 } else if err != nil { - klog.Warningf("Failed to get Node %v from the nodeLister: %v", value.Value, err) + logger.Info("Failed to get Node from the nodeLister", "node", klog.KRef("", value.Value), "err", err) // retry in 50 millisecond return false, 50 * time.Millisecond } @@ -712,7 +716,7 @@ func (nc *Controller) doNoExecuteTaintingPass(ctx context.Context) { oppositeTaint = *NotReadyTaintTemplate default: // It seems that the Node is ready again, so there's no need to taint it. - klog.V(4).Infof("Node %v was in a taint queue, but it's ready now. Ignoring taint request.", value.Value) + logger.V(4).Info("Node was in a taint queue, but it's ready now. Ignoring taint request", "node", klog.KRef("", value.Value)) return true, 0 } result := controllerutil.SwapNodeControllerTaint(ctx, nc.kubeClient, []*v1.Taint{&taintToAdd}, []*v1.Taint{&oppositeTaint}, node) @@ -742,7 +746,7 @@ func (nc *Controller) doEvictionPass(ctx context.Context) { zonePodEvictorKeys = append(zonePodEvictorKeys, k) } }() - + logger := klog.FromContext(ctx) for _, k := range zonePodEvictorKeys { var zonePodEvictionWorker *scheduler.RateLimitedTimedQueue func() { @@ -756,12 +760,12 @@ func (nc *Controller) doEvictionPass(ctx context.Context) { }() // Function should return 'false' and a time after which it should be retried, or 'true' if it shouldn't (it succeeded). 
- zonePodEvictionWorker.Try(func(value scheduler.TimedValue) (bool, time.Duration) { + zonePodEvictionWorker.Try(logger, func(value scheduler.TimedValue) (bool, time.Duration) { node, err := nc.nodeLister.Get(value.Value) if apierrors.IsNotFound(err) { - klog.Warningf("Node %v no longer present in nodeLister!", value.Value) + logger.Info("Node no longer present in nodeLister", "node", klog.KRef("", value.Value)) } else if err != nil { - klog.Warningf("Failed to get Node %v from the nodeLister: %v", value.Value, err) + logger.Info("Failed to get Node from the nodeLister", "node", klog.KRef("", value.Value), "err", err) } nodeUID, _ := value.UID.(string) pods, err := nc.getPodsAssignedToNode(value.Value) @@ -778,10 +782,10 @@ func (nc *Controller) doEvictionPass(ctx context.Context) { return false, 0 } if !nc.nodeEvictionMap.setStatus(value.Value, evicted) { - klog.V(2).Infof("node %v was unregistered in the meantime - skipping setting status", value.Value) + logger.V(2).Info("Node was unregistered in the meantime - skipping setting status", "node", klog.KRef("", value.Value)) } if remaining { - klog.Infof("Pods awaiting deletion due to Controller eviction") + logger.Info("Pods awaiting deletion due to Controller eviction") } if node != nil { @@ -811,25 +815,24 @@ func (nc *Controller) monitorNodeHealth(ctx context.Context) error { return err } added, deleted, newZoneRepresentatives := nc.classifyNodes(nodes) - + logger := klog.FromContext(ctx) for i := range newZoneRepresentatives { - nc.addPodEvictorForNewZone(newZoneRepresentatives[i]) + nc.addPodEvictorForNewZone(logger, newZoneRepresentatives[i]) } - for i := range added { - klog.V(1).Infof("Controller observed a new Node: %#v", added[i].Name) + logger.V(1).Info("Controller observed a new Node", "node", klog.KRef("", added[i].Name)) controllerutil.RecordNodeEvent(nc.recorder, added[i].Name, string(added[i].UID), v1.EventTypeNormal, "RegisteredNode", fmt.Sprintf("Registered Node %v in Controller", added[i].Name)) nc.knownNodeSet[added[i].Name] = added[i] - nc.addPodEvictorForNewZone(added[i]) + nc.addPodEvictorForNewZone(logger, added[i]) if nc.runTaintManager { nc.markNodeAsReachable(ctx, added[i]) } else { - nc.cancelPodEviction(added[i]) + nc.cancelPodEviction(logger, added[i]) } } for i := range deleted { - klog.V(1).Infof("Controller observed a Node deletion: %v", deleted[i].Name) + logger.V(1).Info("Controller observed a Node deletion", "node", klog.KRef("", deleted[i].Name)) controllerutil.RecordNodeEvent(nc.recorder, deleted[i].Name, string(deleted[i].UID), v1.EventTypeNormal, "RemovingNode", fmt.Sprintf("Removing Node %v from Controller", deleted[i].Name)) delete(nc.knownNodeSet, deleted[i].Name) } @@ -856,13 +859,12 @@ func (nc *Controller) monitorNodeHealth(ctx context.Context) error { name := node.Name node, err = nc.kubeClient.CoreV1().Nodes().Get(ctx, name, metav1.GetOptions{}) if err != nil { - klog.Errorf("Failed while getting a Node to retry updating node health. Probably Node %s was deleted.", name) + logger.Error(nil, "Failed while getting a Node to retry updating node health. Probably Node was deleted", "node", klog.KRef("", name)) return false, err } return false, nil }); err != nil { - klog.Errorf("Update health of Node '%v' from Controller error: %v. 
"+ - "Skipping - no pods will be evicted.", node.Name, err) + logger.Error(err, "Update health of Node from Controller error, Skipping - no pods will be evicted", "node", klog.KObj(node)) return } @@ -925,40 +927,35 @@ func (nc *Controller) monitorNodeHealth(ctx context.Context) error { func (nc *Controller) processTaintBaseEviction(ctx context.Context, node *v1.Node, observedReadyCondition *v1.NodeCondition) { decisionTimestamp := nc.now() // Check eviction timeout against decisionTimestamp + logger := klog.FromContext(ctx) switch observedReadyCondition.Status { case v1.ConditionFalse: // We want to update the taint straight away if Node is already tainted with the UnreachableTaint if taintutils.TaintExists(node.Spec.Taints, UnreachableTaintTemplate) { taintToAdd := *NotReadyTaintTemplate if !controllerutil.SwapNodeControllerTaint(ctx, nc.kubeClient, []*v1.Taint{&taintToAdd}, []*v1.Taint{UnreachableTaintTemplate}, node) { - klog.Errorf("Failed to instantly swap UnreachableTaint to NotReadyTaint. Will try again in the next cycle.") + logger.Error(nil, "Failed to instantly swap UnreachableTaint to NotReadyTaint. Will try again in the next cycle") } } else if nc.markNodeForTainting(node, v1.ConditionFalse) { - klog.V(2).Infof("Node %v is NotReady as of %v. Adding it to the Taint queue.", - node.Name, - decisionTimestamp, - ) + logger.V(2).Info("Node is NotReady. Adding it to the Taint queue", "node", klog.KObj(node), "timeStamp", decisionTimestamp) } case v1.ConditionUnknown: // We want to update the taint straight away if Node is already tainted with the UnreachableTaint if taintutils.TaintExists(node.Spec.Taints, NotReadyTaintTemplate) { taintToAdd := *UnreachableTaintTemplate if !controllerutil.SwapNodeControllerTaint(ctx, nc.kubeClient, []*v1.Taint{&taintToAdd}, []*v1.Taint{NotReadyTaintTemplate}, node) { - klog.Errorf("Failed to instantly swap NotReadyTaint to UnreachableTaint. Will try again in the next cycle.") + logger.Error(nil, "Failed to instantly swap NotReadyTaint to UnreachableTaint. Will try again in the next cycle") } } else if nc.markNodeForTainting(node, v1.ConditionUnknown) { - klog.V(2).Infof("Node %v is unresponsive as of %v. Adding it to the Taint queue.", - node.Name, - decisionTimestamp, - ) + logger.V(2).Info("Node is unresponsive. Adding it to the Taint queue", "node", klog.KObj(node), "timeStamp", decisionTimestamp) } case v1.ConditionTrue: removed, err := nc.markNodeAsReachable(ctx, node) if err != nil { - klog.Errorf("Failed to remove taints from node %v. Will retry in next iteration.", node.Name) + logger.Error(nil, "Failed to remove taints from node. Will retry in next iteration", "node", klog.KObj(node)) } if removed { - klog.V(2).Infof("Node %s is healthy again, removing all taints", node.Name) + logger.V(2).Info("Node is healthy again, removing all taints", "node", klog.KObj(node)) } } } @@ -970,6 +967,7 @@ func (nc *Controller) processNoTaintBaseEviction(ctx context.Context, node *v1.N return fmt.Errorf("health data doesn't exist for node %q", node.Name) } // Check eviction timeout against decisionTimestamp + logger := klog.FromContext(ctx) switch observedReadyCondition.Status { case v1.ConditionFalse: if decisionTimestamp.After(nodeHealthData.readyTransitionTimestamp.Add(nc.podEvictionTimeout)) { @@ -978,11 +976,11 @@ func (nc *Controller) processNoTaintBaseEviction(ctx context.Context, node *v1.N return err } if enqueued { - klog.V(2).Infof("Node is NotReady. 
Adding Pods on Node %s to eviction queue: %v is later than %v + %v", - node.Name, - decisionTimestamp, - nodeHealthData.readyTransitionTimestamp, - nc.podEvictionTimeout, + logger.V(2).Info("Node is NotReady. Adding Pods on Node to eviction queue: decisionTimestamp is later than readyTransitionTimestamp + podEvictionTimeout", + "node", klog.KObj(node), + "decisionTimestamp", decisionTimestamp, + "readyTransitionTimestamp", nodeHealthData.readyTransitionTimestamp, + "podEvictionTimeout", nc.podEvictionTimeout, ) } } @@ -993,17 +991,17 @@ func (nc *Controller) processNoTaintBaseEviction(ctx context.Context, node *v1.N return err } if enqueued { - klog.V(2).Infof("Node is unresponsive. Adding Pods on Node %s to eviction queues: %v is later than %v + %v", - node.Name, - decisionTimestamp, - nodeHealthData.readyTransitionTimestamp, - nc.podEvictionTimeout-gracePeriod, + logger.V(2).Info("Node is unresponsive. Adding Pods on Node to eviction queues: decisionTimestamp is later than readyTransitionTimestamp + podEvictionTimeout-gracePeriod", + "node", klog.KObj(node), + "decisionTimestamp", decisionTimestamp, + "readyTransitionTimestamp", nodeHealthData.readyTransitionTimestamp, + "podEvictionTimeoutGracePeriod", nc.podEvictionTimeout-gracePeriod, ) } } case v1.ConditionTrue: - if nc.cancelPodEviction(node) { - klog.V(2).Infof("Node %s is ready again, cancelled pod eviction", node.Name) + if nc.cancelPodEviction(logger, node) { + logger.V(2).Info("Node is ready again, cancelled pod eviction", "node", klog.KObj(node)) } } return nil @@ -1076,23 +1074,23 @@ func (nc *Controller) tryUpdateNodeHealth(ctx context.Context, node *v1.Node) (t _, savedCondition = controllerutil.GetNodeCondition(nodeHealth.status, v1.NodeReady) savedLease = nodeHealth.lease } - + logger := klog.FromContext(ctx) if nodeHealth == nil { - klog.Warningf("Missing timestamp for Node %s. Assuming now as a timestamp.", node.Name) + logger.Info("Missing timestamp for Node. Assuming now as a timestamp", "node", klog.KObj(node)) nodeHealth = &nodeHealthData{ status: &node.Status, probeTimestamp: nc.now(), readyTransitionTimestamp: nc.now(), } } else if savedCondition == nil && currentReadyCondition != nil { - klog.V(1).Infof("Creating timestamp entry for newly observed Node %s", node.Name) + logger.V(1).Info("Creating timestamp entry for newly observed Node", "node", klog.KObj(node)) nodeHealth = &nodeHealthData{ status: &node.Status, probeTimestamp: nc.now(), readyTransitionTimestamp: nc.now(), } } else if savedCondition != nil && currentReadyCondition == nil { - klog.Errorf("ReadyCondition was removed from Status of Node %s", node.Name) + logger.Error(nil, "ReadyCondition was removed from Status of Node", "node", klog.KObj(node)) // TODO: figure out what to do in this case. For now we do the same thing as above. nodeHealth = &nodeHealthData{ status: &node.Status, @@ -1104,15 +1102,15 @@ func (nc *Controller) tryUpdateNodeHealth(ctx context.Context, node *v1.Node) (t // If ReadyCondition changed since the last time we checked, we update the transition timestamp to "now", // otherwise we leave it as it is. 
if savedCondition.LastTransitionTime != currentReadyCondition.LastTransitionTime { - klog.V(3).Infof("ReadyCondition for Node %s transitioned from %v to %v", node.Name, savedCondition, currentReadyCondition) + logger.V(3).Info("ReadyCondition for Node transitioned from savedCondition to currentReadyCondition", "node", klog.KObj(node), "savedCondition", savedCondition, "currentReadyCondition", currentReadyCondition) transitionTime = nc.now() } else { transitionTime = nodeHealth.readyTransitionTimestamp } - if klogV := klog.V(5); klogV.Enabled() { - klogV.Infof("Node %s ReadyCondition updated. Updating timestamp: %+v vs %+v.", node.Name, nodeHealth.status, node.Status) + if loggerV := logger.V(5); loggerV.Enabled() { + loggerV.Info("Node ReadyCondition updated. Updating timestamp", "node", klog.KObj(node), "nodeHealthStatus", nodeHealth.status, "nodeStatus", node.Status) } else { - klog.V(3).Infof("Node %s ReadyCondition updated. Updating timestamp.", node.Name) + logger.V(3).Info("Node ReadyCondition updated. Updating timestamp", "node", klog.KObj(node)) } nodeHealth = &nodeHealthData{ status: &node.Status, @@ -1147,7 +1145,7 @@ func (nc *Controller) tryUpdateNodeHealth(ctx context.Context, node *v1.Node) (t for _, nodeConditionType := range nodeConditionTypes { _, currentCondition := controllerutil.GetNodeCondition(&node.Status, nodeConditionType) if currentCondition == nil { - klog.V(2).Infof("Condition %v of node %v was never updated by kubelet", nodeConditionType, node.Name) + logger.V(2).Info("Condition of node was never updated by kubelet", "nodeConditionType", nodeConditionType, "node", klog.KObj(node)) node.Status.Conditions = append(node.Status.Conditions, v1.NodeCondition{ Type: nodeConditionType, Status: v1.ConditionUnknown, @@ -1157,8 +1155,8 @@ func (nc *Controller) tryUpdateNodeHealth(ctx context.Context, node *v1.Node) (t LastTransitionTime: nowTimestamp, }) } else { - klog.V(2).Infof("node %v hasn't been updated for %+v. 
Last %v is: %+v", - node.Name, nc.now().Time.Sub(nodeHealth.probeTimestamp.Time), nodeConditionType, currentCondition) + logger.V(2).Info("Node hasn't been updated", + "node", klog.KObj(node), "duration", nc.now().Time.Sub(nodeHealth.probeTimestamp.Time), "nodeConditionType", nodeConditionType, "currentCondition", currentCondition) if currentCondition.Status != v1.ConditionUnknown { currentCondition.Status = v1.ConditionUnknown currentCondition.Reason = "NodeStatusUnknown" @@ -1172,7 +1170,7 @@ func (nc *Controller) tryUpdateNodeHealth(ctx context.Context, node *v1.Node) (t if !apiequality.Semantic.DeepEqual(currentReadyCondition, &observedReadyCondition) { if _, err := nc.kubeClient.CoreV1().Nodes().UpdateStatus(ctx, node, metav1.UpdateOptions{}); err != nil { - klog.Errorf("Error updating node %s: %v", node.Name, err) + logger.Error(err, "Error updating node", "node", klog.KObj(node)) return gracePeriod, observedReadyCondition, currentReadyCondition, err } nodeHealth = &nodeHealthData{ @@ -1191,6 +1189,7 @@ func (nc *Controller) tryUpdateNodeHealth(ctx context.Context, node *v1.Node) (t func (nc *Controller) handleDisruption(ctx context.Context, zoneToNodeConditions map[string][]*v1.NodeCondition, nodes []*v1.Node) { newZoneStates := map[string]ZoneState{} allAreFullyDisrupted := true + logger := klog.FromContext(ctx) for k, v := range zoneToNodeConditions { zoneSize.WithLabelValues(k).Set(float64(len(v))) unhealthy, newState := nc.computeZoneStateFunc(v) @@ -1201,7 +1200,7 @@ func (nc *Controller) handleDisruption(ctx context.Context, zoneToNodeConditions } newZoneStates[k] = newState if _, had := nc.zoneStates[k]; !had { - klog.Errorf("Setting initial state for unseen zone: %v", k) + logger.Error(nil, "Setting initial state for unseen zone", "zone", k) nc.zoneStates[k] = stateInitial } } @@ -1229,15 +1228,15 @@ func (nc *Controller) handleDisruption(ctx context.Context, zoneToNodeConditions if !allAreFullyDisrupted || !allWasFullyDisrupted { // We're switching to full disruption mode if allAreFullyDisrupted { - klog.Info("Controller detected that all Nodes are not-Ready. Entering master disruption mode.") + logger.Info("Controller detected that all Nodes are not-Ready. Entering master disruption mode") for i := range nodes { if nc.runTaintManager { _, err := nc.markNodeAsReachable(ctx, nodes[i]) if err != nil { - klog.Errorf("Failed to remove taints from Node %v", nodes[i].Name) + logger.Error(nil, "Failed to remove taints from Node", "node", klog.KObj(nodes[i])) } } else { - nc.cancelPodEviction(nodes[i]) + nc.cancelPodEviction(logger, nodes[i]) } } // We stop all evictions. @@ -1256,7 +1255,7 @@ func (nc *Controller) handleDisruption(ctx context.Context, zoneToNodeConditions } // We're exiting full disruption mode if allWasFullyDisrupted { - klog.Info("Controller detected that some Nodes are Ready. Exiting master disruption mode.") + logger.Info("Controller detected that some Nodes are Ready. Exiting master disruption mode") // When exiting disruption mode update probe timestamps on all Nodes. 
now := nc.now() for i := range nodes { @@ -1279,7 +1278,7 @@ func (nc *Controller) handleDisruption(ctx context.Context, zoneToNodeConditions if v == newState { continue } - klog.Infof("Controller detected that zone %v is now in state %v.", k, newState) + logger.Info("Controller detected that zone is now in new state", "zone", k, "newState", newState) nc.setLimiterInZone(k, len(zoneToNodeConditions[k]), newState) nc.zoneStates[k] = newState } @@ -1317,12 +1316,13 @@ func (nc *Controller) doPodProcessingWorker(ctx context.Context) { func (nc *Controller) processPod(ctx context.Context, podItem podUpdateItem) { defer nc.podUpdateQueue.Done(podItem) pod, err := nc.podLister.Pods(podItem.namespace).Get(podItem.name) + logger := klog.FromContext(ctx) if err != nil { if apierrors.IsNotFound(err) { // If the pod was deleted, there is no need to requeue. return } - klog.Warningf("Failed to read pod %v/%v: %v.", podItem.namespace, podItem.name, err) + logger.Info("Failed to read pod", "pod", klog.KRef(podItem.namespace, podItem.name), "err", err) nc.podUpdateQueue.AddRateLimited(podItem) return } @@ -1338,7 +1338,7 @@ func (nc *Controller) processPod(ctx context.Context, podItem podUpdateItem) { node, err := nc.nodeLister.Get(nodeName) if err != nil { - klog.Warningf("Failed to read node %v: %v.", nodeName, err) + logger.Info("Failed to read node", "node", klog.KRef("", nodeName), "err", err) nc.podUpdateQueue.AddRateLimited(podItem) return } @@ -1356,7 +1356,7 @@ func (nc *Controller) processPod(ctx context.Context, podItem podUpdateItem) { // Pods are processed by TaintManager. if !nc.runTaintManager { if err := nc.processNoTaintBaseEviction(ctx, node, currentReadyCondition, nc.nodeMonitorGracePeriod, pods); err != nil { - klog.Warningf("Unable to process pod %+v eviction from node %v: %v.", podItem, nodeName, err) + logger.Info("Unable to process pod eviction from node", "pod", klog.KRef(podItem.namespace, podItem.name), "node", klog.KRef("", nodeName), "err", err) nc.podUpdateQueue.AddRateLimited(podItem) return } @@ -1364,7 +1364,7 @@ func (nc *Controller) processPod(ctx context.Context, podItem podUpdateItem) { if currentReadyCondition.Status != v1.ConditionTrue { if err := controllerutil.MarkPodsNotReady(ctx, nc.kubeClient, nc.recorder, pods, nodeName); err != nil { - klog.Warningf("Unable to mark pod %+v NotReady on node %v: %v.", podItem, nodeName, err) + logger.Info("Unable to mark pod NotReady on node", "pod", klog.KRef(podItem.namespace, podItem.name), "node", klog.KRef("", nodeName), "err", err) nc.podUpdateQueue.AddRateLimited(podItem) } } @@ -1447,7 +1447,7 @@ func (nc *Controller) ReducedQPSFunc(nodeNum int) float32 { } // addPodEvictorForNewZone checks if new zone appeared, and if so add new evictor. -func (nc *Controller) addPodEvictorForNewZone(node *v1.Node) { +func (nc *Controller) addPodEvictorForNewZone(logger klog.Logger, node *v1.Node) { nc.evictorLock.Lock() defer nc.evictorLock.Unlock() zone := nodetopology.GetZoneKey(node) @@ -1463,7 +1463,7 @@ func (nc *Controller) addPodEvictorForNewZone(node *v1.Node) { flowcontrol.NewTokenBucketRateLimiter(nc.evictionLimiterQPS, scheduler.EvictionRateLimiterBurst)) } // Init the metric for the new zone. 
- klog.Infof("Initializing eviction metric for zone: %v", zone) + logger.Info("Initializing eviction metric for zone", "zone", zone) evictionsNumber.WithLabelValues(zone).Add(0) evictionsTotal.WithLabelValues(zone).Add(0) } @@ -1471,16 +1471,16 @@ func (nc *Controller) addPodEvictorForNewZone(node *v1.Node) { // cancelPodEviction removes any queued evictions, typically because the node is available again. It // returns true if an eviction was queued. -func (nc *Controller) cancelPodEviction(node *v1.Node) bool { +func (nc *Controller) cancelPodEviction(logger klog.Logger, node *v1.Node) bool { zone := nodetopology.GetZoneKey(node) if !nc.nodeEvictionMap.setStatus(node.Name, unmarked) { - klog.V(2).Infof("node %v was unregistered in the meantime - skipping setting status", node.Name) + logger.V(2).Info("Node was unregistered in the meantime - skipping setting status", "node", klog.KObj(node)) } nc.evictorLock.Lock() defer nc.evictorLock.Unlock() wasDeleting := nc.zonePodEvictor[zone].Remove(node.Name) if wasDeleting { - klog.V(2).Infof("Cancelling pod Eviction on Node: %v", node.Name) + logger.V(2).Info("Cancelling pod Eviction on Node", "node", klog.KObj(node)) return true } return false @@ -1502,8 +1502,9 @@ func (nc *Controller) evictPods(ctx context.Context, node *v1.Node, pods []*v1.P } return false, nil } + logger := klog.FromContext(ctx) if !nc.nodeEvictionMap.setStatus(node.Name, toBeEvicted) { - klog.V(2).Infof("node %v was unregistered in the meantime - skipping setting status", node.Name) + logger.V(2).Info("Node was unregistered in the meantime - skipping setting status", "node", klog.KObj(node)) } nc.evictorLock.Lock() @@ -1532,13 +1533,14 @@ func (nc *Controller) markNodeForTainting(node *v1.Node, status v1.ConditionStat func (nc *Controller) markNodeAsReachable(ctx context.Context, node *v1.Node) (bool, error) { err := controller.RemoveTaintOffNode(ctx, nc.kubeClient, node.Name, node, UnreachableTaintTemplate) + logger := klog.FromContext(ctx) if err != nil { - klog.Errorf("Failed to remove taint from node %v: %v", node.Name, err) + logger.Error(err, "Failed to remove taint from node", "node", klog.KObj(node)) return false, err } err = controller.RemoveTaintOffNode(ctx, nc.kubeClient, node.Name, node, NotReadyTaintTemplate) if err != nil { - klog.Errorf("Failed to remove taint from node %v: %v", node.Name, err) + logger.Error(err, "Failed to remove taint from node", "node", klog.KObj(node)) return false, err } nc.evictorLock.Lock() diff --git a/pkg/controller/nodelifecycle/node_lifecycle_controller_test.go b/pkg/controller/nodelifecycle/node_lifecycle_controller_test.go index de4bff00802..f9944d1b563 100644 --- a/pkg/controller/nodelifecycle/node_lifecycle_controller_test.go +++ b/pkg/controller/nodelifecycle/node_lifecycle_controller_test.go @@ -41,6 +41,8 @@ import ( clientset "k8s.io/client-go/kubernetes" "k8s.io/client-go/kubernetes/fake" testcore "k8s.io/client-go/testing" + "k8s.io/klog/v2" + "k8s.io/klog/v2/ktesting" kubeletapis "k8s.io/kubelet/pkg/apis" "k8s.io/kubernetes/pkg/controller" "k8s.io/kubernetes/pkg/controller/nodelifecycle/scheduler" @@ -88,12 +90,12 @@ type nodeLifecycleController struct { } // doEviction does the fake eviction and returns the status of eviction operation. 
-func (nc *nodeLifecycleController) doEviction(fakeNodeHandler *testutil.FakeNodeHandler) bool { +func (nc *nodeLifecycleController) doEviction(logger klog.Logger, fakeNodeHandler *testutil.FakeNodeHandler) bool { nc.evictorLock.Lock() defer nc.evictorLock.Unlock() zones := testutil.GetZones(fakeNodeHandler) for _, zone := range zones { - nc.zonePodEvictor[zone].Try(func(value scheduler.TimedValue) (bool, time.Duration) { + nc.zonePodEvictor[zone].Try(logger, func(value scheduler.TimedValue) (bool, time.Duration) { uid, _ := value.UID.(string) pods, _ := nc.getPodsAssignedToNode(value.Value) controllerutil.DeletePods(context.TODO(), fakeNodeHandler, pods, nc.recorder, value.Value, uid, nc.daemonSetStore) @@ -721,9 +723,10 @@ func TestMonitorNodeHealthEvictPods(t *testing.T) { t.Errorf("unexpected error: %v", err) } zones := testutil.GetZones(item.fakeNodeHandler) + logger, _ := ktesting.NewTestContext(t) for _, zone := range zones { if _, ok := nodeController.zonePodEvictor[zone]; ok { - nodeController.zonePodEvictor[zone].Try(func(value scheduler.TimedValue) (bool, time.Duration) { + nodeController.zonePodEvictor[zone].Try(logger, func(value scheduler.TimedValue) (bool, time.Duration) { nodeUID, _ := value.UID.(string) pods, err := nodeController.getPodsAssignedToNode(value.Value) if err != nil { @@ -883,8 +886,9 @@ func TestPodStatusChange(t *testing.T) { t.Errorf("unexpected error: %v", err) } zones := testutil.GetZones(item.fakeNodeHandler) + logger, _ := ktesting.NewTestContext(t) for _, zone := range zones { - nodeController.zonePodEvictor[zone].Try(func(value scheduler.TimedValue) (bool, time.Duration) { + nodeController.zonePodEvictor[zone].Try(logger, func(value scheduler.TimedValue) (bool, time.Duration) { nodeUID, _ := value.UID.(string) pods, err := nodeController.getPodsAssignedToNode(value.Value) if err != nil { @@ -1467,8 +1471,9 @@ func TestMonitorNodeHealthEvictPodsWithDisruption(t *testing.T) { // Infinite loop, used for retrying in case ratelimiter fails to reload for Try function. // this breaks when we have the status that we need for test case or when we don't see the // intended result after 1 minute. + logger, _ := ktesting.NewTestContext(t) for { - podEvicted = nodeController.doEviction(fakeNodeHandler) + podEvicted = nodeController.doEviction(logger, fakeNodeHandler) if podEvicted == item.expectedEvictPods || time.Since(start) > 1*time.Minute { break } diff --git a/pkg/controller/nodelifecycle/scheduler/rate_limited_queue.go b/pkg/controller/nodelifecycle/scheduler/rate_limited_queue.go index e343fa59667..289dc019c74 100644 --- a/pkg/controller/nodelifecycle/scheduler/rate_limited_queue.go +++ b/pkg/controller/nodelifecycle/scheduler/rate_limited_queue.go @@ -23,7 +23,6 @@ import ( "k8s.io/apimachinery/pkg/util/sets" "k8s.io/client-go/util/flowcontrol" - "k8s.io/klog/v2" ) @@ -229,14 +228,14 @@ type ActionFunc func(TimedValue) (bool, time.Duration) // function in NodeController when Node becomes Ready again) TODO: // figure out a good way to do garbage collection for all Nodes that // were removed from the cluster. 
-func (q *RateLimitedTimedQueue) Try(fn ActionFunc) { +func (q *RateLimitedTimedQueue) Try(logger klog.Logger, fn ActionFunc) { val, ok := q.queue.Head() q.limiterLock.Lock() defer q.limiterLock.Unlock() for ok { // rate limit the queue checking if !q.limiter.TryAccept() { - klog.V(10).Infof("Try rate limited for value: %v", val) + logger.V(10).Info("Try rate limited", "value", val) // Try again later break } diff --git a/pkg/controller/nodelifecycle/scheduler/rate_limited_queue_test.go b/pkg/controller/nodelifecycle/scheduler/rate_limited_queue_test.go index f54b488e6c6..b5565f361a8 100644 --- a/pkg/controller/nodelifecycle/scheduler/rate_limited_queue_test.go +++ b/pkg/controller/nodelifecycle/scheduler/rate_limited_queue_test.go @@ -23,6 +23,7 @@ import ( "k8s.io/apimachinery/pkg/util/sets" "k8s.io/client-go/util/flowcontrol" + "k8s.io/klog/v2/ktesting" ) func CheckQueueEq(lhs []string, rhs TimedQueue) bool { @@ -144,7 +145,8 @@ func TestTry(t *testing.T) { evictor.Remove("second") deletedMap := sets.NewString() - evictor.Try(func(value TimedValue) (bool, time.Duration) { + logger, _ := ktesting.NewTestContext(t) + evictor.Try(logger, func(value TimedValue) (bool, time.Duration) { deletedMap.Insert(value.Value) return true, 0 }) @@ -180,7 +182,8 @@ func TestTryOrdering(t *testing.T) { order := []string{} count := 0 hasQueued := false - evictor.Try(func(value TimedValue) (bool, time.Duration) { + logger, _ := ktesting.NewTestContext(t) + evictor.Try(logger, func(value TimedValue) (bool, time.Duration) { count++ t.Logf("eviction %d", count) if value.ProcessAt.IsZero() { @@ -242,8 +245,8 @@ func TestTryRemovingWhileTry(t *testing.T) { evictor.Remove("second") close(wait) }() - - evictor.Try(func(value TimedValue) (bool, time.Duration) { + logger, _ := ktesting.NewTestContext(t) + evictor.Try(logger, func(value TimedValue) (bool, time.Duration) { count++ if value.AddedAt.IsZero() { t.Fatalf("added should not be zero") @@ -313,7 +316,8 @@ func TestAddAfterTry(t *testing.T) { evictor.Remove("second") deletedMap := sets.NewString() - evictor.Try(func(value TimedValue) (bool, time.Duration) { + logger, _ := ktesting.NewTestContext(t) + evictor.Try(logger, func(value TimedValue) (bool, time.Duration) { deletedMap.Insert(value.Value) return true, 0 }) @@ -327,7 +331,7 @@ func TestAddAfterTry(t *testing.T) { } evictor.Add("first", "11111") - evictor.Try(func(value TimedValue) (bool, time.Duration) { + evictor.Try(logger, func(value TimedValue) (bool, time.Duration) { t.Errorf("We shouldn't process the same value if the explicit remove wasn't called.") return true, 0 }) diff --git a/pkg/controller/nodelifecycle/scheduler/taint_manager.go b/pkg/controller/nodelifecycle/scheduler/taint_manager.go index f147a44aadb..6c25f0a96ac 100644 --- a/pkg/controller/nodelifecycle/scheduler/taint_manager.go +++ b/pkg/controller/nodelifecycle/scheduler/taint_manager.go @@ -105,7 +105,7 @@ func deletePodHandler(c clientset.Interface, emitEventFunc func(types.Namespaced return func(ctx context.Context, args *WorkArgs) error { ns := args.NamespacedName.Namespace name := args.NamespacedName.Name - klog.InfoS("NoExecuteTaintManager is deleting pod", "pod", args.NamespacedName.String()) + klog.FromContext(ctx).Info("NoExecuteTaintManager is deleting pod", "pod", args.NamespacedName.String()) if emitEventFunc != nil { emitEventFunc(args.NamespacedName) } @@ -202,16 +202,17 @@ func NewNoExecuteTaintManager(ctx context.Context, c clientset.Interface, podLis // Run starts NoExecuteTaintManager which will run in loop until 
`stopCh` is closed. func (tc *NoExecuteTaintManager) Run(ctx context.Context) { defer utilruntime.HandleCrash() - - klog.InfoS("Starting NoExecuteTaintManager") + logger := klog.FromContext(ctx) + logger.Info("Starting NoExecuteTaintManager") // Start events processing pipeline. tc.broadcaster.StartStructuredLogging(0) if tc.client != nil { - klog.InfoS("Sending events to api server") + logger.Info("Sending events to api server") tc.broadcaster.StartRecordingToSink(&v1core.EventSinkImpl{Interface: tc.client.CoreV1().Events("")}) } else { - klog.Fatalf("kubeClient is nil when starting NodeController") + logger.Error(nil, "kubeClient is nil when starting NodeController") + klog.FlushAndExit(klog.ExitFlushTimeout, 1) } defer tc.broadcaster.Shutdown() @@ -363,8 +364,8 @@ func (tc *NoExecuteTaintManager) NodeUpdated(oldNode *v1.Node, newNode *v1.Node) tc.nodeUpdateQueue.Add(updateItem) } -func (tc *NoExecuteTaintManager) cancelWorkWithEvent(nsName types.NamespacedName) { - if tc.taintEvictionQueue.CancelWork(nsName.String()) { +func (tc *NoExecuteTaintManager) cancelWorkWithEvent(logger klog.Logger, nsName types.NamespacedName) { + if tc.taintEvictionQueue.CancelWork(logger, nsName.String()) { tc.emitCancelPodDeletionEvent(nsName) } } @@ -377,22 +378,23 @@ func (tc *NoExecuteTaintManager) processPodOnNode( taints []v1.Taint, now time.Time, ) { + logger := klog.FromContext(ctx) if len(taints) == 0 { - tc.cancelWorkWithEvent(podNamespacedName) + tc.cancelWorkWithEvent(logger, podNamespacedName) } allTolerated, usedTolerations := v1helper.GetMatchingTolerations(taints, tolerations) if !allTolerated { - klog.V(2).InfoS("Not all taints are tolerated after update for pod on node", "pod", podNamespacedName.String(), "node", klog.KRef("", nodeName)) + logger.V(2).Info("Not all taints are tolerated after update for pod on node", "pod", podNamespacedName.String(), "node", klog.KRef("", nodeName)) // We're canceling scheduled work (if any), as we're going to delete the Pod right away. - tc.cancelWorkWithEvent(podNamespacedName) + tc.cancelWorkWithEvent(logger, podNamespacedName) tc.taintEvictionQueue.AddWork(ctx, NewWorkArgs(podNamespacedName.Name, podNamespacedName.Namespace), time.Now(), time.Now()) return } minTolerationTime := getMinTolerationTime(usedTolerations) // getMinTolerationTime returns negative value to denote infinite toleration. 
if minTolerationTime < 0 { - klog.V(4).InfoS("Current tolerations for pod tolerate forever, cancelling any scheduled deletion", "pod", podNamespacedName.String()) - tc.cancelWorkWithEvent(podNamespacedName) + logger.V(4).Info("Current tolerations for pod tolerate forever, cancelling any scheduled deletion", "pod", podNamespacedName.String()) + tc.cancelWorkWithEvent(logger, podNamespacedName) return } @@ -404,19 +406,20 @@ func (tc *NoExecuteTaintManager) processPodOnNode( if startTime.Add(minTolerationTime).Before(triggerTime) { return } - tc.cancelWorkWithEvent(podNamespacedName) + tc.cancelWorkWithEvent(logger, podNamespacedName) } tc.taintEvictionQueue.AddWork(ctx, NewWorkArgs(podNamespacedName.Name, podNamespacedName.Namespace), startTime, triggerTime) } func (tc *NoExecuteTaintManager) handlePodUpdate(ctx context.Context, podUpdate podUpdateItem) { pod, err := tc.podLister.Pods(podUpdate.podNamespace).Get(podUpdate.podName) + logger := klog.FromContext(ctx) if err != nil { if apierrors.IsNotFound(err) { // Delete podNamespacedName := types.NamespacedName{Namespace: podUpdate.podNamespace, Name: podUpdate.podName} - klog.V(4).InfoS("Noticed pod deletion", "pod", podNamespacedName) - tc.cancelWorkWithEvent(podNamespacedName) + logger.V(4).Info("Noticed pod deletion", "pod", podNamespacedName) + tc.cancelWorkWithEvent(logger, podNamespacedName) return } utilruntime.HandleError(fmt.Errorf("could not get pod %s/%s: %v", podUpdate.podName, podUpdate.podNamespace, err)) @@ -430,7 +433,7 @@ func (tc *NoExecuteTaintManager) handlePodUpdate(ctx context.Context, podUpdate // Create or Update podNamespacedName := types.NamespacedName{Namespace: pod.Namespace, Name: pod.Name} - klog.V(4).InfoS("Noticed pod update", "pod", podNamespacedName) + logger.V(4).Info("Noticed pod update", "pod", podNamespacedName) nodeName := pod.Spec.NodeName if nodeName == "" { return @@ -451,10 +454,11 @@ func (tc *NoExecuteTaintManager) handlePodUpdate(ctx context.Context, podUpdate func (tc *NoExecuteTaintManager) handleNodeUpdate(ctx context.Context, nodeUpdate nodeUpdateItem) { node, err := tc.nodeLister.Get(nodeUpdate.nodeName) + logger := klog.FromContext(ctx) if err != nil { if apierrors.IsNotFound(err) { // Delete - klog.V(4).InfoS("Noticed node deletion", "node", nodeUpdate.nodeName) + logger.V(4).Info("Noticed node deletion", "node", klog.KRef("", nodeUpdate.nodeName)) tc.taintedNodesLock.Lock() defer tc.taintedNodesLock.Unlock() delete(tc.taintedNodes, nodeUpdate.nodeName) @@ -465,12 +469,12 @@ func (tc *NoExecuteTaintManager) handleNodeUpdate(ctx context.Context, nodeUpdat } // Create or Update - klog.V(4).InfoS("Noticed node update", "node", nodeUpdate) + logger.V(4).Info("Noticed node update", "node", klog.KObj(node)) taints := getNoExecuteTaints(node.Spec.Taints) func() { tc.taintedNodesLock.Lock() defer tc.taintedNodesLock.Unlock() - klog.V(4).InfoS("Updating known taints on node", "node", node.Name, "taints", taints) + logger.V(4).Info("Updating known taints on node", "node", klog.KObj(node), "taints", taints) if len(taints) == 0 { delete(tc.taintedNodes, node.Name) } else { @@ -483,7 +487,7 @@ func (tc *NoExecuteTaintManager) handleNodeUpdate(ctx context.Context, nodeUpdat // tc.PodUpdated which will use tc.taintedNodes to potentially delete delayed pods. 
pods, err := tc.getPodsAssignedToNode(node.Name) if err != nil { - klog.ErrorS(err, "Failed to get pods assigned to node", "node", node.Name) + logger.Error(err, "Failed to get pods assigned to node", "node", klog.KObj(node)) return } if len(pods) == 0 { @@ -491,9 +495,9 @@ func (tc *NoExecuteTaintManager) handleNodeUpdat } // Short circuit, to make this controller a bit faster. if len(taints) == 0 { - klog.V(4).InfoS("All taints were removed from the node. Cancelling all evictions...", "node", node.Name) + logger.V(4).Info("All taints were removed from the node. Cancelling all evictions...", "node", klog.KObj(node)) for i := range pods { - tc.cancelWorkWithEvent(logger, types.NamespacedName{Namespace: pods[i].Namespace, Name: pods[i].Name}) } return } diff --git a/pkg/controller/nodelifecycle/scheduler/timed_workers.go b/pkg/controller/nodelifecycle/scheduler/timed_workers.go index 58b22fd10d0..0024a0c7c50 100644 --- a/pkg/controller/nodelifecycle/scheduler/timed_workers.go +++ b/pkg/controller/nodelifecycle/scheduler/timed_workers.go @@ -52,10 +52,11 @@ type TimedWorker struct { // createWorker creates a TimedWorker that will execute `f` not earlier than `fireAt`. func createWorker(ctx context.Context, args *WorkArgs, createdAt time.Time, fireAt time.Time, f func(ctx context.Context, args *WorkArgs) error, clock clock.WithDelayedExecution) *TimedWorker { delay := fireAt.Sub(createdAt) + logger := klog.FromContext(ctx) fWithErrorLogging := func() { err := f(ctx, args) if err != nil { - klog.Errorf("NodeLifecycle: timed worker failed with error: %q", err) + logger.Error(err, "NodeLifecycle: timed worker failed") } } if delay <= 0 { @@ -116,12 +117,13 @@ func (q *TimedWorkerQueue) getWrappedWorkerFunc(key string) func(ctx context.Con // AddWork adds a work to the WorkerQueue which will be executed not earlier than `fireAt`. func (q *TimedWorkerQueue) AddWork(ctx context.Context, args *WorkArgs, createdAt time.Time, fireAt time.Time) { key := args.KeyFromWorkArgs() - klog.V(4).Infof("Adding TimedWorkerQueue item %v at %v to be fired at %v", key, createdAt, fireAt) + logger := klog.FromContext(ctx) + logger.V(4).Info("Adding TimedWorkerQueue item to be fired at firedTime", "item", key, "createTime", createdAt, "firedTime", fireAt) q.Lock() defer q.Unlock() if _, exists := q.workers[key]; exists { - klog.Warningf("Trying to add already existing work for %+v. Skipping.", args) + logger.Info("Trying to add already existing work, skipping", "args", args) return } worker := createWorker(ctx, args, createdAt, fireAt, q.getWrappedWorkerFunc(key), q.clock) @@ -129,13 +131,13 @@ func (q *TimedWorkerQueue) AddWork(ctx context.Context, args *WorkArgs, createdA } // CancelWork removes scheduled function execution from the queue. Returns true if work was cancelled. 
-func (q *TimedWorkerQueue) CancelWork(key string) bool { +func (q *TimedWorkerQueue) CancelWork(logger klog.Logger, key string) bool { q.Lock() defer q.Unlock() worker, found := q.workers[key] result := false if found { - klog.V(4).Infof("Cancelling TimedWorkerQueue item %v at %v", key, time.Now()) + logger.V(4).Info("Cancelling TimedWorkerQueue item", "item", key, "time", time.Now()) if worker != nil { result = true worker.Cancel() diff --git a/pkg/controller/nodelifecycle/scheduler/timed_workers_test.go b/pkg/controller/nodelifecycle/scheduler/timed_workers_test.go index 63371757429..61e96713891 100644 --- a/pkg/controller/nodelifecycle/scheduler/timed_workers_test.go +++ b/pkg/controller/nodelifecycle/scheduler/timed_workers_test.go @@ -23,6 +23,7 @@ import ( "testing" "time" + "k8s.io/klog/v2/ktesting" testingclock "k8s.io/utils/clock/testing" ) @@ -108,8 +109,9 @@ func TestCancel(t *testing.T) { queue.AddWork(context.TODO(), NewWorkArgs("3", "3"), now, then) queue.AddWork(context.TODO(), NewWorkArgs("4", "4"), now, then) queue.AddWork(context.TODO(), NewWorkArgs("5", "5"), now, then) - queue.CancelWork(NewWorkArgs("2", "2").KeyFromWorkArgs()) - queue.CancelWork(NewWorkArgs("4", "4").KeyFromWorkArgs()) + logger, _ := ktesting.NewTestContext(t) + queue.CancelWork(logger, NewWorkArgs("2", "2").KeyFromWorkArgs()) + queue.CancelWork(logger, NewWorkArgs("4", "4").KeyFromWorkArgs()) fakeClock.Step(11 * time.Second) wg.Wait() lastVal := atomic.LoadInt32(&testVal) @@ -141,8 +143,9 @@ func TestCancelAndReadd(t *testing.T) { queue.AddWork(context.TODO(), NewWorkArgs("3", "3"), now, then) queue.AddWork(context.TODO(), NewWorkArgs("4", "4"), now, then) queue.AddWork(context.TODO(), NewWorkArgs("5", "5"), now, then) - queue.CancelWork(NewWorkArgs("2", "2").KeyFromWorkArgs()) - queue.CancelWork(NewWorkArgs("4", "4").KeyFromWorkArgs()) + logger, _ := ktesting.NewTestContext(t) + queue.CancelWork(logger, NewWorkArgs("2", "2").KeyFromWorkArgs()) + queue.CancelWork(logger, NewWorkArgs("4", "4").KeyFromWorkArgs()) queue.AddWork(context.TODO(), NewWorkArgs("2", "2"), now, then) fakeClock.Step(11 * time.Second) wg.Wait() diff --git a/test/integration/clustercidr/ipam_test.go b/test/integration/clustercidr/ipam_test.go index 983c81d311c..1f6a1798856 100644 --- a/test/integration/clustercidr/ipam_test.go +++ b/test/integration/clustercidr/ipam_test.go @@ -33,6 +33,7 @@ import ( "k8s.io/client-go/informers" clientset "k8s.io/client-go/kubernetes" featuregatetesting "k8s.io/component-base/featuregate/testing" + "k8s.io/klog/v2/ktesting" "k8s.io/kubernetes/cmd/kube-apiserver/app/options" "k8s.io/kubernetes/pkg/controller/nodeipam" "k8s.io/kubernetes/pkg/controller/nodeipam/ipam" @@ -63,7 +64,7 @@ func TestIPAMMultiCIDRRangeAllocatorCIDRAllocate(t *testing.T) { ctx, cancel := context.WithCancel(context.Background()) defer cancel() - go ipamController.Run(ctx.Done()) + go ipamController.Run(ctx) sharedInformer.Start(ctx.Done()) tests := []struct { @@ -149,7 +150,7 @@ func TestIPAMMultiCIDRRangeAllocatorCIDRRelease(t *testing.T) { ctx, cancel := context.WithCancel(context.Background()) defer cancel() - go ipamController.Run(ctx.Done()) + go ipamController.Run(ctx) sharedInformer.Start(ctx.Done()) t.Run("Pod CIDR release after node delete", func(t *testing.T) { @@ -226,7 +227,7 @@ func TestIPAMMultiCIDRRangeAllocatorClusterCIDRDelete(t *testing.T) { ctx, cancel := context.WithCancel(context.Background()) defer cancel() - go ipamController.Run(ctx.Done()) + go ipamController.Run(ctx) 
sharedInformer.Start(ctx.Done()) t.Run("delete cc with node associated", func(t *testing.T) { @@ -322,7 +323,7 @@ func TestIPAMMultiCIDRRangeAllocatorClusterCIDRTerminate(t *testing.T) { ctx, cancel := context.WithCancel(context.Background()) defer cancel() - go ipamController.Run(ctx.Done()) + go ipamController.Run(ctx) sharedInformer.Start(ctx.Done()) t.Run("Pod CIDRS must not be allocated from a terminating CC", func(t *testing.T) { @@ -407,7 +408,7 @@ func TestIPAMMultiCIDRRangeAllocatorClusterCIDRTieBreak(t *testing.T) { ctx, cancel := context.WithCancel(context.Background()) defer cancel() - go ipamController.Run(ctx.Done()) + go ipamController.Run(ctx) sharedInformer.Start(ctx.Done()) tests := []struct { @@ -557,8 +558,9 @@ func booststrapMultiCIDRRangeAllocator(t *testing.T, if _, err := clientSet.CoreV1().Nodes().Create(context.TODO(), initialNode, metav1.CreateOptions{}); err != nil { t.Fatal(err) } - + _, ctx := ktesting.NewTestContext(t) ipamController, err := nodeipam.NewNodeIpamController( + ctx, sharedInformer.Core().V1().Nodes(), sharedInformer.Networking().V1alpha1().ClusterCIDRs(), nil, diff --git a/test/integration/ipamperf/ipam_test.go b/test/integration/ipamperf/ipam_test.go index a0f35627978..2380ddcbb55 100644 --- a/test/integration/ipamperf/ipam_test.go +++ b/test/integration/ipamperf/ipam_test.go @@ -17,6 +17,7 @@ limitations under the License. package ipamperf import ( + "context" "encoding/json" "fmt" "net" @@ -25,6 +26,7 @@ import ( "time" "k8s.io/klog/v2" + "k8s.io/klog/v2/ktesting" netutils "k8s.io/utils/net" "k8s.io/client-go/informers" @@ -37,7 +39,7 @@ import ( "k8s.io/kubernetes/test/integration/util" ) -func setupAllocator(kubeConfig *restclient.Config, config *Config, clusterCIDR, serviceCIDR *net.IPNet, subnetMaskSize int) (*clientset.Clientset, util.ShutdownFunc, error) { +func setupAllocator(ctx context.Context, kubeConfig *restclient.Config, config *Config, clusterCIDR, serviceCIDR *net.IPNet, subnetMaskSize int) (*clientset.Clientset, util.ShutdownFunc, error) { controllerStopChan := make(chan struct{}) shutdownFunc := func() { close(controllerStopChan) @@ -50,6 +52,7 @@ func setupAllocator(kubeConfig *restclient.Config, config *Config, clusterCIDR, sharedInformer := informers.NewSharedInformerFactory(clientSet, 1*time.Hour) ipamController, err := nodeipam.NewNodeIpamController( + ctx, sharedInformer.Core().V1().Nodes(), sharedInformer.Networking().V1alpha1().ClusterCIDRs(), config.Cloud, clientSet, []*net.IPNet{clusterCIDR}, serviceCIDR, nil, @@ -58,7 +61,7 @@ func setupAllocator(kubeConfig *restclient.Config, config *Config, clusterCIDR, if err != nil { return nil, shutdownFunc, err } - go ipamController.Run(controllerStopChan) + go ipamController.Run(ctx) sharedInformer.Start(controllerStopChan) return clientSet, shutdownFunc, nil @@ -74,8 +77,8 @@ func runTest(t *testing.T, kubeConfig *restclient.Config, config *Config, cluste nodeClient := clientset.NewForConfigOrDie(nodeClientConfig) defer deleteNodes(nodeClient) // cleanup nodes on after controller shutdown - - clientSet, shutdownFunc, err := setupAllocator(kubeConfig, config, clusterCIDR, serviceCIDR, subnetMaskSize) + _, ctx := ktesting.NewTestContext(t) + clientSet, shutdownFunc, err := setupAllocator(ctx, kubeConfig, config, clusterCIDR, serviceCIDR, subnetMaskSize) if err != nil { t.Fatalf("Error starting IPAM allocator: %v", err) }
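Illustration (not part of the diff): the scheduler helpers above now take an explicitly injected logger instead of using the global klog calls. The sketch below shows the resulting calling convention as a test in the same scheduler package; the test name, node value, and UID are made up, while ktesting.NewTestContext, flowcontrol.NewFakeAlwaysRateLimiter, and the new Try(logger, fn) signature come from the changes themselves.

package scheduler

import (
	"testing"
	"time"

	"k8s.io/client-go/util/flowcontrol"
	"k8s.io/klog/v2/ktesting"
)

// TestTryWithInjectedLogger is an illustrative sketch (hypothetical name): tests build a
// per-test logger with ktesting and hand it to Try rather than relying on klog globals.
func TestTryWithInjectedLogger(t *testing.T) {
	logger, _ := ktesting.NewTestContext(t)

	evictor := NewRateLimitedTimedQueue(flowcontrol.NewFakeAlwaysRateLimiter())
	evictor.Add("node-a", "uid-a") // value and UID are arbitrary test data

	var processed []string
	evictor.Try(logger, func(value TimedValue) (bool, time.Duration) {
		processed = append(processed, value.Value)
		return true, 0 // handled; no retry needed
	})

	if len(processed) != 1 || processed[0] != "node-a" {
		t.Fatalf("expected exactly one processed value %q, got %v", "node-a", processed)
	}
}

Production callers follow the same pattern with a context-derived logger, e.g. obtaining it via klog.FromContext(ctx) before invoking Try, as the node lifecycle controller does in its tainting and eviction passes.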