nodeResourceSlicesController: add exponential backoff
@@ -37,6 +37,7 @@ import (
 	resourceinformers "k8s.io/client-go/informers/resource/v1alpha2"
 	"k8s.io/client-go/kubernetes"
 	"k8s.io/client-go/tools/cache"
+	"k8s.io/client-go/util/flowcontrol"
 	"k8s.io/client-go/util/workqueue"
 	"k8s.io/klog/v2"
 	drapb "k8s.io/kubelet/pkg/apis/dra/v1alpha3"
@@ -46,7 +47,10 @@ import (
 const (
 	// resyncPeriod for informer
 	// TODO (https://github.com/kubernetes/kubernetes/issues/123688): disable?
-	resyncPeriod = time.Duration(10 * time.Minute)
+	resyncPeriod   = time.Duration(10 * time.Minute)
+	retryPeriod    = 5 * time.Second
+	maxRetryPeriod = 180 * time.Second
+	backoffFactor  = 2.0 // Introduce a backoff multiplier as jitter factor
 )
 
 // nodeResourcesController collects resource information from all registered
@@ -185,6 +189,9 @@ func (c *nodeResourcesController) monitorPlugin(ctx context.Context, active *act
 		logger.Info("Stopping to monitor node resources of the plugin", "reason", context.Cause(ctx), "err", ctx.Err(), "recover", r)
 	}()
 
+	backOff := flowcontrol.NewBackOffWithJitter(retryPeriod, maxRetryPeriod, backoffFactor)
+	backOffID := "retry"
+
 	// Keep trying until canceled.
 	for ctx.Err() == nil {
 		logger.V(5).Info("Calling NodeListAndWatchResources")
@@ -197,9 +204,9 @@ func (c *nodeResourcesController) monitorPlugin(ctx context.Context, active *act
 			default:
 				// This is a problem, report it and retry.
 				logger.Error(err, "Creating gRPC stream for node resources failed")
-				// TODO (https://github.com/kubernetes/kubernetes/issues/123689): expontential backoff?
 				select {
-				case <-time.After(5 * time.Second):
+				case <-time.After(backOff.Get(backOffID)):
+					backOff.Next(backOffID, time.Now())
 				case <-ctx.Done():
 				}
 			}
@@ -219,9 +226,9 @@ func (c *nodeResourcesController) monitorPlugin(ctx context.Context, active *act
 				case ctx.Err() == nil:
 					// This is a problem, report it and retry.
 					logger.Error(err, "Reading node resources from gRPC stream failed")
-					// TODO (https://github.com/kubernetes/kubernetes/issues/123689): expontential backoff?
 					select {
-					case <-time.After(5 * time.Second):
+					case <-time.After(backOff.Get(backOffID)):
+						backOff.Next(backOffID, time.Now())
 					case <-ctx.Done():
 					}
 				}
 
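For illustration, the retry pattern introduced by this commit can be exercised on its own. The sketch below relies only on the flowcontrol.NewBackOffWithJitter, Get, and Next calls shown in the diff; listAndWatch is a hypothetical stand-in for creating the NodeListAndWatchResources stream, and the periods are shortened from the commit's 5s/180s so the growth of the delays is quick to observe.

package main

import (
	"context"
	"fmt"
	"time"

	"k8s.io/client-go/util/flowcontrol"
)

// listAndWatch is a hypothetical stand-in for creating the gRPC stream;
// it always fails so that the loop below keeps retrying.
func listAndWatch() error {
	return fmt.Errorf("creating gRPC stream failed")
}

func main() {
	const (
		retryPeriod    = 1 * time.Second  // the commit uses 5s
		maxRetryPeriod = 10 * time.Second // the commit uses 180s
		backoffFactor  = 2.0
	)

	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
	defer cancel()

	backOff := flowcontrol.NewBackOffWithJitter(retryPeriod, maxRetryPeriod, backoffFactor)
	backOffID := "retry"

	for ctx.Err() == nil {
		if err := listAndWatch(); err != nil {
			// Get returns the current delay for this ID (zero before the first Next).
			delay := backOff.Get(backOffID)
			fmt.Printf("retrying after %v: %v\n", delay, err)
			select {
			case <-time.After(delay):
				// Record the attempt so the next Get returns a longer,
				// jittered delay, capped at maxRetryPeriod.
				backOff.Next(backOffID, time.Now())
			case <-ctx.Done():
			}
		}
	}
}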