From f7239e409595e510ee8528f18c41b8ff7aa435b4 Mon Sep 17 00:00:00 2001 From: Davanum Srinivas Date: Sun, 25 Jun 2023 16:23:44 -0400 Subject: [PATCH] Better back off delays and connection timeout to talk to containerd Set up params similar to what we do in cadvisor: https://github.com/google/cadvisor/blob/e9068e32730b0061b8c5ad34193ebf9b12c1eda7/container/containerd/client.go#L59-L61 Signed-off-by: Davanum Srinivas --- pkg/kubelet/cri/remote/remote_image.go | 16 +++++++++++----- pkg/kubelet/cri/remote/remote_runtime.go | 22 +++++++++++++++++----- 2 files changed, 28 insertions(+), 10 deletions(-) diff --git a/pkg/kubelet/cri/remote/remote_image.go b/pkg/kubelet/cri/remote/remote_image.go index b444a28173f..a1afc80b8a2 100644 --- a/pkg/kubelet/cri/remote/remote_image.go +++ b/pkg/kubelet/cri/remote/remote_image.go @@ -56,11 +56,7 @@ func NewRemoteImageService(endpoint string, connectionTimeout time.Duration, tp ctx, cancel := context.WithTimeout(context.Background(), connectionTimeout) defer cancel() - dialOpts := []grpc.DialOption{ - grpc.WithConnectParams(grpc.ConnectParams{ - Backoff: backoff.DefaultConfig, - }), - } + var dialOpts []grpc.DialOption dialOpts = append(dialOpts, grpc.WithTransportCredentials(insecure.NewCredentials()), grpc.WithContextDialer(dialer), @@ -77,6 +73,16 @@ func NewRemoteImageService(endpoint string, connectionTimeout time.Duration, tp grpc.WithStreamInterceptor(otelgrpc.StreamClientInterceptor(tracingOpts...))) } + connParams := grpc.ConnectParams{ + Backoff: backoff.DefaultConfig, + } + connParams.MinConnectTimeout = minConnectionTimeout + connParams.Backoff.BaseDelay = baseBackoffDelay + connParams.Backoff.MaxDelay = maxBackoffDelay + dialOpts = append(dialOpts, + grpc.WithConnectParams(connParams), + ) + conn, err := grpc.DialContext(ctx, addr, dialOpts...) if err != nil { klog.ErrorS(err, "Connect remote image service failed", "address", addr) diff --git a/pkg/kubelet/cri/remote/remote_runtime.go b/pkg/kubelet/cri/remote/remote_runtime.go index 1df3affbbea..22b1f34b224 100644 --- a/pkg/kubelet/cri/remote/remote_runtime.go +++ b/pkg/kubelet/cri/remote/remote_runtime.go @@ -56,6 +56,11 @@ type remoteRuntimeService struct { const ( // How frequently to report identical errors identicalErrorDelay = 1 * time.Minute + + // connection parameters + maxBackoffDelay = 3 * time.Second + baseBackoffDelay = 100 * time.Millisecond + minConnectionTimeout = 5 * time.Second ) // CRIVersion is the type for valid Container Runtime Interface (CRI) API @@ -80,11 +85,7 @@ func NewRemoteRuntimeService(endpoint string, connectionTimeout time.Duration, t ctx, cancel := context.WithTimeout(context.Background(), connectionTimeout) defer cancel() - dialOpts := []grpc.DialOption{ - grpc.WithConnectParams(grpc.ConnectParams{ - Backoff: backoff.DefaultConfig, - }), - } + var dialOpts []grpc.DialOption dialOpts = append(dialOpts, grpc.WithTransportCredentials(insecure.NewCredentials()), grpc.WithContextDialer(dialer), @@ -100,6 +101,17 @@ func NewRemoteRuntimeService(endpoint string, connectionTimeout time.Duration, t grpc.WithUnaryInterceptor(otelgrpc.UnaryClientInterceptor(tracingOpts...)), grpc.WithStreamInterceptor(otelgrpc.StreamClientInterceptor(tracingOpts...))) } + + connParams := grpc.ConnectParams{ + Backoff: backoff.DefaultConfig, + } + connParams.MinConnectTimeout = minConnectionTimeout + connParams.Backoff.BaseDelay = baseBackoffDelay + connParams.Backoff.MaxDelay = maxBackoffDelay + dialOpts = append(dialOpts, + grpc.WithConnectParams(connParams), + ) + conn, err := grpc.DialContext(ctx, addr, dialOpts...) if err != nil { klog.ErrorS(err, "Connect remote runtime failed", "address", addr)