From 0bc9633414126c3296e26b39848bd26f3b7e838c Mon Sep 17 00:00:00 2001 From: Danny Canter Date: Thu, 30 Nov 2023 19:15:50 -0800 Subject: [PATCH] runtime/v2: net.Dial gRPC shim sockets before trying grpc This is mostly to workaround an issue with gRPC based shims after containerd restart. If a shim dies while containerd is also down/restarting, on reboot grpc.DialContext with our current set of DialOptions will make us wait for 100 seconds per shim even if the socket no longer exists or has no listener. Signed-off-by: Danny Canter --- runtime/v2/shim.go | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/runtime/v2/shim.go b/runtime/v2/shim.go index 1f1e50ccc..fa70d3c5e 100644 --- a/runtime/v2/shim.go +++ b/runtime/v2/shim.go @@ -22,12 +22,14 @@ import ( "errors" "fmt" "io" + "net" "os" "path/filepath" "strings" "time" "github.com/containerd/containerd/v2/pkg/atomicfile" + "github.com/containerd/containerd/v2/pkg/dialer" "github.com/containerd/ttrpc" "google.golang.org/grpc" "google.golang.org/grpc/connectivity" @@ -39,7 +41,6 @@ import ( "github.com/containerd/containerd/v2/errdefs" "github.com/containerd/containerd/v2/events/exchange" "github.com/containerd/containerd/v2/identifiers" - "github.com/containerd/containerd/v2/pkg/dialer" "github.com/containerd/containerd/v2/pkg/timeout" "github.com/containerd/containerd/v2/protobuf" ptypes "github.com/containerd/containerd/v2/protobuf/types" @@ -275,7 +276,7 @@ func makeConnection(ctx context.Context, id string, params client.BootstrapParam grpc.WithTransportCredentials(insecure.NewCredentials()), grpc.WithBlock(), } - return grpcDialContext(ctx, dialer.DialAddress(params.Address), onClose, gopts...) + return grpcDialContext(ctx, params.Address, onClose, gopts...) default: return nil, fmt.Errorf("unexpected protocol: %q", params.Protocol) } @@ -286,10 +287,29 @@ func makeConnection(ctx context.Context, id string, params client.BootstrapParam // a callback run when the connection is severed or explicitly closed. func grpcDialContext( ctx context.Context, - target string, + address string, onClose func(), gopts ...grpc.DialOption, ) (*grpcConn, error) { + // If grpc.WithBlock is specified in gopts this causes the connection to block waiting for + // a connection regardless of if the socket exists or has a listener when Dial begins. This + // specific behavior of WithBlock is mostly undesirable for shims, as if the socket isn't + // there when we go to load/connect there's likely an issue. However, getting rid of WithBlock is + // also undesirable as we don't want the background connection behavior, we want to ensure + // a connection before moving on. To bring this in line with the ttrpc connection behavior + // lets do an initial dial to ensure the shims socket is actually available. stat wouldn't suffice + // here as if the shim exited unexpectedly its socket may still be on the filesystem, but it'd return + // ECONNREFUSED which grpc.DialContext will happily trudge along through for the full timeout. + // + // This is especially helpful on restart of containerd as if the shim died while containerd + // was down, we end up waiting the full timeout. + conn, err := net.DialTimeout("unix", address, time.Second*10) + if err != nil { + return nil, err + } + conn.Close() + + target := dialer.DialAddress(address) client, err := grpc.DialContext(ctx, target, gopts...) if err != nil { return nil, fmt.Errorf("failed to create GRPC connection: %w", err)