runtime/v2: net.Dial gRPC shim sockets before trying grpc
This is mostly to workaround an issue with gRPC based shims after containerd restart. If a shim dies while containerd is also down/restarting, on reboot grpc.DialContext with our current set of DialOptions will make us wait for 100 seconds per shim even if the socket no longer exists or has no listener. Signed-off-by: Danny Canter <danny@dcantah.dev>
This commit is contained in:
		| @@ -22,12 +22,14 @@ import ( | ||||
| 	"errors" | ||||
| 	"fmt" | ||||
| 	"io" | ||||
| 	"net" | ||||
| 	"os" | ||||
| 	"path/filepath" | ||||
| 	"strings" | ||||
| 	"time" | ||||
|  | ||||
| 	"github.com/containerd/containerd/v2/pkg/atomicfile" | ||||
| 	"github.com/containerd/containerd/v2/pkg/dialer" | ||||
| 	"github.com/containerd/ttrpc" | ||||
| 	"google.golang.org/grpc" | ||||
| 	"google.golang.org/grpc/connectivity" | ||||
| @@ -39,7 +41,6 @@ import ( | ||||
| 	"github.com/containerd/containerd/v2/errdefs" | ||||
| 	"github.com/containerd/containerd/v2/events/exchange" | ||||
| 	"github.com/containerd/containerd/v2/identifiers" | ||||
| 	"github.com/containerd/containerd/v2/pkg/dialer" | ||||
| 	"github.com/containerd/containerd/v2/pkg/timeout" | ||||
| 	"github.com/containerd/containerd/v2/protobuf" | ||||
| 	ptypes "github.com/containerd/containerd/v2/protobuf/types" | ||||
| @@ -275,7 +276,7 @@ func makeConnection(ctx context.Context, id string, params client.BootstrapParam | ||||
| 			grpc.WithTransportCredentials(insecure.NewCredentials()), | ||||
| 			grpc.WithBlock(), | ||||
| 		} | ||||
| 		return grpcDialContext(ctx, dialer.DialAddress(params.Address), onClose, gopts...) | ||||
| 		return grpcDialContext(ctx, params.Address, onClose, gopts...) | ||||
| 	default: | ||||
| 		return nil, fmt.Errorf("unexpected protocol: %q", params.Protocol) | ||||
| 	} | ||||
| @@ -286,10 +287,29 @@ func makeConnection(ctx context.Context, id string, params client.BootstrapParam | ||||
| // a callback run when the connection is severed or explicitly closed. | ||||
| func grpcDialContext( | ||||
| 	ctx context.Context, | ||||
| 	target string, | ||||
| 	address string, | ||||
| 	onClose func(), | ||||
| 	gopts ...grpc.DialOption, | ||||
| ) (*grpcConn, error) { | ||||
| 	// If grpc.WithBlock is specified in gopts this causes the connection to block waiting for | ||||
| 	// a connection regardless of if the socket exists or has a listener when Dial begins. This | ||||
| 	// specific behavior of WithBlock is mostly undesirable for shims, as if the socket isn't | ||||
| 	// there when we go to load/connect there's likely an issue. However, getting rid of WithBlock is | ||||
| 	// also undesirable as we don't want the background connection behavior, we want to ensure | ||||
| 	// a connection before moving on. To bring this in line with the ttrpc connection behavior | ||||
| 	// lets do an initial dial to ensure the shims socket is actually available. stat wouldn't suffice | ||||
| 	// here as if the shim exited unexpectedly its socket may still be on the filesystem, but it'd return | ||||
| 	// ECONNREFUSED which grpc.DialContext will happily trudge along through for the full timeout. | ||||
| 	// | ||||
| 	// This is especially helpful on restart of containerd as if the shim died while containerd | ||||
| 	// was down, we end up waiting the full timeout. | ||||
| 	conn, err := net.DialTimeout("unix", address, time.Second*10) | ||||
| 	if err != nil { | ||||
| 		return nil, err | ||||
| 	} | ||||
| 	conn.Close() | ||||
|  | ||||
| 	target := dialer.DialAddress(address) | ||||
| 	client, err := grpc.DialContext(ctx, target, gopts...) | ||||
| 	if err != nil { | ||||
| 		return nil, fmt.Errorf("failed to create GRPC connection: %w", err) | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 Danny Canter
					Danny Canter