runtime/v2: net.Dial gRPC shim sockets before trying grpc
This is mostly to work around an issue with gRPC-based shims after a containerd restart. If a shim dies while containerd is also down/restarting, on reboot grpc.DialContext with our current set of DialOptions will make us wait for 100 seconds per shim even if the socket no longer exists or has no listener. Signed-off-by: Danny Canter <danny@dcantah.dev>
This commit is contained in:
parent
8459273f80
commit
0bc9633414
@ -22,12 +22,14 @@ import (
|
|||||||
"errors"
|
"errors"
|
||||||
"fmt"
|
"fmt"
|
||||||
"io"
|
"io"
|
||||||
|
"net"
|
||||||
"os"
|
"os"
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
"strings"
|
"strings"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
"github.com/containerd/containerd/v2/pkg/atomicfile"
|
"github.com/containerd/containerd/v2/pkg/atomicfile"
|
||||||
|
"github.com/containerd/containerd/v2/pkg/dialer"
|
||||||
"github.com/containerd/ttrpc"
|
"github.com/containerd/ttrpc"
|
||||||
"google.golang.org/grpc"
|
"google.golang.org/grpc"
|
||||||
"google.golang.org/grpc/connectivity"
|
"google.golang.org/grpc/connectivity"
|
||||||
@ -39,7 +41,6 @@ import (
|
|||||||
"github.com/containerd/containerd/v2/errdefs"
|
"github.com/containerd/containerd/v2/errdefs"
|
||||||
"github.com/containerd/containerd/v2/events/exchange"
|
"github.com/containerd/containerd/v2/events/exchange"
|
||||||
"github.com/containerd/containerd/v2/identifiers"
|
"github.com/containerd/containerd/v2/identifiers"
|
||||||
"github.com/containerd/containerd/v2/pkg/dialer"
|
|
||||||
"github.com/containerd/containerd/v2/pkg/timeout"
|
"github.com/containerd/containerd/v2/pkg/timeout"
|
||||||
"github.com/containerd/containerd/v2/protobuf"
|
"github.com/containerd/containerd/v2/protobuf"
|
||||||
ptypes "github.com/containerd/containerd/v2/protobuf/types"
|
ptypes "github.com/containerd/containerd/v2/protobuf/types"
|
||||||
@ -275,7 +276,7 @@ func makeConnection(ctx context.Context, id string, params client.BootstrapParam
|
|||||||
grpc.WithTransportCredentials(insecure.NewCredentials()),
|
grpc.WithTransportCredentials(insecure.NewCredentials()),
|
||||||
grpc.WithBlock(),
|
grpc.WithBlock(),
|
||||||
}
|
}
|
||||||
return grpcDialContext(ctx, dialer.DialAddress(params.Address), onClose, gopts...)
|
return grpcDialContext(ctx, params.Address, onClose, gopts...)
|
||||||
default:
|
default:
|
||||||
return nil, fmt.Errorf("unexpected protocol: %q", params.Protocol)
|
return nil, fmt.Errorf("unexpected protocol: %q", params.Protocol)
|
||||||
}
|
}
|
||||||
@ -286,10 +287,29 @@ func makeConnection(ctx context.Context, id string, params client.BootstrapParam
|
|||||||
// a callback run when the connection is severed or explicitly closed.
|
// a callback run when the connection is severed or explicitly closed.
|
||||||
func grpcDialContext(
|
func grpcDialContext(
|
||||||
ctx context.Context,
|
ctx context.Context,
|
||||||
target string,
|
address string,
|
||||||
onClose func(),
|
onClose func(),
|
||||||
gopts ...grpc.DialOption,
|
gopts ...grpc.DialOption,
|
||||||
) (*grpcConn, error) {
|
) (*grpcConn, error) {
|
||||||
|
// If grpc.WithBlock is specified in gopts this causes the connection to block waiting for
|
||||||
|
// a connection regardless of if the socket exists or has a listener when Dial begins. This
|
||||||
|
// specific behavior of WithBlock is mostly undesirable for shims, as if the socket isn't
|
||||||
|
// there when we go to load/connect there's likely an issue. However, getting rid of WithBlock is
|
||||||
|
// also undesirable as we don't want the background connection behavior, we want to ensure
|
||||||
|
// a connection before moving on. To bring this in line with the ttrpc connection behavior
|
||||||
|
// let's do an initial dial to ensure the shim's socket is actually available. stat wouldn't suffice
|
||||||
|
// here as if the shim exited unexpectedly its socket may still be on the filesystem, but it'd return
|
||||||
|
// ECONNREFUSED which grpc.DialContext will happily trudge along through for the full timeout.
|
||||||
|
//
|
||||||
|
// This is especially helpful on restart of containerd as if the shim died while containerd
|
||||||
|
// was down, we end up waiting the full timeout.
|
||||||
|
conn, err := net.DialTimeout("unix", address, time.Second*10)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
conn.Close()
|
||||||
|
|
||||||
|
target := dialer.DialAddress(address)
|
||||||
client, err := grpc.DialContext(ctx, target, gopts...)
|
client, err := grpc.DialContext(ctx, target, gopts...)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, fmt.Errorf("failed to create GRPC connection: %w", err)
|
return nil, fmt.Errorf("failed to create GRPC connection: %w", err)
|
||||||
|
Loading…
Reference in New Issue
Block a user