diff --git a/core/sandbox/proxy/controller.go b/core/sandbox/proxy/controller.go index 92078671a..070e20e3d 100644 --- a/core/sandbox/proxy/controller.go +++ b/core/sandbox/proxy/controller.go @@ -18,6 +18,7 @@ package proxy import ( "context" + "time" api "github.com/containerd/containerd/api/services/sandbox/v1" "github.com/containerd/containerd/api/types" @@ -119,9 +120,31 @@ func (s *remoteSandboxController) Shutdown(ctx context.Context, sandboxID string } func (s *remoteSandboxController) Wait(ctx context.Context, sandboxID string) (sandbox.ExitStatus, error) { - resp, err := s.client.Wait(ctx, &api.ControllerWaitRequest{SandboxID: sandboxID}) - if err != nil { - return sandbox.ExitStatus{}, errdefs.FromGRPC(err) + // For remote sandbox controllers, the controller process may restart, + // we have to retry if the error indicates that it is the grpc disconnection. + var ( + resp *api.ControllerWaitResponse + err error + retryInterval time.Duration = 128 + ) + for { + resp, err = s.client.Wait(ctx, &api.ControllerWaitRequest{SandboxID: sandboxID}) + if err != nil { + grpcErr := errdefs.FromGRPC(err) + if !errdefs.IsUnavailable(grpcErr) { + return sandbox.ExitStatus{}, grpcErr + } + select { + case <-time.After(retryInterval * time.Millisecond): + if retryInterval < 4096 { + retryInterval = retryInterval << 1 + } + continue + case <-ctx.Done(): + return sandbox.ExitStatus{}, grpcErr + } + } + break } return sandbox.ExitStatus{ diff --git a/internal/cri/server/events.go b/internal/cri/server/events.go index 823ade682..16ce146f8 100644 --- a/internal/cri/server/events.go +++ b/internal/cri/server/events.go @@ -53,7 +53,7 @@ func (c *criService) startSandboxExitMonitor(ctx context.Context, id string, exi case exitRes := <-exitCh: exitStatus, exitedAt, err := exitRes.Result() if err != nil { - log.L.WithError(err).Errorf("failed to get task exit status for %q", id) + log.L.WithError(err).Errorf("failed to get sandbox status for %q", id) exitStatus = unknownExitCode exitedAt = time.Now() }