sandbox: retry Wait calls to the remote sandbox controller

The remote sandbox controller may restart, so the Wait call should be
retried if it fails with a gRPC disconnection error.

Signed-off-by: Abel Feng <fshb1988@gmail.com>
This commit is contained in:
Abel Feng 2024-05-10 10:18:42 +08:00
parent b168147ca8
commit 58be881890
2 changed files with 27 additions and 4 deletions

View File

@ -18,6 +18,7 @@ package proxy
import ( import (
"context" "context"
"time"
api "github.com/containerd/containerd/api/services/sandbox/v1" api "github.com/containerd/containerd/api/services/sandbox/v1"
"github.com/containerd/containerd/api/types" "github.com/containerd/containerd/api/types"
@ -119,9 +120,31 @@ func (s *remoteSandboxController) Shutdown(ctx context.Context, sandboxID string
} }
func (s *remoteSandboxController) Wait(ctx context.Context, sandboxID string) (sandbox.ExitStatus, error) { func (s *remoteSandboxController) Wait(ctx context.Context, sandboxID string) (sandbox.ExitStatus, error) {
resp, err := s.client.Wait(ctx, &api.ControllerWaitRequest{SandboxID: sandboxID}) // For remote sandbox controllers, the controller process may restart,
// we have to retry if the error indicates that it is the grpc disconnection.
var (
resp *api.ControllerWaitResponse
err error
retryInterval time.Duration = 128
)
for {
resp, err = s.client.Wait(ctx, &api.ControllerWaitRequest{SandboxID: sandboxID})
if err != nil { if err != nil {
return sandbox.ExitStatus{}, errdefs.FromGRPC(err) grpcErr := errdefs.FromGRPC(err)
if !errdefs.IsUnavailable(grpcErr) {
return sandbox.ExitStatus{}, grpcErr
}
select {
case <-time.After(retryInterval * time.Millisecond):
if retryInterval < 4096 {
retryInterval = retryInterval << 1
}
continue
case <-ctx.Done():
return sandbox.ExitStatus{}, grpcErr
}
}
break
} }
return sandbox.ExitStatus{ return sandbox.ExitStatus{

View File

@ -53,7 +53,7 @@ func (c *criService) startSandboxExitMonitor(ctx context.Context, id string, exi
case exitRes := <-exitCh: case exitRes := <-exitCh:
exitStatus, exitedAt, err := exitRes.Result() exitStatus, exitedAt, err := exitRes.Result()
if err != nil { if err != nil {
log.L.WithError(err).Errorf("failed to get task exit status for %q", id) log.L.WithError(err).Errorf("failed to get sandbox status for %q", id)
exitStatus = unknownExitCode exitStatus = unknownExitCode
exitedAt = time.Now() exitedAt = time.Now()
} }