From 729c97cf39121fc949fcf877defe7c68103c1dfc Mon Sep 17 00:00:00 2001 From: Aditya Ramani Date: Mon, 18 Sep 2023 11:52:17 -0700 Subject: [PATCH] Handle unexpected shim kill events When a shim process is unexpectedly killed in a way that was not initiated through containerd - containerd reports the pod as not ready but the containers as running. This results in kubelet repeatedly sending container kill requests that fail since containerd cannot connect to the shim. Changes: - In the container exit handler, treat `err: Unavailable` as if the container has already exited out - When attempting to get a connection to the shim, if the controller isn't available assume that the shim has been killed (needs to be done since we have a separate exit handler that cleans up the reference to the shim controller - before kubelet has the chance to call StopPodSandbox) Signed-off-by: Aditya Ramani --- pkg/cri/sbserver/events.go | 2 +- plugins/sandbox/controller.go | 8 +++++++- services/sandbox/controller_service.go | 12 ++++++++---- services/tasks/local.go | 2 +- 4 files changed, 17 insertions(+), 7 deletions(-) diff --git a/pkg/cri/sbserver/events.go b/pkg/cri/sbserver/events.go index 5d8006e70..e316bb90d 100644 --- a/pkg/cri/sbserver/events.go +++ b/pkg/cri/sbserver/events.go @@ -393,7 +393,7 @@ func handleContainerExit(ctx context.Context, e *eventtypes.TaskExit, cntr conta }, ) if err != nil { - if !errdefs.IsNotFound(err) { + if !errdefs.IsNotFound(err) && !errdefs.IsUnavailable(err) { return fmt.Errorf("failed to load task for container: %w", err) } } else { diff --git a/plugins/sandbox/controller.go b/plugins/sandbox/controller.go index d295706b7..f105d3e5a 100644 --- a/plugins/sandbox/controller.go +++ b/plugins/sandbox/controller.go @@ -262,6 +262,12 @@ func (c *controllerLocal) Wait(ctx context.Context, sandboxID string) (sandbox.E func (c *controllerLocal) Status(ctx context.Context, sandboxID string, verbose bool) (sandbox.ControllerStatus, error) { svc, err := c.getSandbox(ctx, sandboxID) + if errdefs.IsNotFound(err) { + return sandbox.ControllerStatus{ + SandboxID: sandboxID, + ExitedAt: time.Now(), + }, nil + } if err != nil { return sandbox.ControllerStatus{}, err } @@ -301,7 +307,7 @@ func (c *controllerLocal) Metrics(ctx context.Context, sandboxID string) (*types func (c *controllerLocal) getSandbox(ctx context.Context, id string) (runtimeAPI.TTRPCSandboxService, error) { shim, err := c.shims.Get(ctx, id) if err != nil { - return nil, errdefs.ErrNotFound + return nil, err } return sandbox.NewClient(shim.Client()) diff --git a/services/sandbox/controller_service.go b/services/sandbox/controller_service.go index 64c7159cd..ef1762172 100644 --- a/services/sandbox/controller_service.go +++ b/services/sandbox/controller_service.go @@ -144,6 +144,13 @@ func (s *controllerService) Status(ctx context.Context, req *api.ControllerStatu if err != nil { return &api.ControllerStatusResponse{}, errdefs.ToGRPC(err) } + extra := &anypb.Any{} + if cstatus.Extra != nil { + extra = &anypb.Any{ + TypeUrl: cstatus.Extra.GetTypeUrl(), + Value: cstatus.Extra.GetValue(), + } + } return &api.ControllerStatusResponse{ SandboxID: cstatus.SandboxID, Pid: cstatus.Pid, @@ -151,10 +158,7 @@ func (s *controllerService) Status(ctx context.Context, req *api.ControllerStatu Info: cstatus.Info, CreatedAt: protobuf.ToTimestamp(cstatus.CreatedAt), ExitedAt: protobuf.ToTimestamp(cstatus.ExitedAt), - Extra: &anypb.Any{ - TypeUrl: cstatus.Extra.GetTypeUrl(), - Value: cstatus.Extra.GetValue(), - }, + Extra: extra, }, nil } diff --git a/services/tasks/local.go b/services/tasks/local.go index a7892a824..3f73bc4cd 100644 --- a/services/tasks/local.go +++ b/services/tasks/local.go @@ -311,7 +311,7 @@ func getProcessState(ctx context.Context, p runtime.Process) (*task.Process, err state, err := p.State(ctx) if err != nil { - if errdefs.IsNotFound(err) { + if errdefs.IsNotFound(err) || errdefs.IsUnavailable(err) { return nil, err } log.G(ctx).WithError(err).Errorf("get state for %s", p.ID())