Handle unexpected shim kill events

When a shim process is killed unexpectedly, in a way that was not initiated through containerd, containerd reports the pod as not ready but its containers as still running. As a result, kubelet repeatedly sends container kill requests, which fail because containerd cannot connect to the shim.

Changes:

- In the container exit handler, treat `err: Unavailable` as meaning that the container has already exited
- When attempting to get a connection to the shim, if the controller isn't available, assume that the shim has been killed. This is needed because a separate exit handler cleans up the reference to the shim controller before kubelet has the chance to call StopPodSandbox.

Signed-off-by: Aditya Ramani <a_ramani@apple.com>
This commit is contained in:
Aditya Ramani 2023-09-18 11:52:17 -07:00
parent 82df7d5208
commit 729c97cf39
4 changed files with 17 additions and 7 deletions

View File

@ -393,7 +393,7 @@ func handleContainerExit(ctx context.Context, e *eventtypes.TaskExit, cntr conta
},
)
if err != nil {
if !errdefs.IsNotFound(err) {
if !errdefs.IsNotFound(err) && !errdefs.IsUnavailable(err) {
return fmt.Errorf("failed to load task for container: %w", err)
}
} else {

View File

@ -262,6 +262,12 @@ func (c *controllerLocal) Wait(ctx context.Context, sandboxID string) (sandbox.E
func (c *controllerLocal) Status(ctx context.Context, sandboxID string, verbose bool) (sandbox.ControllerStatus, error) {
svc, err := c.getSandbox(ctx, sandboxID)
if errdefs.IsNotFound(err) {
return sandbox.ControllerStatus{
SandboxID: sandboxID,
ExitedAt: time.Now(),
}, nil
}
if err != nil {
return sandbox.ControllerStatus{}, err
}
@ -301,7 +307,7 @@ func (c *controllerLocal) Metrics(ctx context.Context, sandboxID string) (*types
func (c *controllerLocal) getSandbox(ctx context.Context, id string) (runtimeAPI.TTRPCSandboxService, error) {
shim, err := c.shims.Get(ctx, id)
if err != nil {
return nil, errdefs.ErrNotFound
return nil, err
}
return sandbox.NewClient(shim.Client())

View File

@ -144,6 +144,13 @@ func (s *controllerService) Status(ctx context.Context, req *api.ControllerStatu
if err != nil {
return &api.ControllerStatusResponse{}, errdefs.ToGRPC(err)
}
extra := &anypb.Any{}
if cstatus.Extra != nil {
extra = &anypb.Any{
TypeUrl: cstatus.Extra.GetTypeUrl(),
Value: cstatus.Extra.GetValue(),
}
}
return &api.ControllerStatusResponse{
SandboxID: cstatus.SandboxID,
Pid: cstatus.Pid,
@ -151,10 +158,7 @@ func (s *controllerService) Status(ctx context.Context, req *api.ControllerStatu
Info: cstatus.Info,
CreatedAt: protobuf.ToTimestamp(cstatus.CreatedAt),
ExitedAt: protobuf.ToTimestamp(cstatus.ExitedAt),
Extra: &anypb.Any{
TypeUrl: cstatus.Extra.GetTypeUrl(),
Value: cstatus.Extra.GetValue(),
},
Extra: extra,
}, nil
}

View File

@ -311,7 +311,7 @@ func getProcessState(ctx context.Context, p runtime.Process) (*task.Process, err
state, err := p.State(ctx)
if err != nil {
if errdefs.IsNotFound(err) {
if errdefs.IsNotFound(err) || errdefs.IsUnavailable(err) {
return nil, err
}
log.G(ctx).WithError(err).Errorf("get state for %s", p.ID())