Handle unexpected shim kill events
When a shim process is unexpectedly killed in a way that was not initiated through containerd, containerd reports the pod as not ready but its containers as still running. As a result, kubelet repeatedly sends container kill requests, which fail because containerd cannot connect to the shim.

Changes:
- In the container exit handler, treat `err: Unavailable` the same as if the container had already exited.
- When attempting to get a connection to the shim, if the controller isn't available, assume that the shim has been killed. (This is needed because a separate exit handler cleans up the reference to the shim controller before kubelet has the chance to call StopPodSandbox.)

Signed-off-by: Aditya Ramani <a_ramani@apple.com>
This commit is contained in:
parent
82df7d5208
commit
729c97cf39
@ -393,7 +393,7 @@ func handleContainerExit(ctx context.Context, e *eventtypes.TaskExit, cntr conta
|
||||
},
|
||||
)
|
||||
if err != nil {
|
||||
if !errdefs.IsNotFound(err) {
|
||||
if !errdefs.IsNotFound(err) && !errdefs.IsUnavailable(err) {
|
||||
return fmt.Errorf("failed to load task for container: %w", err)
|
||||
}
|
||||
} else {
|
||||
|
@ -262,6 +262,12 @@ func (c *controllerLocal) Wait(ctx context.Context, sandboxID string) (sandbox.E
|
||||
|
||||
func (c *controllerLocal) Status(ctx context.Context, sandboxID string, verbose bool) (sandbox.ControllerStatus, error) {
|
||||
svc, err := c.getSandbox(ctx, sandboxID)
|
||||
if errdefs.IsNotFound(err) {
|
||||
return sandbox.ControllerStatus{
|
||||
SandboxID: sandboxID,
|
||||
ExitedAt: time.Now(),
|
||||
}, nil
|
||||
}
|
||||
if err != nil {
|
||||
return sandbox.ControllerStatus{}, err
|
||||
}
|
||||
@ -301,7 +307,7 @@ func (c *controllerLocal) Metrics(ctx context.Context, sandboxID string) (*types
|
||||
func (c *controllerLocal) getSandbox(ctx context.Context, id string) (runtimeAPI.TTRPCSandboxService, error) {
|
||||
shim, err := c.shims.Get(ctx, id)
|
||||
if err != nil {
|
||||
return nil, errdefs.ErrNotFound
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return sandbox.NewClient(shim.Client())
|
||||
|
@ -144,6 +144,13 @@ func (s *controllerService) Status(ctx context.Context, req *api.ControllerStatu
|
||||
if err != nil {
|
||||
return &api.ControllerStatusResponse{}, errdefs.ToGRPC(err)
|
||||
}
|
||||
extra := &anypb.Any{}
|
||||
if cstatus.Extra != nil {
|
||||
extra = &anypb.Any{
|
||||
TypeUrl: cstatus.Extra.GetTypeUrl(),
|
||||
Value: cstatus.Extra.GetValue(),
|
||||
}
|
||||
}
|
||||
return &api.ControllerStatusResponse{
|
||||
SandboxID: cstatus.SandboxID,
|
||||
Pid: cstatus.Pid,
|
||||
@ -151,10 +158,7 @@ func (s *controllerService) Status(ctx context.Context, req *api.ControllerStatu
|
||||
Info: cstatus.Info,
|
||||
CreatedAt: protobuf.ToTimestamp(cstatus.CreatedAt),
|
||||
ExitedAt: protobuf.ToTimestamp(cstatus.ExitedAt),
|
||||
Extra: &anypb.Any{
|
||||
TypeUrl: cstatus.Extra.GetTypeUrl(),
|
||||
Value: cstatus.Extra.GetValue(),
|
||||
},
|
||||
Extra: extra,
|
||||
}, nil
|
||||
}
|
||||
|
||||
|
@ -311,7 +311,7 @@ func getProcessState(ctx context.Context, p runtime.Process) (*task.Process, err
|
||||
|
||||
state, err := p.State(ctx)
|
||||
if err != nil {
|
||||
if errdefs.IsNotFound(err) {
|
||||
if errdefs.IsNotFound(err) || errdefs.IsUnavailable(err) {
|
||||
return nil, err
|
||||
}
|
||||
log.G(ctx).WithError(err).Errorf("get state for %s", p.ID())
|
||||
|
Loading…
Reference in New Issue
Block a user