From 83af4dad871a6cb510e973ab08ab930facd9a4d2 Mon Sep 17 00:00:00 2001 From: Lantao Liu Date: Tue, 5 Feb 2019 00:16:08 -0800 Subject: [PATCH] Support unknown state for sandbox and container Signed-off-by: Lantao Liu --- pkg/server/container_remove.go | 7 +- pkg/server/container_status.go | 9 + pkg/server/container_stop.go | 59 +++++- pkg/server/events.go | 11 +- pkg/server/helpers.go | 55 ++++++ pkg/server/restart.go | 300 ++++++++++++++---------------- pkg/server/sandbox_list_test.go | 4 + pkg/server/sandbox_remove.go | 5 +- pkg/server/sandbox_status.go | 8 + pkg/server/sandbox_status_test.go | 4 + pkg/server/sandbox_stop.go | 50 ++++- pkg/store/container/container.go | 3 +- 12 files changed, 344 insertions(+), 171 deletions(-) diff --git a/pkg/server/container_remove.go b/pkg/server/container_remove.go index e05d79618..fe37a8ac7 100644 --- a/pkg/server/container_remove.go +++ b/pkg/server/container_remove.go @@ -98,9 +98,12 @@ func (c *criService) RemoveContainer(ctx context.Context, r *runtime.RemoveConta // container will not be started or removed again. func setContainerRemoving(container containerstore.Container) error { return container.Status.Update(func(status containerstore.Status) (containerstore.Status, error) { - // Do not remove container if it's still running. + // Do not remove container if it's still running or unknown. if status.State() == runtime.ContainerState_CONTAINER_RUNNING { - return status, errors.New("container is still running") + return status, errors.New("container is still running, to stop first") + } + if status.State() == runtime.ContainerState_CONTAINER_UNKNOWN { + return status, errors.New("container state is unknown, to stop first") } if status.Removing { return status, errors.New("container is already in removing state") diff --git a/pkg/server/container_status.go b/pkg/server/container_status.go index e4200acab..1e43bd42f 100644 --- a/pkg/server/container_status.go +++ b/pkg/server/container_status.go @@ -60,6 +60,15 @@ func (c *criService) ContainerStatus(ctx context.Context, r *runtime.ContainerSt } } status := toCRIContainerStatus(container, spec, imageRef) + if status.GetCreatedAt() == 0 { + // CRI doesn't allow CreatedAt == 0. + info, err := container.Container.Info(ctx) + if err != nil { + return nil, errors.Wrapf(err, "failed to get CreatedAt in %q state", status.State) + } + status.CreatedAt = info.CreatedAt.UnixNano() + } + info, err := toCRIContainerInfo(ctx, container, r.GetVerbose()) if err != nil { return nil, errors.Wrap(err, "failed to get verbose container info") diff --git a/pkg/server/container_stop.go b/pkg/server/container_stop.go index ddee397ae..f6d380d65 100644 --- a/pkg/server/container_stop.go +++ b/pkg/server/container_stop.go @@ -19,6 +19,8 @@ package server import ( "time" + "github.com/containerd/containerd" + eventtypes "github.com/containerd/containerd/api/events" "github.com/containerd/containerd/errdefs" "github.com/docker/docker/pkg/signal" "github.com/pkg/errors" @@ -60,8 +62,9 @@ func (c *criService) stopContainer(ctx context.Context, container containerstore // Return without error if container is not running. This makes sure that // stop only takes real action after the container is started. state := container.Status.Get().State() - if state != runtime.ContainerState_CONTAINER_RUNNING { - logrus.Infof("Container to stop %q is not running, current state %q", + if state != runtime.ContainerState_CONTAINER_RUNNING && + state != runtime.ContainerState_CONTAINER_UNKNOWN { + logrus.Infof("Container to stop %q must be in running or unknown state, current state %q", id, criContainerStateToString(state)) return nil } @@ -69,9 +72,39 @@ func (c *criService) stopContainer(ctx context.Context, container containerstore task, err := container.Container.Task(ctx, nil) if err != nil { if !errdefs.IsNotFound(err) { - return errors.Wrapf(err, "failed to stop container, task not found for container %q", id) + return errors.Wrapf(err, "failed to get task for container %q", id) + } + // Don't return for unknown state, some cleanup needs to be done. + if state != runtime.ContainerState_CONTAINER_UNKNOWN { + return nil + } + // Task is an interface, explicitly set it to nil just in case. + task = nil + } + + // Handle unknown state. + if state == runtime.ContainerState_CONTAINER_UNKNOWN { + status, err := getTaskStatus(ctx, task) + if err != nil { + return errors.Wrapf(err, "failed to get task status for %q", id) + } + switch status.Status { + case containerd.Running, containerd.Created: + // The task is still running, continue stopping the task. + case containerd.Stopped: + // The task has exited. If the task exited after containerd + // started, the event monitor will receive its exit event; if it + // exited before containerd started, the event monitor will never + // receive its exit event. + // However, we can't tell that because the task state was not + // successfully loaded during containerd start (container is + // in UNKNOWN state). + // So always do cleanup here, just in case that we've missed the + // exit event. + return cleanupUnknownContainer(ctx, id, status, container) + default: + return errors.Wrapf(err, "unsupported task status %q", status.Status) } - return nil } // We only need to kill the task. The event handler will Delete the @@ -141,3 +174,21 @@ func (c *criService) waitContainerStop(ctx context.Context, container containers return nil } } + +// cleanupUnknownContainer cleanup stopped container in unknown state. +func cleanupUnknownContainer(ctx context.Context, id string, status containerd.Status, + cntr containerstore.Container) error { + // Reuse handleContainerExit to do the cleanup. + // NOTE(random-liu): If the task did exit after containerd started, both + // the event monitor and the cleanup function would update the container + // state. The final container state will be whatever being updated first. + // There is no way to completely avoid this race condition, and for best + // effort unknown state container cleanup, this seems acceptable. + return handleContainerExit(ctx, &eventtypes.TaskExit{ + ContainerID: id, + ID: id, + Pid: 0, + ExitStatus: status.ExitStatus, + ExitedAt: status.ExitTime, + }, cntr) +} diff --git a/pkg/server/events.go b/pkg/server/events.go index 2c510a953..bc384f754 100644 --- a/pkg/server/events.go +++ b/pkg/server/events.go @@ -260,7 +260,16 @@ func handleContainerExit(ctx context.Context, e *eventtypes.TaskExit, cntr conta // Attach container IO so that `Delete` could cleanup the stream properly. task, err := cntr.Container.Task(ctx, func(*containerdio.FIFOSet) (containerdio.IO, error) { - return cntr.IO, nil + // We can't directly return cntr.IO here, because + // even if cntr.IO is nil, the cio.IO interface + // is not. + // See https://tour.golang.org/methods/12: + // Note that an interface value that holds a nil + // concrete value is itself non-nil. + if cntr.IO != nil { + return cntr.IO, nil + } + return nil, nil }, ) if err != nil { diff --git a/pkg/server/helpers.go b/pkg/server/helpers.go index 0727ba177..b44be5a2a 100644 --- a/pkg/server/helpers.go +++ b/pkg/server/helpers.go @@ -25,9 +25,12 @@ import ( "regexp" "strconv" "strings" + "time" "github.com/BurntSushi/toml" + "github.com/containerd/containerd" "github.com/containerd/containerd/containers" + "github.com/containerd/containerd/errdefs" "github.com/containerd/containerd/runtime/linux/runctypes" runcoptions "github.com/containerd/containerd/runtime/v2/runc/options" "github.com/containerd/typeurl" @@ -44,7 +47,9 @@ import ( runtimeoptions "github.com/containerd/cri/pkg/api/runtimeoptions/v1" criconfig "github.com/containerd/cri/pkg/config" "github.com/containerd/cri/pkg/store" + containerstore "github.com/containerd/cri/pkg/store/container" imagestore "github.com/containerd/cri/pkg/store/image" + sandboxstore "github.com/containerd/cri/pkg/store/sandbox" "github.com/containerd/cri/pkg/util" ) @@ -524,3 +529,53 @@ func restrictOOMScoreAdj(preferredOOMScoreAdj int) (int, error) { } return preferredOOMScoreAdj, nil } + +const ( + // unknownExitCode is the exit code when exit reason is unknown. + unknownExitCode = 255 + // unknownExitReason is the exit reason when exit reason is unknown. + unknownExitReason = "Unknown" +) + +// unknownContainerStatus returns the default container status when its status is unknown. +func unknownContainerStatus() containerstore.Status { + return containerstore.Status{ + CreatedAt: 0, + StartedAt: 0, + FinishedAt: 0, + ExitCode: unknownExitCode, + Reason: unknownExitReason, + } +} + +// unknownSandboxStatus returns the default sandbox status when its status is unknown. +func unknownSandboxStatus() sandboxstore.Status { + return sandboxstore.Status{ + State: sandboxstore.StateUnknown, + } +} + +// unknownExitStatus generates containerd.Status for container exited with unknown exit code. +func unknownExitStatus() containerd.Status { + return containerd.Status{ + Status: containerd.Stopped, + ExitStatus: unknownExitCode, + ExitTime: time.Now(), + } +} + +// getTaskStatus returns status for a given task. It returns unknown exit status if +// the task is nil or not found. +func getTaskStatus(ctx context.Context, task containerd.Task) (containerd.Status, error) { + if task == nil { + return unknownExitStatus(), nil + } + status, err := task.Status(ctx) + if err != nil { + if !errdefs.IsNotFound(err) { + return containerd.Status{}, err + } + return unknownExitStatus(), nil + } + return status, nil +} diff --git a/pkg/server/restart.go b/pkg/server/restart.go index b45fe31a6..7d1c1c8f7 100644 --- a/pkg/server/restart.go +++ b/pkg/server/restart.go @@ -179,140 +179,130 @@ func (c *criService) loadContainer(ctx context.Context, cntr containerd.Containe status = unknownContainerStatus() } - // Load up-to-date status from containerd. var containerIO *cio.ContainerIO - t, err := cntr.Task(ctx, func(fifos *containerdio.FIFOSet) (_ containerdio.IO, err error) { - stdoutWC, stderrWC, err := c.createContainerLoggers(meta.LogPath, meta.Config.GetTty()) - if err != nil { - return nil, err - } - defer func() { + err = func() error { + // Load up-to-date status from containerd. + t, err := cntr.Task(ctx, func(fifos *containerdio.FIFOSet) (_ containerdio.IO, err error) { + stdoutWC, stderrWC, err := c.createContainerLoggers(meta.LogPath, meta.Config.GetTty()) if err != nil { - if stdoutWC != nil { - stdoutWC.Close() - } - if stderrWC != nil { - stderrWC.Close() - } + return nil, err } - }() - containerIO, err = cio.NewContainerIO(id, - cio.WithFIFOs(fifos), - ) - if err != nil { - return nil, err - } - containerIO.AddOutput("log", stdoutWC, stderrWC) - containerIO.Pipe() - return containerIO, nil - }) - if err != nil && !errdefs.IsNotFound(err) { - return container, errors.Wrap(err, "failed to load task") - } - var s containerd.Status - var notFound bool - if errdefs.IsNotFound(err) { - // Task is not found. - notFound = true - } else { - // Task is found. Get task status. - s, err = t.Status(ctx) - if err != nil { - // It's still possible that task is deleted during this window. - if !errdefs.IsNotFound(err) { - return container, errors.Wrap(err, "failed to get task status") - } - notFound = true - } - } - if notFound { - // Task is not created or has been deleted, use the checkpointed status - // to generate container status. - switch status.State() { - case runtime.ContainerState_CONTAINER_CREATED: - // NOTE: Another possibility is that we've tried to start the container, but - // containerd got restarted during that. In that case, we still - // treat the container as `CREATED`. + defer func() { + if err != nil { + if stdoutWC != nil { + stdoutWC.Close() + } + if stderrWC != nil { + stderrWC.Close() + } + } + }() containerIO, err = cio.NewContainerIO(id, - cio.WithNewFIFOs(volatileContainerDir, meta.Config.GetTty(), meta.Config.GetStdin()), + cio.WithFIFOs(fifos), ) if err != nil { - return container, errors.Wrap(err, "failed to create container io") + return nil, err } - case runtime.ContainerState_CONTAINER_RUNNING: - // Container was in running state, but its task has been deleted, - // set unknown exited state. Container io is not needed in this case. - status.FinishedAt = time.Now().UnixNano() - status.ExitCode = unknownExitCode - status.Reason = unknownExitReason - default: - // Container is in exited/unknown state, return the status as it is. + containerIO.AddOutput("log", stdoutWC, stderrWC) + containerIO.Pipe() + return containerIO, nil + }) + if err != nil && !errdefs.IsNotFound(err) { + return errors.Wrap(err, "failed to load task") } - } else { - // Task status is found. Update container status based on the up-to-date task status. - switch s.Status { - case containerd.Created: - // Task has been created, but not started yet. This could only happen if containerd - // gets restarted during container start. - // Container must be in `CREATED` state. - if _, err := t.Delete(ctx, containerd.WithProcessKill); err != nil && !errdefs.IsNotFound(err) { - return container, errors.Wrap(err, "failed to delete task") + var s containerd.Status + var notFound bool + if errdefs.IsNotFound(err) { + // Task is not found. + notFound = true + } else { + // Task is found. Get task status. + s, err = t.Status(ctx) + if err != nil { + // It's still possible that task is deleted during this window. + if !errdefs.IsNotFound(err) { + return errors.Wrap(err, "failed to get task status") + } + notFound = true } - if status.State() != runtime.ContainerState_CONTAINER_CREATED { - return container, errors.Errorf("unexpected container state for created task: %q", status.State()) - } - case containerd.Running: - // Task is running. Container must be in `RUNNING` state, based on our assuption that - // "task should not be started when containerd is down". + } + if notFound { + // Task is not created or has been deleted, use the checkpointed status + // to generate container status. switch status.State() { - case runtime.ContainerState_CONTAINER_EXITED: - return container, errors.Errorf("unexpected container state for running task: %q", status.State()) + case runtime.ContainerState_CONTAINER_CREATED: + // NOTE: Another possibility is that we've tried to start the container, but + // containerd got restarted during that. In that case, we still + // treat the container as `CREATED`. + containerIO, err = cio.NewContainerIO(id, + cio.WithNewFIFOs(volatileContainerDir, meta.Config.GetTty(), meta.Config.GetStdin()), + ) + if err != nil { + return errors.Wrap(err, "failed to create container io") + } case runtime.ContainerState_CONTAINER_RUNNING: + // Container was in running state, but its task has been deleted, + // set unknown exited state. Container io is not needed in this case. + status.FinishedAt = time.Now().UnixNano() + status.ExitCode = unknownExitCode + status.Reason = unknownExitReason default: - // This may happen if containerd gets restarted after task is started, but - // before status is checkpointed. - status.StartedAt = time.Now().UnixNano() - status.Pid = t.Pid() + // Container is in exited/unknown state, return the status as it is. } - case containerd.Stopped: - // Task is stopped. Updata status and delete the task. - if _, err := t.Delete(ctx, containerd.WithProcessKill); err != nil && !errdefs.IsNotFound(err) { - return container, errors.Wrap(err, "failed to delete task") + } else { + // Task status is found. Update container status based on the up-to-date task status. + switch s.Status { + case containerd.Created: + // Task has been created, but not started yet. This could only happen if containerd + // gets restarted during container start. + // Container must be in `CREATED` state. + if _, err := t.Delete(ctx, containerd.WithProcessKill); err != nil && !errdefs.IsNotFound(err) { + return errors.Wrap(err, "failed to delete task") + } + if status.State() != runtime.ContainerState_CONTAINER_CREATED { + return errors.Errorf("unexpected container state for created task: %q", status.State()) + } + case containerd.Running: + // Task is running. Container must be in `RUNNING` state, based on our assuption that + // "task should not be started when containerd is down". + switch status.State() { + case runtime.ContainerState_CONTAINER_EXITED: + return errors.Errorf("unexpected container state for running task: %q", status.State()) + case runtime.ContainerState_CONTAINER_RUNNING: + default: + // This may happen if containerd gets restarted after task is started, but + // before status is checkpointed. + status.StartedAt = time.Now().UnixNano() + status.Pid = t.Pid() + } + case containerd.Stopped: + // Task is stopped. Updata status and delete the task. + if _, err := t.Delete(ctx, containerd.WithProcessKill); err != nil && !errdefs.IsNotFound(err) { + return errors.Wrap(err, "failed to delete task") + } + status.FinishedAt = s.ExitTime.UnixNano() + status.ExitCode = int32(s.ExitStatus) + default: + return errors.Errorf("unexpected task status %q", s.Status) } - status.FinishedAt = s.ExitTime.UnixNano() - status.ExitCode = int32(s.ExitStatus) - default: - return container, errors.Errorf("unexpected task status %q", s.Status) } + return nil + }() + if err != nil { + logrus.WithError(err).Errorf("Failed to load container status for %q", id) + status = unknownContainerStatus() } opts := []containerstore.Opts{ containerstore.WithStatus(status, containerDir), containerstore.WithContainer(cntr), } + // containerIO could be nil for container in unknown state. if containerIO != nil { opts = append(opts, containerstore.WithContainerIO(containerIO)) } return containerstore.NewContainer(*meta, opts...) } -const ( - // unknownExitCode is the exit code when exit reason is unknown. - unknownExitCode = 255 - // unknownExitReason is the exit reason when exit reason is unknown. - unknownExitReason = "Unknown" -) - -// unknownContainerStatus returns the default container status when its status is unknown. -func unknownContainerStatus() containerstore.Status { - return containerstore.Status{ - CreatedAt: 0, - StartedAt: 0, - FinishedAt: 0, - ExitCode: unknownExitCode, - Reason: unknownExitReason, - } -} - // loadSandbox loads sandbox from containerd. func loadSandbox(ctx context.Context, cntr containerd.Container) (sandboxstore.Sandbox, error) { ctx, cancel := context.WithTimeout(ctx, loadContainerTimeout) @@ -333,61 +323,59 @@ func loadSandbox(ctx context.Context, cntr containerd.Container) (sandboxstore.S } meta := data.(*sandboxstore.Metadata) - // Load sandbox created timestamp. - info, err := cntr.Info(ctx) - if err != nil { - return sandbox, errors.Wrap(err, "failed to get sandbox container info") - } - createdAt := info.CreatedAt - - // Load sandbox status. - t, err := cntr.Task(ctx, nil) - if err != nil && !errdefs.IsNotFound(err) { - return sandbox, errors.Wrap(err, "failed to load task") - } - var s containerd.Status - var notFound bool - if errdefs.IsNotFound(err) { - // Task is not found. - notFound = true - } else { - // Task is found. Get task status. - s, err = t.Status(ctx) + s, err := func() (sandboxstore.Status, error) { + status := unknownSandboxStatus() + // Load sandbox created timestamp. + info, err := cntr.Info(ctx) if err != nil { - // It's still possible that task is deleted during this window. - if !errdefs.IsNotFound(err) { - return sandbox, errors.Wrap(err, "failed to get task status") - } + return status, errors.Wrap(err, "failed to get sandbox container info") + } + status.CreatedAt = info.CreatedAt + + // Load sandbox state. + t, err := cntr.Task(ctx, nil) + if err != nil && !errdefs.IsNotFound(err) { + return status, errors.Wrap(err, "failed to load task") + } + var taskStatus containerd.Status + var notFound bool + if errdefs.IsNotFound(err) { + // Task is not found. notFound = true - } - } - var state sandboxstore.State - var pid uint32 - if notFound { - // Task does not exist, set sandbox state as NOTREADY. - state = sandboxstore.StateNotReady - } else { - if s.Status == containerd.Running { - // Task is running, set sandbox state as READY. - state = sandboxstore.StateReady - pid = t.Pid() } else { - // Task is not running. Delete the task and set sandbox state as NOTREADY. - if _, err := t.Delete(ctx, containerd.WithProcessKill); err != nil && !errdefs.IsNotFound(err) { - return sandbox, errors.Wrap(err, "failed to delete task") + // Task is found. Get task status. + taskStatus, err = t.Status(ctx) + if err != nil { + // It's still possible that task is deleted during this window. + if !errdefs.IsNotFound(err) { + return status, errors.Wrap(err, "failed to get task status") + } + notFound = true } - state = sandboxstore.StateNotReady } + if notFound { + // Task does not exist, set sandbox state as NOTREADY. + status.State = sandboxstore.StateNotReady + } else { + if taskStatus.Status == containerd.Running { + // Task is running, set sandbox state as READY. + status.State = sandboxstore.StateReady + status.Pid = t.Pid() + } else { + // Task is not running. Delete the task and set sandbox state as NOTREADY. + if _, err := t.Delete(ctx, containerd.WithProcessKill); err != nil && !errdefs.IsNotFound(err) { + return status, errors.Wrap(err, "failed to delete task") + } + status.State = sandboxstore.StateNotReady + } + } + return status, nil + }() + if err != nil { + logrus.WithError(err).Errorf("Failed to load sandbox status for %q", cntr.ID()) } - sandbox = sandboxstore.NewSandbox( - *meta, - sandboxstore.Status{ - Pid: pid, - CreatedAt: createdAt, - State: state, - }, - ) + sandbox = sandboxstore.NewSandbox(*meta, s) sandbox.Container = cntr // Load network namespace. diff --git a/pkg/server/sandbox_list_test.go b/pkg/server/sandbox_list_test.go index ff0126129..2fda7a7c9 100644 --- a/pkg/server/sandbox_list_test.go +++ b/pkg/server/sandbox_list_test.go @@ -63,6 +63,10 @@ func TestToCRISandbox(t *testing.T) { state: sandboxstore.StateNotReady, expectedState: runtime.PodSandboxState_SANDBOX_NOTREADY, }, + "sandbox state unknown": { + state: sandboxstore.StateUnknown, + expectedState: runtime.PodSandboxState_SANDBOX_NOTREADY, + }, } { status := sandboxstore.Status{ CreatedAt: createdAt, diff --git a/pkg/server/sandbox_remove.go b/pkg/server/sandbox_remove.go index 29ebd44ed..b4ad03fe1 100644 --- a/pkg/server/sandbox_remove.go +++ b/pkg/server/sandbox_remove.go @@ -46,8 +46,9 @@ func (c *criService) RemovePodSandbox(ctx context.Context, r *runtime.RemovePodS // Use the full sandbox id. id := sandbox.ID - // Return error if sandbox container is still running. - if sandbox.Status.Get().State == sandboxstore.StateReady { + // Return error if sandbox container is still running or unknown. + state := sandbox.Status.Get().State + if state == sandboxstore.StateReady || state == sandboxstore.StateUnknown { return nil, errors.Errorf("sandbox container %q is not fully stopped", id) } diff --git a/pkg/server/sandbox_status.go b/pkg/server/sandbox_status.go index fb8f3d928..6e0e01501 100644 --- a/pkg/server/sandbox_status.go +++ b/pkg/server/sandbox_status.go @@ -42,6 +42,14 @@ func (c *criService) PodSandboxStatus(ctx context.Context, r *runtime.PodSandbox return nil, errors.Wrap(err, "failed to get sandbox ip") } status := toCRISandboxStatus(sandbox.Metadata, sandbox.Status.Get(), ip) + if status.GetCreatedAt() == 0 { + // CRI doesn't allow CreatedAt == 0. + info, err := sandbox.Container.Info(ctx) + if err != nil { + return nil, errors.Wrapf(err, "failed to get CreatedAt for sandbox container in %q state", status.State) + } + status.CreatedAt = info.CreatedAt.UnixNano() + } if !r.GetVerbose() { return &runtime.PodSandboxStatusResponse{Status: status}, nil } diff --git a/pkg/server/sandbox_status_test.go b/pkg/server/sandbox_status_test.go index 23453d32b..3ef23db0e 100644 --- a/pkg/server/sandbox_status_test.go +++ b/pkg/server/sandbox_status_test.go @@ -86,6 +86,10 @@ func TestPodSandboxStatus(t *testing.T) { state: sandboxstore.StateNotReady, expectedState: runtime.PodSandboxState_SANDBOX_NOTREADY, }, + "sandbox state unknown": { + state: sandboxstore.StateUnknown, + expectedState: runtime.PodSandboxState_SANDBOX_NOTREADY, + }, } { t.Logf("TestCase: %s", desc) status := sandboxstore.Status{ diff --git a/pkg/server/sandbox_stop.go b/pkg/server/sandbox_stop.go index 5fc78e8ab..69e2038b1 100644 --- a/pkg/server/sandbox_stop.go +++ b/pkg/server/sandbox_stop.go @@ -19,6 +19,8 @@ package server import ( "time" + "github.com/containerd/containerd" + eventtypes "github.com/containerd/containerd/api/events" "github.com/containerd/containerd/errdefs" cni "github.com/containerd/go-cni" "github.com/pkg/errors" @@ -60,10 +62,11 @@ func (c *criService) StopPodSandbox(ctx context.Context, r *runtime.StopPodSandb return nil, errors.Wrap(err, "failed to unmount sandbox files") } - // Only stop sandbox container when it's running. - if sandbox.Status.Get().State == sandboxstore.StateReady { + // Only stop sandbox container when it's running or unknown. + state := sandbox.Status.Get().State + if state == sandboxstore.StateReady || state == sandboxstore.StateUnknown { if err := c.stopSandboxContainer(ctx, sandbox); err != nil { - return nil, errors.Wrapf(err, "failed to stop sandbox container %q", id) + return nil, errors.Wrapf(err, "failed to stop sandbox container %q in %q state", id, state) } } @@ -95,12 +98,36 @@ func (c *criService) StopPodSandbox(ctx context.Context, r *runtime.StopPodSandb // the event monitor handles the `TaskExit` event. func (c *criService) stopSandboxContainer(ctx context.Context, sandbox sandboxstore.Sandbox) error { container := sandbox.Container + state := sandbox.Status.Get().State task, err := container.Task(ctx, nil) if err != nil { - if errdefs.IsNotFound(err) { + if !errdefs.IsNotFound(err) { + return errors.Wrap(err, "failed to get sandbox container") + } + // Don't return for unknown state, some cleanup needs to be done. + if state != sandboxstore.StateUnknown { return nil } - return errors.Wrap(err, "failed to get sandbox container") + // Task is an interface, explicitly set it to nil just in case. + task = nil + } + + // Handle unknown state. + // The cleanup logic is the same with container unknown state. + if state == sandboxstore.StateUnknown { + status, err := getTaskStatus(ctx, task) + if err != nil { + return errors.Wrapf(err, "failed to get task status for %q", sandbox.ID) + } + switch status.Status { + case containerd.Running, containerd.Created: + // The task is still running, continue stopping the task. + case containerd.Stopped: + // The task has exited, explicitly cleanup. + return cleanupUnknownSandbox(ctx, sandbox.ID, status, sandbox) + default: + return errors.Wrapf(err, "unsupported task status %q", status.Status) + } } // Kill the sandbox container. @@ -137,3 +164,16 @@ func (c *criService) teardownPod(id string, path string, config *runtime.PodSand cni.WithLabels(labels), cni.WithCapabilityPortMap(toCNIPortMappings(config.GetPortMappings()))) } + +// cleanupUnknownSandbox cleanup stopped sandbox in unknown state. +func cleanupUnknownSandbox(ctx context.Context, id string, status containerd.Status, + sandbox sandboxstore.Sandbox) error { + // Reuse handleSandboxExit to do the cleanup. + return handleSandboxExit(ctx, &eventtypes.TaskExit{ + ContainerID: id, + ID: id, + Pid: 0, + ExitStatus: status.ExitStatus, + ExitedAt: status.ExitTime, + }, sandbox) +} diff --git a/pkg/store/container/container.go b/pkg/store/container/container.go index fb6cad133..07a57b187 100644 --- a/pkg/store/container/container.go +++ b/pkg/store/container/container.go @@ -36,7 +36,8 @@ type Container struct { Status StatusStorage // Container is the containerd container client. Container containerd.Container - // Container IO + // Container IO. + // IO could only be nil when the container is in unknown state. IO *cio.ContainerIO // StopCh is used to propagate the stop information of the container. *store.StopCh