Use wait instead of TaskExit.

Signed-off-by: Lantao Liu <lantaol@google.com>
Author: Lantao Liu
Date:   2019-04-11 16:36:44 -07:00
Parent: ebb0928057
Commit: d1f9611cb0
18 changed files with 390 additions and 266 deletions
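At a high level, this change replaces the global subscription to /tasks/exit events with a per-task exit monitor: each start/load path calls task.Wait before task.Start and hands the resulting exit channel to eventMonitor.startExitMonitor. The condensed sketch below is a hypothetical helper, not part of the diff; it borrows names such as criService, ctrdutil, containerstore, and startExitMonitor from the changed files and assumes the package's existing imports.

func (c *criService) monitorAfterStart(ctx context.Context, cntr containerstore.Container, task containerd.Task) error {
	// wait is a long running background request, no timeout needed; issuing it
	// before Start guarantees the exit status cannot be missed.
	exitCh, err := task.Wait(ctrdutil.NamespacedContext())
	if err != nil {
		return errors.Wrap(err, "failed to wait for containerd task")
	}
	if err := task.Start(ctx); err != nil {
		return errors.Wrapf(err, "failed to start containerd task %q", cntr.ID)
	}
	// Persist Pid and StartedAt first, then hand the exit channel to the event
	// monitor, so the TaskExit handler always observes the updated state.
	if err := cntr.Status.UpdateSync(func(status containerstore.Status) (containerstore.Status, error) {
		status.Pid = task.Pid()
		status.StartedAt = time.Now().UnixNano()
		return status, nil
	}); err != nil {
		return errors.Wrapf(err, "failed to update container %q state", cntr.ID)
	}
	c.eventMonitor.startExitMonitor(context.Background(), cntr.ID, task.Pid(), exitCh)
	return nil
}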


@ -105,6 +105,9 @@ func setContainerRemoving(container containerstore.Container) error {
if status.State() == runtime.ContainerState_CONTAINER_UNKNOWN { if status.State() == runtime.ContainerState_CONTAINER_UNKNOWN {
return status, errors.New("container state is unknown, to stop first") return status, errors.New("container state is unknown, to stop first")
} }
if status.Starting {
return status, errors.New("container is in starting state, can't be removed")
}
if status.Removing { if status.Removing {
return status, errors.New("container is already in removing state") return status, errors.New("container is already in removing state")
} }


@ -40,6 +40,13 @@ func TestSetContainerRemoving(t *testing.T) {
}, },
expectErr: true, expectErr: true,
}, },
"should return error when container is in starting state": {
status: containerstore.Status{
CreatedAt: time.Now().UnixNano(),
Starting: true,
},
expectErr: true,
},
"should return error when container is in removing state": { "should return error when container is in removing state": {
status: containerstore.Status{ status: containerstore.Status{
CreatedAt: time.Now().UnixNano(), CreatedAt: time.Now().UnixNano(),
@ -71,6 +78,8 @@ func TestSetContainerRemoving(t *testing.T) {
} else { } else {
assert.NoError(t, err) assert.NoError(t, err)
assert.True(t, container.Status.Get().Removing, "removing should be set") assert.True(t, container.Status.Get().Removing, "removing should be set")
assert.NoError(t, resetContainerRemoving(container))
assert.False(t, container.Status.Get().Removing, "removing should be reset")
} }
} }
} }


@ -38,64 +38,48 @@ import (
// StartContainer starts the container. // StartContainer starts the container.
func (c *criService) StartContainer(ctx context.Context, r *runtime.StartContainerRequest) (retRes *runtime.StartContainerResponse, retErr error) { func (c *criService) StartContainer(ctx context.Context, r *runtime.StartContainerRequest) (retRes *runtime.StartContainerResponse, retErr error) {
container, err := c.containerStore.Get(r.GetContainerId()) cntr, err := c.containerStore.Get(r.GetContainerId())
if err != nil { if err != nil {
return nil, errors.Wrapf(err, "an error occurred when try to find container %q", r.GetContainerId()) return nil, errors.Wrapf(err, "an error occurred when try to find container %q", r.GetContainerId())
} }
var startErr error
// update container status in one transaction to avoid race with event monitor.
if err := container.Status.UpdateSync(func(status containerstore.Status) (containerstore.Status, error) {
// Always apply status change no matter startContainer fails or not. Because startContainer
// may change container state no matter it fails or succeeds.
startErr = c.startContainer(ctx, container, &status)
return status, nil
}); startErr != nil {
return nil, startErr
} else if err != nil {
return nil, errors.Wrapf(err, "failed to update container %q metadata", container.ID)
}
return &runtime.StartContainerResponse{}, nil
}
// startContainer actually starts the container. The function needs to be run in one transaction. Any updates
// to the status passed in will be applied no matter the function returns error or not.
func (c *criService) startContainer(ctx context.Context,
cntr containerstore.Container,
status *containerstore.Status) (retErr error) {
id := cntr.ID id := cntr.ID
meta := cntr.Metadata meta := cntr.Metadata
container := cntr.Container container := cntr.Container
config := meta.Config config := meta.Config
// Return error if container is not in created state. // Set starting state to prevent other start/remove operations against this container
if status.State() != runtime.ContainerState_CONTAINER_CREATED { // while it's being started.
return errors.Errorf("container %q is in %s state", id, criContainerStateToString(status.State())) if err := setContainerStarting(cntr); err != nil {
return nil, errors.Wrapf(err, "failed to set starting state for container %q", id)
} }
// Do not start the container when there is a removal in progress.
if status.Removing {
return errors.Errorf("container %q is in removing state", id)
}
defer func() { defer func() {
if retErr != nil { if retErr != nil {
// Set container to exited if fail to start. // Set container to exited if fail to start.
status.Pid = 0 if err := cntr.Status.UpdateSync(func(status containerstore.Status) (containerstore.Status, error) {
status.FinishedAt = time.Now().UnixNano() status.Pid = 0
status.ExitCode = errorStartExitCode status.FinishedAt = time.Now().UnixNano()
status.Reason = errorStartReason status.ExitCode = errorStartExitCode
status.Message = retErr.Error() status.Reason = errorStartReason
status.Message = retErr.Error()
return status, nil
}); err != nil {
logrus.WithError(err).Errorf("failed to set start failure state for container %q", id)
}
}
if err := resetContainerStarting(cntr); err != nil {
logrus.WithError(err).Errorf("failed to reset starting state for container %q", id)
} }
}() }()
// Get sandbox config from sandbox store. // Get sandbox config from sandbox store.
sandbox, err := c.sandboxStore.Get(meta.SandboxID) sandbox, err := c.sandboxStore.Get(meta.SandboxID)
if err != nil { if err != nil {
return errors.Wrapf(err, "sandbox %q not found", meta.SandboxID) return nil, errors.Wrapf(err, "sandbox %q not found", meta.SandboxID)
} }
sandboxID := meta.SandboxID sandboxID := meta.SandboxID
if sandbox.Status.Get().State != sandboxstore.StateReady { if sandbox.Status.Get().State != sandboxstore.StateReady {
return errors.Errorf("sandbox container %q is not running", sandboxID) return nil, errors.Errorf("sandbox container %q is not running", sandboxID)
} }
ioCreation := func(id string) (_ containerdio.IO, err error) { ioCreation := func(id string) (_ containerdio.IO, err error) {
@ -110,7 +94,7 @@ func (c *criService) startContainer(ctx context.Context,
ctrInfo, err := container.Info(ctx) ctrInfo, err := container.Info(ctx)
if err != nil { if err != nil {
return errors.Wrap(err, "failed to get container info") return nil, errors.Wrap(err, "failed to get container info")
} }
var taskOpts []containerd.NewTaskOpts var taskOpts []containerd.NewTaskOpts
@ -120,7 +104,7 @@ func (c *criService) startContainer(ctx context.Context,
} }
task, err := container.NewTask(ctx, ioCreation, taskOpts...) task, err := container.NewTask(ctx, ioCreation, taskOpts...)
if err != nil { if err != nil {
return errors.Wrap(err, "failed to create containerd task") return nil, errors.Wrap(err, "failed to create containerd task")
} }
defer func() { defer func() {
if retErr != nil { if retErr != nil {
@ -133,15 +117,61 @@ func (c *criService) startContainer(ctx context.Context,
} }
}() }()
// wait is a long running background request, no timeout needed.
exitCh, err := task.Wait(ctrdutil.NamespacedContext())
if err != nil {
return nil, errors.Wrap(err, "failed to wait for containerd task")
}
// Start containerd task. // Start containerd task.
if err := task.Start(ctx); err != nil { if err := task.Start(ctx); err != nil {
return errors.Wrapf(err, "failed to start containerd task %q", id) return nil, errors.Wrapf(err, "failed to start containerd task %q", id)
} }
// Update container start timestamp. // Update container start timestamp.
status.Pid = task.Pid() if err := cntr.Status.UpdateSync(func(status containerstore.Status) (containerstore.Status, error) {
status.StartedAt = time.Now().UnixNano() status.Pid = task.Pid()
return nil status.StartedAt = time.Now().UnixNano()
return status, nil
}); err != nil {
return nil, errors.Wrapf(err, "failed to update container %q state", id)
}
// start the monitor after updating container state, this ensures that
// event monitor receives the TaskExit event and update container state
// after this.
c.eventMonitor.startExitMonitor(context.Background(), id, task.Pid(), exitCh)
return &runtime.StartContainerResponse{}, nil
}
// setContainerStarting sets the container into starting state. In starting state, the
// container will not be removed or started again.
func setContainerStarting(container containerstore.Container) error {
return container.Status.Update(func(status containerstore.Status) (containerstore.Status, error) {
// Return error if container is not in created state.
if status.State() != runtime.ContainerState_CONTAINER_CREATED {
return status, errors.Errorf("container is in %s state", criContainerStateToString(status.State()))
}
// Do not start the container when there is a removal in progress.
if status.Removing {
return status, errors.New("container is in removing state, can't be started")
}
if status.Starting {
return status, errors.New("container is already in starting state")
}
status.Starting = true
return status, nil
})
}
// resetContainerStarting resets the container starting state on start failure. So
// that we could remove the container later.
func resetContainerStarting(container containerstore.Container) error {
return container.Status.Update(func(status containerstore.Status) (containerstore.Status, error) {
status.Starting = false
return status, nil
})
} }
// createContainerLoggers creates container loggers and return write closer for stdout and stderr. // createContainerLoggers creates container loggers and return write closer for stdout and stderr.


@ -0,0 +1,98 @@
/*
Copyright 2019 The containerd Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package server
import (
"testing"
"time"
"github.com/stretchr/testify/assert"
containerstore "github.com/containerd/cri/pkg/store/container"
)
// TestSetContainerStarting tests setContainerStarting sets starting
// state correctly.
func TestSetContainerStarting(t *testing.T) {
testID := "test-id"
for desc, test := range map[string]struct {
status containerstore.Status
expectErr bool
}{
"should not return error when container is in created state": {
status: containerstore.Status{
CreatedAt: time.Now().UnixNano(),
},
expectErr: false,
},
"should return error when container is in running state": {
status: containerstore.Status{
CreatedAt: time.Now().UnixNano(),
StartedAt: time.Now().UnixNano(),
},
expectErr: true,
},
"should return error when container is in exited state": {
status: containerstore.Status{
CreatedAt: time.Now().UnixNano(),
StartedAt: time.Now().UnixNano(),
FinishedAt: time.Now().UnixNano(),
},
expectErr: true,
},
"should return error when container is in unknown state": {
status: containerstore.Status{
CreatedAt: 0,
StartedAt: 0,
FinishedAt: 0,
},
expectErr: true,
},
"should return error when container is in starting state": {
status: containerstore.Status{
CreatedAt: time.Now().UnixNano(),
Starting: true,
},
expectErr: true,
},
"should return error when container is in removing state": {
status: containerstore.Status{
CreatedAt: time.Now().UnixNano(),
Removing: true,
},
expectErr: true,
},
} {
t.Logf("TestCase %q", desc)
container, err := containerstore.NewContainer(
containerstore.Metadata{ID: testID},
containerstore.WithFakeStatus(test.status),
)
assert.NoError(t, err)
err = setContainerStarting(container)
if test.expectErr {
assert.Error(t, err)
assert.Equal(t, test.status, container.Status.Get(), "metadata should not be updated")
} else {
assert.NoError(t, err)
assert.True(t, container.Status.Get().Starting, "starting should be set")
assert.NoError(t, resetContainerStarting(container))
assert.False(t, container.Status.Get().Starting, "starting should be reset")
}
}
}


@ -28,6 +28,7 @@ import (
"golang.org/x/sys/unix" "golang.org/x/sys/unix"
runtime "k8s.io/kubernetes/pkg/kubelet/apis/cri/runtime/v1alpha2" runtime "k8s.io/kubernetes/pkg/kubelet/apis/cri/runtime/v1alpha2"
ctrdutil "github.com/containerd/cri/pkg/containerd/util"
"github.com/containerd/cri/pkg/store" "github.com/containerd/cri/pkg/store"
containerstore "github.com/containerd/cri/pkg/store/container" containerstore "github.com/containerd/cri/pkg/store/container"
) )
@ -74,36 +75,34 @@ func (c *criService) stopContainer(ctx context.Context, container containerstore
return errors.Wrapf(err, "failed to get task for container %q", id) return errors.Wrapf(err, "failed to get task for container %q", id)
} }
// Don't return for unknown state, some cleanup needs to be done. // Don't return for unknown state, some cleanup needs to be done.
if state != runtime.ContainerState_CONTAINER_UNKNOWN { if state == runtime.ContainerState_CONTAINER_UNKNOWN {
return nil return cleanupUnknownContainer(ctx, id, container)
} }
// Task is an interface, explicitly set it to nil just in case. return nil
task = nil
} }
// Handle unknown state. // Handle unknown state.
if state == runtime.ContainerState_CONTAINER_UNKNOWN { if state == runtime.ContainerState_CONTAINER_UNKNOWN {
status, err := getTaskStatus(ctx, task) // Start an exit handler for containers in unknown state.
waitCtx, waitCancel := context.WithCancel(ctrdutil.NamespacedContext())
defer waitCancel()
exitCh, err := task.Wait(waitCtx)
if err != nil { if err != nil {
return errors.Wrapf(err, "failed to get task status for %q", id) if !errdefs.IsNotFound(err) {
} return errors.Wrapf(err, "failed to wait for task for %q", id)
switch status.Status { }
case containerd.Running, containerd.Created: return cleanupUnknownContainer(ctx, id, container)
// The task is still running, continue stopping the task.
case containerd.Stopped:
// The task has exited. If the task exited after containerd
// started, the event monitor will receive its exit event; if it
// exited before containerd started, the event monitor will never
// receive its exit event.
// However, we can't tell that because the task state was not
// successfully loaded during containerd start (container is
// in UNKNOWN state).
// So always do cleanup here, just in case that we've missed the
// exit event.
return cleanupUnknownContainer(ctx, id, status, container)
default:
return errors.Wrapf(err, "unsupported task status %q", status.Status)
} }
exitCtx, exitCancel := context.WithCancel(context.Background())
stopCh := c.eventMonitor.startExitMonitor(exitCtx, id, task.Pid(), exitCh)
defer func() {
exitCancel()
// This ensures that exit monitor is stopped before
// `Wait` is cancelled, so no exit event is generated
// because of the `Wait` cancellation.
<-stopCh
}()
} }
// We only need to kill the task. The event handler will Delete the // We only need to kill the task. The event handler will Delete the
@ -176,19 +175,13 @@ func (c *criService) waitContainerStop(ctx context.Context, container containers
} }
// cleanupUnknownContainer cleanup stopped container in unknown state. // cleanupUnknownContainer cleanup stopped container in unknown state.
func cleanupUnknownContainer(ctx context.Context, id string, status containerd.Status, func cleanupUnknownContainer(ctx context.Context, id string, cntr containerstore.Container) error {
cntr containerstore.Container) error {
// Reuse handleContainerExit to do the cleanup. // Reuse handleContainerExit to do the cleanup.
// NOTE(random-liu): If the task did exit after containerd started, both
// the event monitor and the cleanup function would update the container
// state. The final container state will be whatever being updated first.
// There is no way to completely avoid this race condition, and for best
// effort unknown state container cleanup, this seems acceptable.
return handleContainerExit(ctx, &eventtypes.TaskExit{ return handleContainerExit(ctx, &eventtypes.TaskExit{
ContainerID: id, ContainerID: id,
ID: id, ID: id,
Pid: 0, Pid: 0,
ExitStatus: status.ExitStatus, ExitStatus: unknownExitCode,
ExitedAt: status.ExitTime, ExitedAt: time.Now(),
}, cntr) }, cntr)
} }


@ -50,13 +50,17 @@ const (
// Add a timeout for each event handling, events that timeout will be requeued and // Add a timeout for each event handling, events that timeout will be requeued and
// handled again in the future. // handled again in the future.
handleEventTimeout = 10 * time.Second handleEventTimeout = 10 * time.Second
exitChannelSize = 1024
) )
// eventMonitor monitors containerd event and updates internal state correspondingly. // eventMonitor monitors containerd event and updates internal state correspondingly.
// TODO(random-liu): Handle event for each container in a separate goroutine. // TODO(random-liu): Handle event for each container in a separate goroutine.
type eventMonitor struct { type eventMonitor struct {
c *criService c *criService
ch <-chan *events.Envelope ch <-chan *events.Envelope
// exitCh receives container/sandbox exit events from exit monitors.
exitCh chan *eventtypes.TaskExit
errCh <-chan error errCh <-chan error
ctx context.Context ctx context.Context
cancel context.CancelFunc cancel context.CancelFunc
@ -89,6 +93,7 @@ func newEventMonitor(c *criService) *eventMonitor {
c: c, c: c,
ctx: ctx, ctx: ctx,
cancel: cancel, cancel: cancel,
exitCh: make(chan *eventtypes.TaskExit, exitChannelSize),
backOff: newBackOff(), backOff: newBackOff(),
} }
} }
@ -98,13 +103,38 @@ func (em *eventMonitor) subscribe(subscriber events.Subscriber) {
// note: filters are any match, if you want any match but not in namespace foo // note: filters are any match, if you want any match but not in namespace foo
// then you have to manually filter namespace foo // then you have to manually filter namespace foo
filters := []string{ filters := []string{
`topic=="/tasks/exit"`,
`topic=="/tasks/oom"`, `topic=="/tasks/oom"`,
`topic~="/images/"`, `topic~="/images/"`,
} }
em.ch, em.errCh = subscriber.Subscribe(em.ctx, filters...) em.ch, em.errCh = subscriber.Subscribe(em.ctx, filters...)
} }
// startExitMonitor starts an exit monitor for a given container/sandbox.
func (em *eventMonitor) startExitMonitor(ctx context.Context, id string, pid uint32, exitCh <-chan containerd.ExitStatus) <-chan struct{} {
stopCh := make(chan struct{})
go func() {
defer close(stopCh)
select {
case exitRes := <-exitCh:
exitStatus, exitedAt, err := exitRes.Result()
if err != nil {
logrus.WithError(err).Errorf("Failed to get task exit status for %q", id)
exitStatus = unknownExitCode
exitedAt = time.Now()
}
em.exitCh <- &eventtypes.TaskExit{
ContainerID: id,
ID: id,
Pid: pid,
ExitStatus: exitStatus,
ExitedAt: exitedAt,
}
case <-ctx.Done():
}
}()
return stopCh
}
func convertEvent(e *gogotypes.Any) (string, interface{}, error) { func convertEvent(e *gogotypes.Any) (string, interface{}, error) {
id := "" id := ""
evt, err := typeurl.UnmarshalAny(e) evt, err := typeurl.UnmarshalAny(e)
@ -113,8 +143,6 @@ func convertEvent(e *gogotypes.Any) (string, interface{}, error) {
} }
switch e := evt.(type) { switch e := evt.(type) {
case *eventtypes.TaskExit:
id = e.ContainerID
case *eventtypes.TaskOOM: case *eventtypes.TaskOOM:
id = e.ContainerID id = e.ContainerID
case *eventtypes.ImageCreate: case *eventtypes.ImageCreate:
@ -142,6 +170,18 @@ func (em *eventMonitor) start() <-chan error {
defer close(errCh) defer close(errCh)
for { for {
select { select {
case e := <-em.exitCh:
logrus.Debugf("Received exit event %+v", e)
id := e.ID
if em.backOff.isInBackOff(id) {
logrus.Infof("Events for %q is in backoff, enqueue event %+v", id, e)
em.backOff.enBackOff(id, e)
break
}
if err := em.handleEvent(e); err != nil {
logrus.WithError(err).Errorf("Failed to handle exit event %+v for %s", e, id)
em.backOff.enBackOff(id, e)
}
case e := <-em.ch: case e := <-em.ch:
logrus.Debugf("Received containerd event timestamp - %v, namespace - %q, topic - %q", e.Timestamp, e.Namespace, e.Topic) logrus.Debugf("Received containerd event timestamp - %v, namespace - %q, topic - %q", e.Timestamp, e.Namespace, e.Topic)
if e.Namespace != constants.K8sContainerdNamespace { if e.Namespace != constants.K8sContainerdNamespace {
@ -213,8 +253,7 @@ func (em *eventMonitor) handleEvent(any interface{}) error {
} else if err != store.ErrNotExist { } else if err != store.ErrNotExist {
return errors.Wrap(err, "can't find container for TaskExit event") return errors.Wrap(err, "can't find container for TaskExit event")
} }
// Use GetAll to include sandbox in init state. sb, err := em.c.sandboxStore.Get(e.ID)
sb, err := em.c.sandboxStore.GetAll(e.ID)
if err == nil { if err == nil {
if err := handleSandboxExit(ctx, e, sb); err != nil { if err := handleSandboxExit(ctx, e, sb); err != nil {
return errors.Wrap(err, "failed to handle sandbox TaskExit event") return errors.Wrap(err, "failed to handle sandbox TaskExit event")
@ -322,15 +361,7 @@ func handleSandboxExit(ctx context.Context, e *eventtypes.TaskExit, sb sandboxst
} }
} }
err = sb.Status.Update(func(status sandboxstore.Status) (sandboxstore.Status, error) { err = sb.Status.Update(func(status sandboxstore.Status) (sandboxstore.Status, error) {
// NOTE(random-liu): We SHOULD NOT change INIT state here. status.State = sandboxstore.StateNotReady
// If sandbox state is INIT when event monitor receives an TaskExit event,
// it means that sandbox start has failed. In that case, `RunPodSandbox` will
// cleanup everything immediately.
// Once sandbox state goes out of INIT, it becomes visable to the user, which
// is not what we want.
if status.State != sandboxstore.StateInit {
status.State = sandboxstore.StateNotReady
}
status.Pid = 0 status.Pid = 0
return status, nil return status, nil
}) })


@ -21,9 +21,7 @@ import (
"time" "time"
eventtypes "github.com/containerd/containerd/api/events" eventtypes "github.com/containerd/containerd/api/events"
//"github.com/containerd/containerd/api/services/events/v1"
"github.com/containerd/typeurl" "github.com/containerd/typeurl"
//gogotypes "github.com/gogo/protobuf/types"
"github.com/stretchr/testify/assert" "github.com/stretchr/testify/assert"
"k8s.io/apimachinery/pkg/util/clock" "k8s.io/apimachinery/pkg/util/clock"
) )
@ -35,22 +33,22 @@ func TestBackOff(t *testing.T) {
inputQueues := map[string]*backOffQueue{ inputQueues := map[string]*backOffQueue{
"container1": { "container1": {
events: []interface{}{ events: []interface{}{
&eventtypes.TaskExit{ContainerID: "container1", ID: "1"}, &eventtypes.TaskOOM{ContainerID: "container1"},
&eventtypes.TaskExit{ContainerID: "container1", ID: "2"}, &eventtypes.TaskOOM{ContainerID: "container1"},
}, },
}, },
"container2": { "container2": {
events: []interface{}{ events: []interface{}{
&eventtypes.TaskExit{ContainerID: "container2", ID: "1"}, &eventtypes.TaskOOM{ContainerID: "container2"},
&eventtypes.TaskExit{ContainerID: "container2", ID: "2"}, &eventtypes.TaskOOM{ContainerID: "container2"},
}, },
}, },
} }
expectedQueues := map[string]*backOffQueue{ expectedQueues := map[string]*backOffQueue{
"container2": { "container2": {
events: []interface{}{ events: []interface{}{
&eventtypes.TaskExit{ContainerID: "container2", ID: "1"}, &eventtypes.TaskOOM{ContainerID: "container2"},
&eventtypes.TaskExit{ContainerID: "container2", ID: "2"}, &eventtypes.TaskOOM{ContainerID: "container2"},
}, },
expireTime: testClock.Now().Add(backOffInitDuration), expireTime: testClock.Now().Add(backOffInitDuration),
duration: backOffInitDuration, duration: backOffInitDuration,
@ -58,8 +56,8 @@ func TestBackOff(t *testing.T) {
}, },
"container1": { "container1": {
events: []interface{}{ events: []interface{}{
&eventtypes.TaskExit{ContainerID: "container1", ID: "1"}, &eventtypes.TaskOOM{ContainerID: "container1"},
&eventtypes.TaskExit{ContainerID: "container1", ID: "2"}, &eventtypes.TaskOOM{ContainerID: "container1"},
}, },
expireTime: testClock.Now().Add(backOffInitDuration), expireTime: testClock.Now().Add(backOffInitDuration),
duration: backOffInitDuration, duration: backOffInitDuration,


@ -23,12 +23,9 @@ import (
"regexp" "regexp"
"strconv" "strconv"
"strings" "strings"
"time"
"github.com/BurntSushi/toml" "github.com/BurntSushi/toml"
"github.com/containerd/containerd"
"github.com/containerd/containerd/containers" "github.com/containerd/containerd/containers"
"github.com/containerd/containerd/errdefs"
"github.com/containerd/containerd/runtime/linux/runctypes" "github.com/containerd/containerd/runtime/linux/runctypes"
runcoptions "github.com/containerd/containerd/runtime/v2/runc/options" runcoptions "github.com/containerd/containerd/runtime/v2/runc/options"
"github.com/containerd/typeurl" "github.com/containerd/typeurl"
@ -477,31 +474,6 @@ func unknownSandboxStatus() sandboxstore.Status {
} }
} }
// unknownExitStatus generates containerd.Status for container exited with unknown exit code.
func unknownExitStatus() containerd.Status {
return containerd.Status{
Status: containerd.Stopped,
ExitStatus: unknownExitCode,
ExitTime: time.Now(),
}
}
// getTaskStatus returns status for a given task. It returns unknown exit status if
// the task is nil or not found.
func getTaskStatus(ctx context.Context, task containerd.Task) (containerd.Status, error) {
if task == nil {
return unknownExitStatus(), nil
}
status, err := task.Status(ctx)
if err != nil {
if !errdefs.IsNotFound(err) {
return containerd.Status{}, err
}
return unknownExitStatus(), nil
}
return status, nil
}
// getPassthroughAnnotations filters requested pod annotations by comparing // getPassthroughAnnotations filters requested pod annotations by comparing
// against permitted annotations for the given runtime. // against permitted annotations for the given runtime.
func getPassthroughAnnotations(podAnnotations map[string]string, func getPassthroughAnnotations(podAnnotations map[string]string,


@ -34,6 +34,7 @@ import (
"golang.org/x/net/context" "golang.org/x/net/context"
runtime "k8s.io/kubernetes/pkg/kubelet/apis/cri/runtime/v1alpha2" runtime "k8s.io/kubernetes/pkg/kubelet/apis/cri/runtime/v1alpha2"
ctrdutil "github.com/containerd/cri/pkg/containerd/util"
"github.com/containerd/cri/pkg/netns" "github.com/containerd/cri/pkg/netns"
cio "github.com/containerd/cri/pkg/server/io" cio "github.com/containerd/cri/pkg/server/io"
containerstore "github.com/containerd/cri/pkg/store/container" containerstore "github.com/containerd/cri/pkg/store/container"
@ -57,7 +58,7 @@ func (c *criService) recover(ctx context.Context) error {
return errors.Wrap(err, "failed to list sandbox containers") return errors.Wrap(err, "failed to list sandbox containers")
} }
for _, sandbox := range sandboxes { for _, sandbox := range sandboxes {
sb, err := loadSandbox(ctx, sandbox) sb, err := c.loadSandbox(ctx, sandbox)
if err != nil { if err != nil {
logrus.WithError(err).Errorf("Failed to load sandbox %q", sandbox.ID()) logrus.WithError(err).Errorf("Failed to load sandbox %q", sandbox.ID())
continue continue
@ -275,6 +276,22 @@ func (c *criService) loadContainer(ctx context.Context, cntr containerd.Containe
status.StartedAt = time.Now().UnixNano() status.StartedAt = time.Now().UnixNano()
status.Pid = t.Pid() status.Pid = t.Pid()
} }
// Wait for the task for exit monitor.
// wait is a long running background request, no timeout needed.
exitCh, err := t.Wait(ctrdutil.NamespacedContext())
if err != nil {
if !errdefs.IsNotFound(err) {
return errors.Wrap(err, "failed to wait for task")
}
// Container was in running state, but its task has been deleted,
// set unknown exited state.
status.FinishedAt = time.Now().UnixNano()
status.ExitCode = unknownExitCode
status.Reason = unknownExitReason
} else {
// Start exit monitor.
c.eventMonitor.startExitMonitor(context.Background(), id, status.Pid, exitCh)
}
case containerd.Stopped: case containerd.Stopped:
// Task is stopped. Update status and delete the task. // Task is stopped. Update status and delete the task.
if _, err := t.Delete(ctx, containerd.WithProcessKill); err != nil && !errdefs.IsNotFound(err) { if _, err := t.Delete(ctx, containerd.WithProcessKill); err != nil && !errdefs.IsNotFound(err) {
@ -304,7 +321,7 @@ func (c *criService) loadContainer(ctx context.Context, cntr containerd.Containe
} }
// loadSandbox loads sandbox from containerd. // loadSandbox loads sandbox from containerd.
func loadSandbox(ctx context.Context, cntr containerd.Container) (sandboxstore.Sandbox, error) { func (c *criService) loadSandbox(ctx context.Context, cntr containerd.Container) (sandboxstore.Sandbox, error) {
ctx, cancel := context.WithTimeout(ctx, loadContainerTimeout) ctx, cancel := context.WithTimeout(ctx, loadContainerTimeout)
defer cancel() defer cancel()
var sandbox sandboxstore.Sandbox var sandbox sandboxstore.Sandbox
@ -358,9 +375,20 @@ func loadSandbox(ctx context.Context, cntr containerd.Container) (sandboxstore.S
status.State = sandboxstore.StateNotReady status.State = sandboxstore.StateNotReady
} else { } else {
if taskStatus.Status == containerd.Running { if taskStatus.Status == containerd.Running {
// Task is running, set sandbox state as READY. // Wait for the task for sandbox monitor.
status.State = sandboxstore.StateReady // wait is a long running background request, no timeout needed.
status.Pid = t.Pid() exitCh, err := t.Wait(ctrdutil.NamespacedContext())
if err != nil {
if !errdefs.IsNotFound(err) {
return status, errors.Wrap(err, "failed to wait for task")
}
status.State = sandboxstore.StateNotReady
} else {
// Task is running, set sandbox state as READY.
status.State = sandboxstore.StateReady
status.Pid = t.Pid()
c.eventMonitor.startExitMonitor(context.Background(), meta.ID, status.Pid, exitCh)
}
} else { } else {
// Task is not running. Delete the task and set sandbox state as NOTREADY. // Task is not running. Delete the task and set sandbox state as NOTREADY.
if _, err := t.Delete(ctx, containerd.WithProcessKill); err != nil && !errdefs.IsNotFound(err) { if _, err := t.Delete(ctx, containerd.WithProcessKill); err != nil && !errdefs.IsNotFound(err) {


@ -87,7 +87,7 @@ func (c *criService) RunPodSandbox(ctx context.Context, r *runtime.RunPodSandbox
RuntimeHandler: r.GetRuntimeHandler(), RuntimeHandler: r.GetRuntimeHandler(),
}, },
sandboxstore.Status{ sandboxstore.Status{
State: sandboxstore.StateInit, State: sandboxstore.StateUnknown,
}, },
) )
@ -253,88 +253,65 @@ func (c *criService) RunPodSandbox(ctx context.Context, r *runtime.RunPodSandbox
if err != nil { if err != nil {
return nil, errors.Wrap(err, "failed to get sandbox container info") return nil, errors.Wrap(err, "failed to get sandbox container info")
} }
// Create sandbox task in containerd.
log.Tracef("Create sandbox container (id=%q, name=%q).",
id, name)
var taskOpts []containerd.NewTaskOpts
// TODO(random-liu): Remove this after shim v1 is deprecated.
if c.config.NoPivot && ociRuntime.Type == linuxRuntime {
taskOpts = append(taskOpts, containerd.WithNoPivotRoot)
}
// We don't need stdio for sandbox container.
task, err := container.NewTask(ctx, containerdio.NullIO, taskOpts...)
if err != nil {
return nil, errors.Wrap(err, "failed to create containerd task")
}
defer func() {
if retErr != nil {
deferCtx, deferCancel := ctrdutil.DeferContext()
defer deferCancel()
// Cleanup the sandbox container if an error is returned.
if _, err := task.Delete(deferCtx, containerd.WithProcessKill); err != nil && !errdefs.IsNotFound(err) {
logrus.WithError(err).Errorf("Failed to delete sandbox container %q", id)
}
}
}()
// wait is a long running background request, no timeout needed.
exitCh, err := task.Wait(ctrdutil.NamespacedContext())
if err != nil {
return nil, errors.Wrap(err, "failed to wait for sandbox container task")
}
if err := task.Start(ctx); err != nil {
return nil, errors.Wrapf(err, "failed to start sandbox container task %q", id)
}
if err := sandbox.Status.Update(func(status sandboxstore.Status) (sandboxstore.Status, error) { if err := sandbox.Status.Update(func(status sandboxstore.Status) (sandboxstore.Status, error) {
// Set the pod sandbox as ready after successfully start sandbox container.
status.Pid = task.Pid()
status.State = sandboxstore.StateReady
status.CreatedAt = info.CreatedAt status.CreatedAt = info.CreatedAt
return status, nil return status, nil
}); err != nil { }); err != nil {
return nil, errors.Wrap(err, "failed to update sandbox created timestamp") return nil, errors.Wrap(err, "failed to update sandbox status")
} }
// Add sandbox into sandbox store in INIT state. // Add sandbox into sandbox store in INIT state.
sandbox.Container = container sandbox.Container = container
if err := c.sandboxStore.Add(sandbox); err != nil { if err := c.sandboxStore.Add(sandbox); err != nil {
return nil, errors.Wrapf(err, "failed to add sandbox %+v into store", sandbox) return nil, errors.Wrapf(err, "failed to add sandbox %+v into store", sandbox)
} }
defer func() {
// Delete sandbox from sandbox store if there is an error. // start the monitor after adding sandbox into the store, this ensures
if retErr != nil { // that sandbox is in the store, when event monitor receives the TaskExit event.
c.sandboxStore.Delete(id)
}
}()
// NOTE(random-liu): Sandbox state only stay in INIT state after this point
// and before the end of this function.
// * If `Update` succeeds, sandbox state will become READY in one transaction.
// * If `Update` fails, sandbox will be removed from the store in the defer above.
// * If containerd stops at any point before `Update` finishes, because sandbox
// state is not checkpointed, it will be recovered from corresponding containerd task
// status during restart:
// * If the task is running, sandbox state will be READY,
// * Or else, sandbox state will be NOTREADY.
// //
// In any case, sandbox will leave INIT state, so it's safe to ignore sandbox // TaskOOM from containerd may come before sandbox is added to store,
// in INIT state in other functions. // but we don't care about sandbox TaskOOM right now, so it is fine.
c.eventMonitor.startExitMonitor(context.Background(), id, task.Pid(), exitCh)
// Start sandbox container in one transaction to avoid race condition with
// event monitor.
if err := sandbox.Status.Update(func(status sandboxstore.Status) (_ sandboxstore.Status, retErr error) {
// NOTE(random-liu): We should not change the sandbox state to NOTREADY
// if `Update` fails.
//
// If `Update` fails, the sandbox will be cleaned up by all the defers
// above. We should not let user see this sandbox, or else they will
// see the sandbox disappear after the defer clean up, which may confuse
// them.
//
// Given so, we should keep the sandbox in INIT state if `Update` fails,
// and ignore sandbox in INIT state in all the inspection functions.
// Create sandbox task in containerd.
log.Tracef("Create sandbox container (id=%q, name=%q).",
id, name)
var taskOpts []containerd.NewTaskOpts
// TODO(random-liu): Remove this after shim v1 is deprecated.
if c.config.NoPivot && ociRuntime.Type == linuxRuntime {
taskOpts = append(taskOpts, containerd.WithNoPivotRoot)
}
// We don't need stdio for sandbox container.
task, err := container.NewTask(ctx, containerdio.NullIO, taskOpts...)
if err != nil {
return status, errors.Wrap(err, "failed to create containerd task")
}
defer func() {
if retErr != nil {
deferCtx, deferCancel := ctrdutil.DeferContext()
defer deferCancel()
// Cleanup the sandbox container if an error is returned.
// It's possible that task is deleted by event monitor.
if _, err := task.Delete(deferCtx, containerd.WithProcessKill); err != nil && !errdefs.IsNotFound(err) {
logrus.WithError(err).Errorf("Failed to delete sandbox container %q", id)
}
}
}()
if err := task.Start(ctx); err != nil {
return status, errors.Wrapf(err, "failed to start sandbox container task %q", id)
}
// Set the pod sandbox as ready after successfully start sandbox container.
status.Pid = task.Pid()
status.State = sandboxstore.StateReady
return status, nil
}); err != nil {
return nil, errors.Wrap(err, "failed to start sandbox container")
}
return &runtime.RunPodSandboxResponse{PodSandboxId: id}, nil return &runtime.RunPodSandboxResponse{PodSandboxId: id}, nil
} }


@ -19,7 +19,6 @@ package server
import ( import (
"time" "time"
"github.com/containerd/containerd"
eventtypes "github.com/containerd/containerd/api/events" eventtypes "github.com/containerd/containerd/api/events"
"github.com/containerd/containerd/errdefs" "github.com/containerd/containerd/errdefs"
cni "github.com/containerd/go-cni" cni "github.com/containerd/go-cni"
@ -29,6 +28,7 @@ import (
"golang.org/x/sys/unix" "golang.org/x/sys/unix"
runtime "k8s.io/kubernetes/pkg/kubelet/apis/cri/runtime/v1alpha2" runtime "k8s.io/kubernetes/pkg/kubelet/apis/cri/runtime/v1alpha2"
ctrdutil "github.com/containerd/cri/pkg/containerd/util"
sandboxstore "github.com/containerd/cri/pkg/store/sandbox" sandboxstore "github.com/containerd/cri/pkg/store/sandbox"
) )
@ -97,6 +97,7 @@ func (c *criService) StopPodSandbox(ctx context.Context, r *runtime.StopPodSandb
// `task.Delete` is not called here because it will be called when // `task.Delete` is not called here because it will be called when
// the event monitor handles the `TaskExit` event. // the event monitor handles the `TaskExit` event.
func (c *criService) stopSandboxContainer(ctx context.Context, sandbox sandboxstore.Sandbox) error { func (c *criService) stopSandboxContainer(ctx context.Context, sandbox sandboxstore.Sandbox) error {
id := sandbox.ID
container := sandbox.Container container := sandbox.Container
state := sandbox.Status.Get().State state := sandbox.Status.Get().State
task, err := container.Task(ctx, nil) task, err := container.Task(ctx, nil)
@ -105,29 +106,35 @@ func (c *criService) stopSandboxContainer(ctx context.Context, sandbox sandboxst
return errors.Wrap(err, "failed to get sandbox container") return errors.Wrap(err, "failed to get sandbox container")
} }
// Don't return for unknown state, some cleanup needs to be done. // Don't return for unknown state, some cleanup needs to be done.
if state != sandboxstore.StateUnknown { if state == sandboxstore.StateUnknown {
return nil return cleanupUnknownSandbox(ctx, id, sandbox)
} }
// Task is an interface, explicitly set it to nil just in case. return nil
task = nil
} }
// Handle unknown state. // Handle unknown state.
// The cleanup logic is the same with container unknown state. // The cleanup logic is the same with container unknown state.
if state == sandboxstore.StateUnknown { if state == sandboxstore.StateUnknown {
status, err := getTaskStatus(ctx, task) // Start an exit handler for containers in unknown state.
waitCtx, waitCancel := context.WithCancel(ctrdutil.NamespacedContext())
defer waitCancel()
exitCh, err := task.Wait(waitCtx)
if err != nil { if err != nil {
return errors.Wrapf(err, "failed to get task status for %q", sandbox.ID) if !errdefs.IsNotFound(err) {
} return errors.Wrap(err, "failed to wait for task")
switch status.Status { }
case containerd.Running, containerd.Created: return cleanupUnknownSandbox(ctx, id, sandbox)
// The task is still running, continue stopping the task.
case containerd.Stopped:
// The task has exited, explicitly cleanup.
return cleanupUnknownSandbox(ctx, sandbox.ID, status, sandbox)
default:
return errors.Wrapf(err, "unsupported task status %q", status.Status)
} }
exitCtx, exitCancel := context.WithCancel(context.Background())
stopCh := c.eventMonitor.startExitMonitor(exitCtx, id, task.Pid(), exitCh)
defer func() {
exitCancel()
// This ensures that exit monitor is stopped before
// `Wait` is cancelled, so no exit event is generated
// because of the `Wait` cancellation.
<-stopCh
}()
} }
// Kill the sandbox container. // Kill the sandbox container.
@ -166,14 +173,13 @@ func (c *criService) teardownPod(id string, path string, config *runtime.PodSand
} }
// cleanupUnknownSandbox cleanup stopped sandbox in unknown state. // cleanupUnknownSandbox cleanup stopped sandbox in unknown state.
func cleanupUnknownSandbox(ctx context.Context, id string, status containerd.Status, func cleanupUnknownSandbox(ctx context.Context, id string, sandbox sandboxstore.Sandbox) error {
sandbox sandboxstore.Sandbox) error {
// Reuse handleSandboxExit to do the cleanup. // Reuse handleSandboxExit to do the cleanup.
return handleSandboxExit(ctx, &eventtypes.TaskExit{ return handleSandboxExit(ctx, &eventtypes.TaskExit{
ContainerID: id, ContainerID: id,
ID: id, ID: id,
Pid: 0, Pid: 0,
ExitStatus: status.ExitStatus, ExitStatus: unknownExitCode,
ExitedAt: status.ExitTime, ExitedAt: time.Now(),
}, sandbox) }, sandbox)
} }


@ -112,7 +112,7 @@ func TestContainerStore(t *testing.T) {
ExitCode: 3, ExitCode: 3,
Reason: "TestReason-4a333", Reason: "TestReason-4a333",
Message: "TestMessage-4a333", Message: "TestMessage-4a333",
Removing: true, Starting: true,
}, },
"4abcd": { "4abcd": {
Pid: 4, Pid: 4,


@ -88,6 +88,10 @@ type Status struct {
// Human-readable message indicating details about why container is in its // Human-readable message indicating details about why container is in its
// current state. // current state.
Message string Message string
// Starting indicates that the container is in starting state.
// This field doesn't need to be checkpointed.
// TODO(now): Add unit test.
Starting bool `json:"-"`
// Removing indicates that the container is in removing state. // Removing indicates that the container is in removing state.
// This field doesn't need to be checkpointed. // This field doesn't need to be checkpointed.
Removing bool `json:"-"` Removing bool `json:"-"`


@ -75,6 +75,7 @@ func TestStatusEncodeDecode(t *testing.T) {
Reason: "test-reason", Reason: "test-reason",
Message: "test-message", Message: "test-message",
Removing: true, Removing: true,
Starting: true,
} }
assert := assertlib.New(t) assert := assertlib.New(t)
data, err := s.encode() data, err := s.encode()
@ -82,6 +83,7 @@ func TestStatusEncodeDecode(t *testing.T) {
newS := &Status{} newS := &Status{}
assert.NoError(newS.decode(data)) assert.NoError(newS.decode(data))
s.Removing = false // Removing should not be encoded. s.Removing = false // Removing should not be encoded.
s.Starting = false // Starting should not be encoded.
assert.Equal(s, newS) assert.Equal(s, newS)
unsupported, err := json.Marshal(&versionedStatus{ unsupported, err := json.Marshal(&versionedStatus{


@ -86,22 +86,9 @@ func (s *Store) Add(sb Sandbox) error {
return nil return nil
} }
// Get returns the sandbox with specified id. Returns store.ErrNotExist // Get returns the sandbox with specified id.
// if the sandbox doesn't exist. // Returns store.ErrNotExist if the sandbox doesn't exist.
func (s *Store) Get(id string) (Sandbox, error) { func (s *Store) Get(id string) (Sandbox, error) {
sb, err := s.GetAll(id)
if err != nil {
return sb, err
}
if sb.Status.Get().State == StateInit {
return Sandbox{}, store.ErrNotExist
}
return sb, nil
}
// GetAll returns the sandbox with specified id, including sandbox in unknown
// state. Returns store.ErrNotExist if the sandbox doesn't exist.
func (s *Store) GetAll(id string) (Sandbox, error) {
s.lock.RLock() s.lock.RLock()
defer s.lock.RUnlock() defer s.lock.RUnlock()
id, err := s.idIndex.Get(id) id, err := s.idIndex.Get(id)
@ -123,9 +110,6 @@ func (s *Store) List() []Sandbox {
defer s.lock.RUnlock() defer s.lock.RUnlock()
var sandboxes []Sandbox var sandboxes []Sandbox
for _, sb := range s.sandboxes { for _, sb := range s.sandboxes {
if sb.Status.Get().State == StateInit {
continue
}
sandboxes = append(sandboxes, sb) sandboxes = append(sandboxes, sb)
} }
return sandboxes return sandboxes


@ -106,7 +106,7 @@ func TestSandboxStore(t *testing.T) {
}, },
NetNSPath: "TestNetNS-3defg", NetNSPath: "TestNetNS-3defg",
}, },
Status{State: StateInit}, Status{State: StateUnknown},
) )
assert := assertlib.New(t) assert := assertlib.New(t)
s := NewStore() s := NewStore()
@ -125,21 +125,16 @@ func TestSandboxStore(t *testing.T) {
assert.Equal(sb, got) assert.Equal(sb, got)
} }
t.Logf("should not be able to get unknown sandbox") t.Logf("should be able to get sandbox in unknown state with Get")
got, err := s.Get(unknown.ID) got, err := s.Get(unknown.ID)
assert.Equal(store.ErrNotExist, err)
assert.Equal(Sandbox{}, got)
t.Logf("should be able to get unknown sandbox with GetAll")
got, err = s.GetAll(unknown.ID)
assert.NoError(err) assert.NoError(err)
assert.Equal(unknown, got) assert.Equal(unknown, got)
t.Logf("should be able to list sandboxes") t.Logf("should be able to list sandboxes")
sbNum := len(sandboxes) + 1
sbs := s.List() sbs := s.List()
assert.Len(sbs, len(sandboxes)) assert.Len(sbs, sbNum)
sbNum := len(sandboxes)
for testID, v := range sandboxes { for testID, v := range sandboxes {
truncID := genTruncIndex(testID) truncID := genTruncIndex(testID)


@ -26,11 +26,11 @@ import (
// | | // | |
// | Create(Run) | Load // | Create(Run) | Load
// | | // | |
// Start +----v----+ | // Start | |
// (failed) | | | // (failed) | |
// +-------------+ INIT | +-----------+ // +------------------+ +-----------+
// | | | | | // | | | |
// | +----+----+ | | // | | | |
// | | | | // | | | |
// | | Start(Run) | | // | | Start(Run) | |
// | | | | // | | | |
@ -53,23 +53,17 @@ import (
// +-------------> DELETED // +-------------> DELETED
// State is the sandbox state we use in containerd/cri. // State is the sandbox state we use in containerd/cri.
// It includes init and unknown, which are internal states not defined in CRI. // It includes unknown, which is an internal state not defined in CRI.
// The state mapping from internal states to CRI states: // The state mapping from internal states to CRI states:
// * ready -> ready // * ready -> ready
// * not ready -> not ready // * not ready -> not ready
// * init -> not exist
// * unknown -> not ready // * unknown -> not ready
type State uint32 type State uint32
const ( const (
// StateInit is init state of sandbox. Sandbox
// is in init state before its corresponding sandbox container
// is created. Sandbox in init state should be ignored by most
// functions, unless the caller needs to update sandbox state.
StateInit State = iota
// StateReady is ready state, it means sandbox container // StateReady is ready state, it means sandbox container
// is running. // is running.
StateReady StateReady State = iota
// StateNotReady is notready state, it ONLY means sandbox // StateNotReady is notready state, it ONLY means sandbox
// container is not running. // container is not running.
// StopPodSandbox should still be called for NOTREADY sandbox to // StopPodSandbox should still be called for NOTREADY sandbox to


@ -28,7 +28,7 @@ func TestStatus(t *testing.T) {
testStatus := Status{ testStatus := Status{
Pid: 123, Pid: 123,
CreatedAt: time.Now(), CreatedAt: time.Now(),
State: StateInit, State: StateUnknown,
} }
updateStatus := Status{ updateStatus := Status{
Pid: 456, Pid: 456,