Use wait instead of TaskExit.

Signed-off-by: Lantao Liu <lantaol@google.com>
Lantao Liu 2019-04-11 16:36:44 -07:00
parent ebb0928057
commit d1f9611cb0
18 changed files with 390 additions and 266 deletions

View File

@ -105,6 +105,9 @@ func setContainerRemoving(container containerstore.Container) error {
if status.State() == runtime.ContainerState_CONTAINER_UNKNOWN {
return status, errors.New("container state is unknown, to stop first")
}
if status.Starting {
return status, errors.New("container is in starting state, can't be removed")
}
if status.Removing {
return status, errors.New("container is already in removing state")
}

View File

@ -40,6 +40,13 @@ func TestSetContainerRemoving(t *testing.T) {
},
expectErr: true,
},
"should return error when container is in starting state": {
status: containerstore.Status{
CreatedAt: time.Now().UnixNano(),
Starting: true,
},
expectErr: true,
},
"should return error when container is in removing state": {
status: containerstore.Status{
CreatedAt: time.Now().UnixNano(),
@ -71,6 +78,8 @@ func TestSetContainerRemoving(t *testing.T) {
} else {
assert.NoError(t, err)
assert.True(t, container.Status.Get().Removing, "removing should be set")
assert.NoError(t, resetContainerRemoving(container))
assert.False(t, container.Status.Get().Removing, "removing should be reset")
}
}
}

View File

@ -38,64 +38,48 @@ import (
// StartContainer starts the container.
func (c *criService) StartContainer(ctx context.Context, r *runtime.StartContainerRequest) (retRes *runtime.StartContainerResponse, retErr error) {
container, err := c.containerStore.Get(r.GetContainerId())
cntr, err := c.containerStore.Get(r.GetContainerId())
if err != nil {
return nil, errors.Wrapf(err, "an error occurred when try to find container %q", r.GetContainerId())
}
var startErr error
// update container status in one transaction to avoid race with event monitor.
if err := container.Status.UpdateSync(func(status containerstore.Status) (containerstore.Status, error) {
// Always apply status change no matter startContainer fails or not. Because startContainer
// may change container state no matter it fails or succeeds.
startErr = c.startContainer(ctx, container, &status)
return status, nil
}); startErr != nil {
return nil, startErr
} else if err != nil {
return nil, errors.Wrapf(err, "failed to update container %q metadata", container.ID)
}
return &runtime.StartContainerResponse{}, nil
}
// startContainer actually starts the container. The function needs to be run in one transaction. Any updates
// to the status passed in will be applied no matter the function returns error or not.
func (c *criService) startContainer(ctx context.Context,
cntr containerstore.Container,
status *containerstore.Status) (retErr error) {
id := cntr.ID
meta := cntr.Metadata
container := cntr.Container
config := meta.Config
// Return error if container is not in created state.
if status.State() != runtime.ContainerState_CONTAINER_CREATED {
return errors.Errorf("container %q is in %s state", id, criContainerStateToString(status.State()))
// Set starting state to prevent other start/remove operations against this container
// while it's being started.
if err := setContainerStarting(cntr); err != nil {
return nil, errors.Wrapf(err, "failed to set starting state for container %q", id)
}
// Do not start the container when there is a removal in progress.
if status.Removing {
return errors.Errorf("container %q is in removing state", id)
}
defer func() {
if retErr != nil {
// Set container to exited if fail to start.
status.Pid = 0
status.FinishedAt = time.Now().UnixNano()
status.ExitCode = errorStartExitCode
status.Reason = errorStartReason
status.Message = retErr.Error()
if err := cntr.Status.UpdateSync(func(status containerstore.Status) (containerstore.Status, error) {
status.Pid = 0
status.FinishedAt = time.Now().UnixNano()
status.ExitCode = errorStartExitCode
status.Reason = errorStartReason
status.Message = retErr.Error()
return status, nil
}); err != nil {
logrus.WithError(err).Errorf("failed to set start failure state for container %q", id)
}
}
if err := resetContainerStarting(cntr); err != nil {
logrus.WithError(err).Errorf("failed to reset starting state for container %q", id)
}
}()
// Get sandbox config from sandbox store.
sandbox, err := c.sandboxStore.Get(meta.SandboxID)
if err != nil {
return errors.Wrapf(err, "sandbox %q not found", meta.SandboxID)
return nil, errors.Wrapf(err, "sandbox %q not found", meta.SandboxID)
}
sandboxID := meta.SandboxID
if sandbox.Status.Get().State != sandboxstore.StateReady {
return errors.Errorf("sandbox container %q is not running", sandboxID)
return nil, errors.Errorf("sandbox container %q is not running", sandboxID)
}
ioCreation := func(id string) (_ containerdio.IO, err error) {
@ -110,7 +94,7 @@ func (c *criService) startContainer(ctx context.Context,
ctrInfo, err := container.Info(ctx)
if err != nil {
return errors.Wrap(err, "failed to get container info")
return nil, errors.Wrap(err, "failed to get container info")
}
var taskOpts []containerd.NewTaskOpts
@ -120,7 +104,7 @@ func (c *criService) startContainer(ctx context.Context,
}
task, err := container.NewTask(ctx, ioCreation, taskOpts...)
if err != nil {
return errors.Wrap(err, "failed to create containerd task")
return nil, errors.Wrap(err, "failed to create containerd task")
}
defer func() {
if retErr != nil {
@ -133,15 +117,61 @@ func (c *criService) startContainer(ctx context.Context,
}
}()
// wait is a long running background request, no timeout needed.
exitCh, err := task.Wait(ctrdutil.NamespacedContext())
if err != nil {
return nil, errors.Wrap(err, "failed to wait for containerd task")
}
// Start containerd task.
if err := task.Start(ctx); err != nil {
return errors.Wrapf(err, "failed to start containerd task %q", id)
return nil, errors.Wrapf(err, "failed to start containerd task %q", id)
}
// Update container start timestamp.
status.Pid = task.Pid()
status.StartedAt = time.Now().UnixNano()
return nil
if err := cntr.Status.UpdateSync(func(status containerstore.Status) (containerstore.Status, error) {
status.Pid = task.Pid()
status.StartedAt = time.Now().UnixNano()
return status, nil
}); err != nil {
return nil, errors.Wrapf(err, "failed to update container %q state", id)
}
// start the monitor after updating container state, this ensures that
// the event monitor receives the TaskExit event and updates container state
// after this.
c.eventMonitor.startExitMonitor(context.Background(), id, task.Pid(), exitCh)
return &runtime.StartContainerResponse{}, nil
}
// setContainerStarting sets the container into starting state. In starting state, the
// container will not be removed or started again.
func setContainerStarting(container containerstore.Container) error {
return container.Status.Update(func(status containerstore.Status) (containerstore.Status, error) {
// Return error if container is not in created state.
if status.State() != runtime.ContainerState_CONTAINER_CREATED {
return status, errors.Errorf("container is in %s state", criContainerStateToString(status.State()))
}
// Do not start the container when there is a removal in progress.
if status.Removing {
return status, errors.New("container is in removing state, can't be started")
}
if status.Starting {
return status, errors.New("container is already in starting state")
}
status.Starting = true
return status, nil
})
}
// resetContainerStarting resets the container starting state on start failure. So
// that we could remove the container later.
func resetContainerStarting(container containerstore.Container) error {
return container.Status.Update(func(status containerstore.Status) (containerstore.Status, error) {
status.Starting = false
return status, nil
})
}
// createContainerLoggers creates container loggers and return write closer for stdout and stderr.
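
The guard introduced above is easier to read outside the diff. The following is a minimal, standalone Go sketch (not part of this commit) of the idea behind setContainerStarting/resetContainerStarting: every check and the flag flip happen inside one locked update, so a concurrent StartContainer and RemoveContainer cannot interleave. The status and guarded types are simplified stand-ins, not the real containerstore API.

package main

import (
    "errors"
    "fmt"
    "sync"
)

// status is a cut-down stand-in for containerstore.Status.
type status struct {
    created  bool
    starting bool
    removing bool
}

// guarded mimics the status store: every mutation happens under one lock, so
// "check the state, then set Starting" is a single atomic step.
type guarded struct {
    mu sync.Mutex
    s  status
}

func (g *guarded) update(fn func(status) (status, error)) error {
    g.mu.Lock()
    defer g.mu.Unlock()
    ns, err := fn(g.s)
    if err != nil {
        return err
    }
    g.s = ns
    return nil
}

// setStarting refuses to start unless the container is freshly created and no
// removal is in flight; with the lock held, exactly one caller can win.
func setStarting(g *guarded) error {
    return g.update(func(s status) (status, error) {
        if !s.created {
            return s, errors.New("container is not in created state")
        }
        if s.removing {
            return s, errors.New("container is in removing state, can't be started")
        }
        if s.starting {
            return s, errors.New("container is already in starting state")
        }
        s.starting = true
        return s, nil
    })
}

// resetStarting clears the flag once StartContainer returns, so the
// container can still be removed after a failed start.
func resetStarting(g *guarded) error {
    return g.update(func(s status) (status, error) {
        s.starting = false
        return s, nil
    })
}

func main() {
    g := &guarded{s: status{created: true}}
    fmt.Println(setStarting(g)) // <nil>
    fmt.Println(setStarting(g)) // container is already in starting state
    fmt.Println(resetStarting(g))
}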

View File

@ -0,0 +1,98 @@
/*
Copyright 2019 The containerd Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package server
import (
"testing"
"time"
"github.com/stretchr/testify/assert"
containerstore "github.com/containerd/cri/pkg/store/container"
)
// TestSetContainerStarting tests setContainerStarting sets starting
// state correctly.
func TestSetContainerStarting(t *testing.T) {
testID := "test-id"
for desc, test := range map[string]struct {
status containerstore.Status
expectErr bool
}{
"should not return error when container is in created state": {
status: containerstore.Status{
CreatedAt: time.Now().UnixNano(),
},
expectErr: false,
},
"should return error when container is in running state": {
status: containerstore.Status{
CreatedAt: time.Now().UnixNano(),
StartedAt: time.Now().UnixNano(),
},
expectErr: true,
},
"should return error when container is in exited state": {
status: containerstore.Status{
CreatedAt: time.Now().UnixNano(),
StartedAt: time.Now().UnixNano(),
FinishedAt: time.Now().UnixNano(),
},
expectErr: true,
},
"should return error when container is in unknown state": {
status: containerstore.Status{
CreatedAt: 0,
StartedAt: 0,
FinishedAt: 0,
},
expectErr: true,
},
"should return error when container is in starting state": {
status: containerstore.Status{
CreatedAt: time.Now().UnixNano(),
Starting: true,
},
expectErr: true,
},
"should return error when container is in removing state": {
status: containerstore.Status{
CreatedAt: time.Now().UnixNano(),
Removing: true,
},
expectErr: true,
},
} {
t.Logf("TestCase %q", desc)
container, err := containerstore.NewContainer(
containerstore.Metadata{ID: testID},
containerstore.WithFakeStatus(test.status),
)
assert.NoError(t, err)
err = setContainerStarting(container)
if test.expectErr {
assert.Error(t, err)
assert.Equal(t, test.status, container.Status.Get(), "metadata should not be updated")
} else {
assert.NoError(t, err)
assert.True(t, container.Status.Get().Starting, "starting should be set")
assert.NoError(t, resetContainerStarting(container))
assert.False(t, container.Status.Get().Starting, "starting should be reset")
}
}
}

View File

@ -28,6 +28,7 @@ import (
"golang.org/x/sys/unix"
runtime "k8s.io/kubernetes/pkg/kubelet/apis/cri/runtime/v1alpha2"
ctrdutil "github.com/containerd/cri/pkg/containerd/util"
"github.com/containerd/cri/pkg/store"
containerstore "github.com/containerd/cri/pkg/store/container"
)
@ -74,36 +75,34 @@ func (c *criService) stopContainer(ctx context.Context, container containerstore
return errors.Wrapf(err, "failed to get task for container %q", id)
}
// Don't return for unknown state, some cleanup needs to be done.
if state != runtime.ContainerState_CONTAINER_UNKNOWN {
return nil
if state == runtime.ContainerState_CONTAINER_UNKNOWN {
return cleanupUnknownContainer(ctx, id, container)
}
// Task is an interface, explicitly set it to nil just in case.
task = nil
return nil
}
// Handle unknown state.
if state == runtime.ContainerState_CONTAINER_UNKNOWN {
status, err := getTaskStatus(ctx, task)
// Start an exit handler for containers in unknown state.
waitCtx, waitCancel := context.WithCancel(ctrdutil.NamespacedContext())
defer waitCancel()
exitCh, err := task.Wait(waitCtx)
if err != nil {
return errors.Wrapf(err, "failed to get task status for %q", id)
}
switch status.Status {
case containerd.Running, containerd.Created:
// The task is still running, continue stopping the task.
case containerd.Stopped:
// The task has exited. If the task exited after containerd
// started, the event monitor will receive its exit event; if it
// exited before containerd started, the event monitor will never
// receive its exit event.
// However, we can't tell that because the task state was not
// successfully loaded during containerd start (container is
// in UNKNOWN state).
// So always do cleanup here, just in case that we've missed the
// exit event.
return cleanupUnknownContainer(ctx, id, status, container)
default:
return errors.Wrapf(err, "unsupported task status %q", status.Status)
if !errdefs.IsNotFound(err) {
return errors.Wrapf(err, "failed to wait for task for %q", id)
}
return cleanupUnknownContainer(ctx, id, container)
}
exitCtx, exitCancel := context.WithCancel(context.Background())
stopCh := c.eventMonitor.startExitMonitor(exitCtx, id, task.Pid(), exitCh)
defer func() {
exitCancel()
// This ensures that exit monitor is stopped before
// `Wait` is cancelled, so no exit event is generated
// because of the `Wait` cancellation.
<-stopCh
}()
}
// We only need to kill the task. The event handler will Delete the
@ -176,19 +175,13 @@ func (c *criService) waitContainerStop(ctx context.Context, container containers
}
// cleanupUnknownContainer cleanup stopped container in unknown state.
func cleanupUnknownContainer(ctx context.Context, id string, status containerd.Status,
cntr containerstore.Container) error {
func cleanupUnknownContainer(ctx context.Context, id string, cntr containerstore.Container) error {
// Reuse handleContainerExit to do the cleanup.
// NOTE(random-liu): If the task did exit after containerd started, both
// the event monitor and the cleanup function would update the container
// state. The final container state will be whatever being updated first.
// There is no way to completely avoid this race condition, and for best
// effort unknown state container cleanup, this seems acceptable.
return handleContainerExit(ctx, &eventtypes.TaskExit{
ContainerID: id,
ID: id,
Pid: 0,
ExitStatus: status.ExitStatus,
ExitedAt: status.ExitTime,
ExitStatus: unknownExitCode,
ExitedAt: time.Now(),
}, cntr)
}
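
The subtle part of the new stopContainer path is the shutdown order: exitCancel() runs and stopCh is drained before the earlier-deferred waitCancel fires, so cancelling Wait can never be turned into a fake exit event. Below is a standalone sketch of that ordering using plain channels; startExitMonitorSketch is a made-up stand-in for eventMonitor.startExitMonitor, not the real API.

package main

import (
    "context"
    "fmt"
    "time"
)

// startExitMonitorSketch mirrors the shape of eventMonitor.startExitMonitor:
// forward one result from exitCh unless ctx is cancelled first, and close
// stopCh when done.
func startExitMonitorSketch(ctx context.Context, exitCh <-chan int, out chan<- int) <-chan struct{} {
    stopCh := make(chan struct{})
    go func() {
        defer close(stopCh)
        select {
        case code := <-exitCh:
            out <- code
        case <-ctx.Done():
            // Cancelled: emit nothing.
        }
    }()
    return stopCh
}

func main() {
    waitCh := make(chan int, 1) // stands in for the channel returned by task.Wait
    out := make(chan int, 1)    // stands in for eventMonitor.exitCh

    exitCtx, exitCancel := context.WithCancel(context.Background())
    stopCh := startExitMonitorSketch(exitCtx, waitCh, out)

    // Shutdown order used by stopContainer: stop the monitor first, then let
    // the Wait channel be torn down, so a cancelled Wait can never be
    // mistaken for a real exit.
    exitCancel()
    <-stopCh
    close(waitCh) // Wait tear-down only after the monitor has stopped.

    select {
    case code := <-out:
        fmt.Println("unexpected exit event:", code)
    case <-time.After(100 * time.Millisecond):
        fmt.Println("no spurious exit event")
    }
}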

View File

@ -50,13 +50,17 @@ const (
// Add a timeout for each event handling, events that timeout will be requeued and
// handled again in the future.
handleEventTimeout = 10 * time.Second
exitChannelSize = 1024
)
// eventMonitor monitors containerd event and updates internal state correspondingly.
// TODO(random-liu): Handle event for each container in a separate goroutine.
type eventMonitor struct {
c *criService
ch <-chan *events.Envelope
c *criService
ch <-chan *events.Envelope
// exitCh receives container/sandbox exit events from exit monitors.
exitCh chan *eventtypes.TaskExit
errCh <-chan error
ctx context.Context
cancel context.CancelFunc
@ -89,6 +93,7 @@ func newEventMonitor(c *criService) *eventMonitor {
c: c,
ctx: ctx,
cancel: cancel,
exitCh: make(chan *eventtypes.TaskExit, exitChannelSize),
backOff: newBackOff(),
}
}
@ -98,13 +103,38 @@ func (em *eventMonitor) subscribe(subscriber events.Subscriber) {
// note: filters are any match, if you want any match but not in namespace foo
// then you have to manually filter namespace foo
filters := []string{
`topic=="/tasks/exit"`,
`topic=="/tasks/oom"`,
`topic~="/images/"`,
}
em.ch, em.errCh = subscriber.Subscribe(em.ctx, filters...)
}
// startExitMonitor starts an exit monitor for a given container/sandbox.
func (em *eventMonitor) startExitMonitor(ctx context.Context, id string, pid uint32, exitCh <-chan containerd.ExitStatus) <-chan struct{} {
stopCh := make(chan struct{})
go func() {
defer close(stopCh)
select {
case exitRes := <-exitCh:
exitStatus, exitedAt, err := exitRes.Result()
if err != nil {
logrus.WithError(err).Errorf("Failed to get task exit status for %q", id)
exitStatus = unknownExitCode
exitedAt = time.Now()
}
em.exitCh <- &eventtypes.TaskExit{
ContainerID: id,
ID: id,
Pid: pid,
ExitStatus: exitStatus,
ExitedAt: exitedAt,
}
case <-ctx.Done():
}
}()
return stopCh
}
func convertEvent(e *gogotypes.Any) (string, interface{}, error) {
id := ""
evt, err := typeurl.UnmarshalAny(e)
@ -113,8 +143,6 @@ func convertEvent(e *gogotypes.Any) (string, interface{}, error) {
}
switch e := evt.(type) {
case *eventtypes.TaskExit:
id = e.ContainerID
case *eventtypes.TaskOOM:
id = e.ContainerID
case *eventtypes.ImageCreate:
@ -142,6 +170,18 @@ func (em *eventMonitor) start() <-chan error {
defer close(errCh)
for {
select {
case e := <-em.exitCh:
logrus.Debugf("Received exit event %+v", e)
id := e.ID
if em.backOff.isInBackOff(id) {
logrus.Infof("Events for %q is in backoff, enqueue event %+v", id, e)
em.backOff.enBackOff(id, e)
break
}
if err := em.handleEvent(e); err != nil {
logrus.WithError(err).Errorf("Failed to handle exit event %+v for %s", e, id)
em.backOff.enBackOff(id, e)
}
case e := <-em.ch:
logrus.Debugf("Received containerd event timestamp - %v, namespace - %q, topic - %q", e.Timestamp, e.Namespace, e.Topic)
if e.Namespace != constants.K8sContainerdNamespace {
@ -213,8 +253,7 @@ func (em *eventMonitor) handleEvent(any interface{}) error {
} else if err != store.ErrNotExist {
return errors.Wrap(err, "can't find container for TaskExit event")
}
// Use GetAll to include sandbox in init state.
sb, err := em.c.sandboxStore.GetAll(e.ID)
sb, err := em.c.sandboxStore.Get(e.ID)
if err == nil {
if err := handleSandboxExit(ctx, e, sb); err != nil {
return errors.Wrap(err, "failed to handle sandbox TaskExit event")
@ -322,15 +361,7 @@ func handleSandboxExit(ctx context.Context, e *eventtypes.TaskExit, sb sandboxst
}
}
err = sb.Status.Update(func(status sandboxstore.Status) (sandboxstore.Status, error) {
// NOTE(random-liu): We SHOULD NOT change INIT state here.
// If sandbox state is INIT when event monitor receives an TaskExit event,
// it means that sandbox start has failed. In that case, `RunPodSandbox` will
// cleanup everything immediately.
// Once sandbox state goes out of INIT, it becomes visable to the user, which
// is not what we want.
if status.State != sandboxstore.StateInit {
status.State = sandboxstore.StateNotReady
}
status.State = sandboxstore.StateNotReady
status.Pid = 0
return status, nil
})
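
To summarize the new flow in this file: exit events no longer come from the /tasks/exit subscription. Instead, every task gets a per-task monitor goroutine (startExitMonitor) that turns the result of task.Wait into a TaskExit value and pushes it onto the buffered exitCh, and the main loop drains exitCh alongside the subscription channel, re-queueing failed events through the back-off queue. A rough standalone sketch of that shape follows; every type and name in it is a simplified stand-in, not containerd's API.

package main

import (
    "fmt"
    "time"
)

// exitEvent is a stand-in for *eventtypes.TaskExit.
type exitEvent struct {
    id       string
    exitCode uint32
    exitedAt time.Time
}

// monitor sketches the relevant parts of eventMonitor: per-task exit monitors
// feed a buffered exitCh, and a single loop drains it.
type monitor struct {
    exitCh chan *exitEvent
}

func newMonitor() *monitor {
    // The real exitChannelSize is 1024; a small buffer is enough here.
    return &monitor{exitCh: make(chan *exitEvent, 16)}
}

// startExitMonitor waits for one exit result and converts it into an exit
// event, the same shape as eventMonitor.startExitMonitor in the diff above.
func (m *monitor) startExitMonitor(id string, waitCh <-chan uint32, done <-chan struct{}) <-chan struct{} {
    stopCh := make(chan struct{})
    go func() {
        defer close(stopCh)
        select {
        case code := <-waitCh:
            m.exitCh <- &exitEvent{id: id, exitCode: code, exitedAt: time.Now()}
        case <-done:
        }
    }()
    return stopCh
}

// run is the dispatch loop: exit events from exitCh are handled first;
// the real loop re-queues failures through backOff.enBackOff.
func (m *monitor) run(handle func(*exitEvent) error, stop <-chan struct{}) {
    for {
        select {
        case e := <-m.exitCh:
            if err := handle(e); err != nil {
                fmt.Println("handle failed, would back off:", err)
            }
        case <-stop:
            return
        }
    }
}

func main() {
    m := newMonitor()
    waitCh := make(chan uint32, 1)
    stop := make(chan struct{})
    m.startExitMonitor("c1", waitCh, stop)

    go m.run(func(e *exitEvent) error {
        fmt.Printf("container %s exited with %d at %s\n", e.id, e.exitCode, e.exitedAt.Format(time.RFC3339))
        return nil
    }, stop)

    waitCh <- 137
    time.Sleep(100 * time.Millisecond)
    close(stop)
}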

View File

@ -21,9 +21,7 @@ import (
"time"
eventtypes "github.com/containerd/containerd/api/events"
//"github.com/containerd/containerd/api/services/events/v1"
"github.com/containerd/typeurl"
//gogotypes "github.com/gogo/protobuf/types"
"github.com/stretchr/testify/assert"
"k8s.io/apimachinery/pkg/util/clock"
)
@ -35,22 +33,22 @@ func TestBackOff(t *testing.T) {
inputQueues := map[string]*backOffQueue{
"container1": {
events: []interface{}{
&eventtypes.TaskExit{ContainerID: "container1", ID: "1"},
&eventtypes.TaskExit{ContainerID: "container1", ID: "2"},
&eventtypes.TaskOOM{ContainerID: "container1"},
&eventtypes.TaskOOM{ContainerID: "container1"},
},
},
"container2": {
events: []interface{}{
&eventtypes.TaskExit{ContainerID: "container2", ID: "1"},
&eventtypes.TaskExit{ContainerID: "container2", ID: "2"},
&eventtypes.TaskOOM{ContainerID: "container2"},
&eventtypes.TaskOOM{ContainerID: "container2"},
},
},
}
expectedQueues := map[string]*backOffQueue{
"container2": {
events: []interface{}{
&eventtypes.TaskExit{ContainerID: "container2", ID: "1"},
&eventtypes.TaskExit{ContainerID: "container2", ID: "2"},
&eventtypes.TaskOOM{ContainerID: "container2"},
&eventtypes.TaskOOM{ContainerID: "container2"},
},
expireTime: testClock.Now().Add(backOffInitDuration),
duration: backOffInitDuration,
@ -58,8 +56,8 @@ func TestBackOff(t *testing.T) {
},
"container1": {
events: []interface{}{
&eventtypes.TaskExit{ContainerID: "container1", ID: "1"},
&eventtypes.TaskExit{ContainerID: "container1", ID: "2"},
&eventtypes.TaskOOM{ContainerID: "container1"},
&eventtypes.TaskOOM{ContainerID: "container1"},
},
expireTime: testClock.Now().Add(backOffInitDuration),
duration: backOffInitDuration,

View File

@ -23,12 +23,9 @@ import (
"regexp"
"strconv"
"strings"
"time"
"github.com/BurntSushi/toml"
"github.com/containerd/containerd"
"github.com/containerd/containerd/containers"
"github.com/containerd/containerd/errdefs"
"github.com/containerd/containerd/runtime/linux/runctypes"
runcoptions "github.com/containerd/containerd/runtime/v2/runc/options"
"github.com/containerd/typeurl"
@ -477,31 +474,6 @@ func unknownSandboxStatus() sandboxstore.Status {
}
}
// unknownExitStatus generates containerd.Status for container exited with unknown exit code.
func unknownExitStatus() containerd.Status {
return containerd.Status{
Status: containerd.Stopped,
ExitStatus: unknownExitCode,
ExitTime: time.Now(),
}
}
// getTaskStatus returns status for a given task. It returns unknown exit status if
// the task is nil or not found.
func getTaskStatus(ctx context.Context, task containerd.Task) (containerd.Status, error) {
if task == nil {
return unknownExitStatus(), nil
}
status, err := task.Status(ctx)
if err != nil {
if !errdefs.IsNotFound(err) {
return containerd.Status{}, err
}
return unknownExitStatus(), nil
}
return status, nil
}
// getPassthroughAnnotations filters requested pod annotations by comparing
// against permitted annotations for the given runtime.
func getPassthroughAnnotations(podAnnotations map[string]string,

View File

@ -34,6 +34,7 @@ import (
"golang.org/x/net/context"
runtime "k8s.io/kubernetes/pkg/kubelet/apis/cri/runtime/v1alpha2"
ctrdutil "github.com/containerd/cri/pkg/containerd/util"
"github.com/containerd/cri/pkg/netns"
cio "github.com/containerd/cri/pkg/server/io"
containerstore "github.com/containerd/cri/pkg/store/container"
@ -57,7 +58,7 @@ func (c *criService) recover(ctx context.Context) error {
return errors.Wrap(err, "failed to list sandbox containers")
}
for _, sandbox := range sandboxes {
sb, err := loadSandbox(ctx, sandbox)
sb, err := c.loadSandbox(ctx, sandbox)
if err != nil {
logrus.WithError(err).Errorf("Failed to load sandbox %q", sandbox.ID())
continue
@ -275,6 +276,22 @@ func (c *criService) loadContainer(ctx context.Context, cntr containerd.Containe
status.StartedAt = time.Now().UnixNano()
status.Pid = t.Pid()
}
// Wait for the task for exit monitor.
// wait is a long running background request, no timeout needed.
exitCh, err := t.Wait(ctrdutil.NamespacedContext())
if err != nil {
if !errdefs.IsNotFound(err) {
return errors.Wrap(err, "failed to wait for task")
}
// Container was in running state, but its task has been deleted,
// set unknown exited state.
status.FinishedAt = time.Now().UnixNano()
status.ExitCode = unknownExitCode
status.Reason = unknownExitReason
} else {
// Start exit monitor.
c.eventMonitor.startExitMonitor(context.Background(), id, status.Pid, exitCh)
}
case containerd.Stopped:
// Task is stopped. Update status and delete the task.
if _, err := t.Delete(ctx, containerd.WithProcessKill); err != nil && !errdefs.IsNotFound(err) {
@ -304,7 +321,7 @@ func (c *criService) loadContainer(ctx context.Context, cntr containerd.Containe
}
// loadSandbox loads sandbox from containerd.
func loadSandbox(ctx context.Context, cntr containerd.Container) (sandboxstore.Sandbox, error) {
func (c *criService) loadSandbox(ctx context.Context, cntr containerd.Container) (sandboxstore.Sandbox, error) {
ctx, cancel := context.WithTimeout(ctx, loadContainerTimeout)
defer cancel()
var sandbox sandboxstore.Sandbox
@ -358,9 +375,20 @@ func loadSandbox(ctx context.Context, cntr containerd.Container) (sandboxstore.S
status.State = sandboxstore.StateNotReady
} else {
if taskStatus.Status == containerd.Running {
// Task is running, set sandbox state as READY.
status.State = sandboxstore.StateReady
status.Pid = t.Pid()
// Wait for the task for sandbox monitor.
// wait is a long running background request, no timeout needed.
exitCh, err := t.Wait(ctrdutil.NamespacedContext())
if err != nil {
if !errdefs.IsNotFound(err) {
return status, errors.Wrap(err, "failed to wait for task")
}
status.State = sandboxstore.StateNotReady
} else {
// Task is running, set sandbox state as READY.
status.State = sandboxstore.StateReady
status.Pid = t.Pid()
c.eventMonitor.startExitMonitor(context.Background(), meta.ID, status.Pid, exitCh)
}
} else {
// Task is not running. Delete the task and set sandbox state as NOTREADY.
if _, err := t.Delete(ctx, containerd.WithProcessKill); err != nil && !errdefs.IsNotFound(err) {
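
During recovery, loadContainer (and loadSandbox below) now re-attach a Wait channel for every running task; if the task has already disappeared, the container is recorded as exited with the unknown exit code instead of staying in a running state forever. The following standalone sketch shows that decision, with hypothetical wait/startMonitor callbacks standing in for task.Wait and eventMonitor.startExitMonitor; the 255/"Unknown" values mirror unknownExitCode and unknownExitReason.

package main

import (
    "errors"
    "fmt"
    "time"
)

// errNotFound stands in for errdefs.ErrNotFound.
var errNotFound = errors.New("not found")

// recoveredStatus is a cut-down version of the fields loadContainer touches.
type recoveredStatus struct {
    FinishedAt int64
    ExitCode   uint32
    Reason     string
    MonitorOn  bool
}

// attachOrMarkExited sketches the recovery decision: try to re-attach a Wait
// channel for a running task; if the task is already gone, record an unknown
// exit instead of leaving the container stuck in running state.
func attachOrMarkExited(
    wait func() (<-chan uint32, error),
    startMonitor func(<-chan uint32),
) (recoveredStatus, error) {
    var st recoveredStatus
    exitCh, err := wait()
    if err != nil {
        if !errors.Is(err, errNotFound) {
            return st, fmt.Errorf("failed to wait for task: %w", err)
        }
        // Task was deleted while containerd was down: synthesize an unknown exit.
        st.FinishedAt = time.Now().UnixNano()
        st.ExitCode = 255 // unknownExitCode
        st.Reason = "Unknown"
        return st, nil
    }
    startMonitor(exitCh)
    st.MonitorOn = true
    return st, nil
}

func main() {
    // Simulate a task whose shim is already gone.
    st, _ := attachOrMarkExited(
        func() (<-chan uint32, error) { return nil, errNotFound },
        func(<-chan uint32) {},
    )
    fmt.Printf("%+v\n", st)
}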

View File

@ -87,7 +87,7 @@ func (c *criService) RunPodSandbox(ctx context.Context, r *runtime.RunPodSandbox
RuntimeHandler: r.GetRuntimeHandler(),
},
sandboxstore.Status{
State: sandboxstore.StateInit,
State: sandboxstore.StateUnknown,
},
)
@ -253,88 +253,65 @@ func (c *criService) RunPodSandbox(ctx context.Context, r *runtime.RunPodSandbox
if err != nil {
return nil, errors.Wrap(err, "failed to get sandbox container info")
}
// Create sandbox task in containerd.
log.Tracef("Create sandbox container (id=%q, name=%q).",
id, name)
var taskOpts []containerd.NewTaskOpts
// TODO(random-liu): Remove this after shim v1 is deprecated.
if c.config.NoPivot && ociRuntime.Type == linuxRuntime {
taskOpts = append(taskOpts, containerd.WithNoPivotRoot)
}
// We don't need stdio for sandbox container.
task, err := container.NewTask(ctx, containerdio.NullIO, taskOpts...)
if err != nil {
return nil, errors.Wrap(err, "failed to create containerd task")
}
defer func() {
if retErr != nil {
deferCtx, deferCancel := ctrdutil.DeferContext()
defer deferCancel()
// Cleanup the sandbox container if an error is returned.
if _, err := task.Delete(deferCtx, containerd.WithProcessKill); err != nil && !errdefs.IsNotFound(err) {
logrus.WithError(err).Errorf("Failed to delete sandbox container %q", id)
}
}
}()
// wait is a long running background request, no timeout needed.
exitCh, err := task.Wait(ctrdutil.NamespacedContext())
if err != nil {
return nil, errors.Wrap(err, "failed to wait for sandbox container task")
}
if err := task.Start(ctx); err != nil {
return nil, errors.Wrapf(err, "failed to start sandbox container task %q", id)
}
if err := sandbox.Status.Update(func(status sandboxstore.Status) (sandboxstore.Status, error) {
// Set the pod sandbox as ready after successfully start sandbox container.
status.Pid = task.Pid()
status.State = sandboxstore.StateReady
status.CreatedAt = info.CreatedAt
return status, nil
}); err != nil {
return nil, errors.Wrap(err, "failed to update sandbox created timestamp")
return nil, errors.Wrap(err, "failed to update sandbox status")
}
// Add sandbox into sandbox store in INIT state.
sandbox.Container = container
if err := c.sandboxStore.Add(sandbox); err != nil {
return nil, errors.Wrapf(err, "failed to add sandbox %+v into store", sandbox)
}
defer func() {
// Delete sandbox from sandbox store if there is an error.
if retErr != nil {
c.sandboxStore.Delete(id)
}
}()
// NOTE(random-liu): Sandbox state only stay in INIT state after this point
// and before the end of this function.
// * If `Update` succeeds, sandbox state will become READY in one transaction.
// * If `Update` fails, sandbox will be removed from the store in the defer above.
// * If containerd stops at any point before `Update` finishes, because sandbox
// state is not checkpointed, it will be recovered from corresponding containerd task
// status during restart:
// * If the task is running, sandbox state will be READY,
// * Or else, sandbox state will be NOTREADY.
// start the monitor after adding the sandbox into the store, this ensures
// that the sandbox is in the store when the event monitor receives the TaskExit event.
//
// In any case, sandbox will leave INIT state, so it's safe to ignore sandbox
// in INIT state in other functions.
// Start sandbox container in one transaction to avoid race condition with
// event monitor.
if err := sandbox.Status.Update(func(status sandboxstore.Status) (_ sandboxstore.Status, retErr error) {
// NOTE(random-liu): We should not change the sandbox state to NOTREADY
// if `Update` fails.
//
// If `Update` fails, the sandbox will be cleaned up by all the defers
// above. We should not let user see this sandbox, or else they will
// see the sandbox disappear after the defer clean up, which may confuse
// them.
//
// Given so, we should keep the sandbox in INIT state if `Update` fails,
// and ignore sandbox in INIT state in all the inspection functions.
// Create sandbox task in containerd.
log.Tracef("Create sandbox container (id=%q, name=%q).",
id, name)
var taskOpts []containerd.NewTaskOpts
// TODO(random-liu): Remove this after shim v1 is deprecated.
if c.config.NoPivot && ociRuntime.Type == linuxRuntime {
taskOpts = append(taskOpts, containerd.WithNoPivotRoot)
}
// We don't need stdio for sandbox container.
task, err := container.NewTask(ctx, containerdio.NullIO, taskOpts...)
if err != nil {
return status, errors.Wrap(err, "failed to create containerd task")
}
defer func() {
if retErr != nil {
deferCtx, deferCancel := ctrdutil.DeferContext()
defer deferCancel()
// Cleanup the sandbox container if an error is returned.
// It's possible that task is deleted by event monitor.
if _, err := task.Delete(deferCtx, containerd.WithProcessKill); err != nil && !errdefs.IsNotFound(err) {
logrus.WithError(err).Errorf("Failed to delete sandbox container %q", id)
}
}
}()
if err := task.Start(ctx); err != nil {
return status, errors.Wrapf(err, "failed to start sandbox container task %q", id)
}
// Set the pod sandbox as ready after successfully start sandbox container.
status.Pid = task.Pid()
status.State = sandboxstore.StateReady
return status, nil
}); err != nil {
return nil, errors.Wrap(err, "failed to start sandbox container")
}
// TaskOOM from containerd may come before sandbox is added to store,
// but we don't care about sandbox TaskOOM right now, so it is fine.
c.eventMonitor.startExitMonitor(context.Background(), id, task.Pid(), exitCh)
return &runtime.RunPodSandboxResponse{PodSandboxId: id}, nil
}
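
The ordering in RunPodSandbox mirrors StartContainer: task.Wait is set up before task.Start so an exit immediately after start cannot be missed, and startExitMonitor is only called once the sandbox is in the store so the exit handler is guaranteed to find it. Here is a minimal sketch of just that ordering; all of the function parameters are hypothetical stand-ins for the containerd calls named in the comments.

package main

import (
    "fmt"
)

// startAndMonitor sketches the ordering RunPodSandbox (and StartContainer)
// now follow. All parameters are hypothetical stand-ins.
func startAndMonitor(
    wait func() (<-chan uint32, error), // task.Wait
    start func() error,                 // task.Start
    addToStore func() error,            // sandboxStore.Add
    startMonitor func(<-chan uint32),   // eventMonitor.startExitMonitor
) error {
    // 1. Wait is set up *before* Start, so an exit that happens right after
    //    Start can never be missed.
    exitCh, err := wait()
    if err != nil {
        return fmt.Errorf("failed to wait for task: %w", err)
    }
    // 2. Start the task.
    if err := start(); err != nil {
        return fmt.Errorf("failed to start task: %w", err)
    }
    // 3. Make the sandbox visible in the store first...
    if err := addToStore(); err != nil {
        return fmt.Errorf("failed to add sandbox into store: %w", err)
    }
    // 4. ...and only then hand exitCh to the monitor, so the exit handler is
    //    guaranteed to find the sandbox when the event fires.
    startMonitor(exitCh)
    return nil
}

func main() {
    exit := make(chan uint32, 1)
    err := startAndMonitor(
        func() (<-chan uint32, error) { return exit, nil },
        func() error { return nil },
        func() error { return nil },
        func(<-chan uint32) {},
    )
    fmt.Println(err)
}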

View File

@ -19,7 +19,6 @@ package server
import (
"time"
"github.com/containerd/containerd"
eventtypes "github.com/containerd/containerd/api/events"
"github.com/containerd/containerd/errdefs"
cni "github.com/containerd/go-cni"
@ -29,6 +28,7 @@ import (
"golang.org/x/sys/unix"
runtime "k8s.io/kubernetes/pkg/kubelet/apis/cri/runtime/v1alpha2"
ctrdutil "github.com/containerd/cri/pkg/containerd/util"
sandboxstore "github.com/containerd/cri/pkg/store/sandbox"
)
@ -97,6 +97,7 @@ func (c *criService) StopPodSandbox(ctx context.Context, r *runtime.StopPodSandb
// `task.Delete` is not called here because it will be called when
// the event monitor handles the `TaskExit` event.
func (c *criService) stopSandboxContainer(ctx context.Context, sandbox sandboxstore.Sandbox) error {
id := sandbox.ID
container := sandbox.Container
state := sandbox.Status.Get().State
task, err := container.Task(ctx, nil)
@ -105,29 +106,35 @@ func (c *criService) stopSandboxContainer(ctx context.Context, sandbox sandboxst
return errors.Wrap(err, "failed to get sandbox container")
}
// Don't return for unknown state, some cleanup needs to be done.
if state != sandboxstore.StateUnknown {
return nil
if state == sandboxstore.StateUnknown {
return cleanupUnknownSandbox(ctx, id, sandbox)
}
// Task is an interface, explicitly set it to nil just in case.
task = nil
return nil
}
// Handle unknown state.
// The cleanup logic is the same with container unknown state.
if state == sandboxstore.StateUnknown {
status, err := getTaskStatus(ctx, task)
// Start an exit handler for containers in unknown state.
waitCtx, waitCancel := context.WithCancel(ctrdutil.NamespacedContext())
defer waitCancel()
exitCh, err := task.Wait(waitCtx)
if err != nil {
return errors.Wrapf(err, "failed to get task status for %q", sandbox.ID)
}
switch status.Status {
case containerd.Running, containerd.Created:
// The task is still running, continue stopping the task.
case containerd.Stopped:
// The task has exited, explicitly cleanup.
return cleanupUnknownSandbox(ctx, sandbox.ID, status, sandbox)
default:
return errors.Wrapf(err, "unsupported task status %q", status.Status)
if !errdefs.IsNotFound(err) {
return errors.Wrap(err, "failed to wait for task")
}
return cleanupUnknownSandbox(ctx, id, sandbox)
}
exitCtx, exitCancel := context.WithCancel(context.Background())
stopCh := c.eventMonitor.startExitMonitor(exitCtx, id, task.Pid(), exitCh)
defer func() {
exitCancel()
// This ensures that exit monitor is stopped before
// `Wait` is cancelled, so no exit event is generated
// because of the `Wait` cancellation.
<-stopCh
}()
}
// Kill the sandbox container.
@ -166,14 +173,13 @@ func (c *criService) teardownPod(id string, path string, config *runtime.PodSand
}
// cleanupUnknownSandbox cleanup stopped sandbox in unknown state.
func cleanupUnknownSandbox(ctx context.Context, id string, status containerd.Status,
sandbox sandboxstore.Sandbox) error {
func cleanupUnknownSandbox(ctx context.Context, id string, sandbox sandboxstore.Sandbox) error {
// Reuse handleSandboxExit to do the cleanup.
return handleSandboxExit(ctx, &eventtypes.TaskExit{
ContainerID: id,
ID: id,
Pid: 0,
ExitStatus: status.ExitStatus,
ExitedAt: status.ExitTime,
ExitStatus: unknownExitCode,
ExitedAt: time.Now(),
}, sandbox)
}

View File

@ -112,7 +112,7 @@ func TestContainerStore(t *testing.T) {
ExitCode: 3,
Reason: "TestReason-4a333",
Message: "TestMessage-4a333",
Removing: true,
Starting: true,
},
"4abcd": {
Pid: 4,

View File

@ -88,6 +88,10 @@ type Status struct {
// Human-readable message indicating details about why container is in its
// current state.
Message string
// Starting indicates that the container is in starting state.
// This field doesn't need to be checkpointed.
// TODO(now): Add unit test.
Starting bool `json:"-"`
// Removing indicates that the container is in removing state.
// This field doesn't need to be checkpointed.
Removing bool `json:"-"`
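
Both new flags are runtime-only: the json:"-" tag keeps them out of the checkpoint, so after a containerd restart Starting and Removing always come back false. A tiny standalone example of that behaviour follows (the status struct is a cut-down stand-in for containerstore.Status).

package main

import (
    "encoding/json"
    "fmt"
)

// status mirrors the checkpointing behaviour of containerstore.Status:
// Starting and Removing carry `json:"-"` so they are never written to disk
// and always come back false after a restart.
type status struct {
    Pid      uint32 `json:"pid"`
    Starting bool   `json:"-"`
    Removing bool   `json:"-"`
}

func main() {
    data, _ := json.Marshal(status{Pid: 42, Starting: true, Removing: true})
    fmt.Println(string(data)) // {"pid":42}: the transient flags are dropped

    var loaded status
    _ = json.Unmarshal(data, &loaded)
    fmt.Printf("%+v\n", loaded) // {Pid:42 Starting:false Removing:false}
}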

View File

@ -75,6 +75,7 @@ func TestStatusEncodeDecode(t *testing.T) {
Reason: "test-reason",
Message: "test-message",
Removing: true,
Starting: true,
}
assert := assertlib.New(t)
data, err := s.encode()
@ -82,6 +83,7 @@ func TestStatusEncodeDecode(t *testing.T) {
newS := &Status{}
assert.NoError(newS.decode(data))
s.Removing = false // Removing should not be encoded.
s.Starting = false // Starting should not be encoded.
assert.Equal(s, newS)
unsupported, err := json.Marshal(&versionedStatus{

View File

@ -86,22 +86,9 @@ func (s *Store) Add(sb Sandbox) error {
return nil
}
// Get returns the sandbox with specified id. Returns store.ErrNotExist
// if the sandbox doesn't exist.
// Get returns the sandbox with specified id.
// Returns store.ErrNotExist if the sandbox doesn't exist.
func (s *Store) Get(id string) (Sandbox, error) {
sb, err := s.GetAll(id)
if err != nil {
return sb, err
}
if sb.Status.Get().State == StateInit {
return Sandbox{}, store.ErrNotExist
}
return sb, nil
}
// GetAll returns the sandbox with specified id, including sandbox in unknown
// state. Returns store.ErrNotExist if the sandbox doesn't exist.
func (s *Store) GetAll(id string) (Sandbox, error) {
s.lock.RLock()
defer s.lock.RUnlock()
id, err := s.idIndex.Get(id)
@ -123,9 +110,6 @@ func (s *Store) List() []Sandbox {
defer s.lock.RUnlock()
var sandboxes []Sandbox
for _, sb := range s.sandboxes {
if sb.Status.Get().State == StateInit {
continue
}
sandboxes = append(sandboxes, sb)
}
return sandboxes

View File

@ -106,7 +106,7 @@ func TestSandboxStore(t *testing.T) {
},
NetNSPath: "TestNetNS-3defg",
},
Status{State: StateInit},
Status{State: StateUnknown},
)
assert := assertlib.New(t)
s := NewStore()
@ -125,21 +125,16 @@ func TestSandboxStore(t *testing.T) {
assert.Equal(sb, got)
}
t.Logf("should not be able to get unknown sandbox")
t.Logf("should be able to get sandbox in unknown state with Get")
got, err := s.Get(unknown.ID)
assert.Equal(store.ErrNotExist, err)
assert.Equal(Sandbox{}, got)
t.Logf("should be able to get unknown sandbox with GetAll")
got, err = s.GetAll(unknown.ID)
assert.NoError(err)
assert.Equal(unknown, got)
t.Logf("should be able to list sandboxes")
sbNum := len(sandboxes) + 1
sbs := s.List()
assert.Len(sbs, len(sandboxes))
assert.Len(sbs, sbNum)
sbNum := len(sandboxes)
for testID, v := range sandboxes {
truncID := genTruncIndex(testID)

View File

@ -26,11 +26,11 @@ import (
// | |
// | Create(Run) | Load
// | |
// Start +----v----+ |
// (failed) | | |
// +-------------+ INIT | +-----------+
// | | | | |
// | +----+----+ | |
// Start | |
// (failed) | |
// +------------------+ +-----------+
// | | | |
// | | | |
// | | | |
// | | Start(Run) | |
// | | | |
@ -53,23 +53,17 @@ import (
// +-------------> DELETED
// State is the sandbox state we use in containerd/cri.
// It includes init and unknown, which are internal states not defined in CRI.
// It includes unknown, which is an internal state not defined in CRI.
// The state mapping from internal states to CRI states:
// * ready -> ready
// * not ready -> not ready
// * init -> not exist
// * unknown -> not ready
type State uint32
const (
// StateInit is init state of sandbox. Sandbox
// is in init state before its corresponding sandbox container
// is created. Sandbox in init state should be ignored by most
// functions, unless the caller needs to update sandbox state.
StateInit State = iota
// StateReady is ready state, it means sandbox container
// is running.
StateReady
StateReady = iota
// StateNotReady is notready state, it ONLY means sandbox
// container is not running.
// StopPodSandbox should still be called for NOTREADY sandbox to

View File

@ -28,7 +28,7 @@ func TestStatus(t *testing.T) {
testStatus := Status{
Pid: 123,
CreatedAt: time.Now(),
State: StateInit,
State: StateUnknown,
}
updateStatus := Status{
Pid: 456,