Add timeout for container/sandbox recover and event monitor.

Signed-off-by: Lantao Liu <lantaol@google.com>
2018-08-23 21:34:59 -07:00 · 2018-08-23 21:34:59 -07:00 · 963a01735b
commit 963a01735b
parent 6de38f1f3a
3 changed files with 32 additions and 6 deletions
--- a/pkg/server/events.go
+++ b/pkg/server/events.go
@ -42,11 +42,17 @@ const (
 	backOffInitDuration        = 1 * time.Second
 	backOffMaxDuration         = 5 * time.Minute
 	backOffExpireCheckDuration = 1 * time.Second
+
+	// handleEventTimeout is the timeout for handling 1 event. Event monitor
+	// handles events serially; if one event blocks the event monitor, no
+	// other events can be handled.
+	// Add a timeout for each event handling; events that time out will be requeued and
+	// handled again in the future.
+	handleEventTimeout = 10 * time.Second
 )

 // eventMonitor monitors containerd event and updates internal state correspondingly.
-// TODO(random-liu): [P1] Figure out is it possible to drop event during containerd
-// is running. If it is, we should do periodically list to sync state with containerd.
+// TODO(random-liu): Handle event for each container in a separate goroutine.
 type eventMonitor struct {
 	containerStore *containerstore.Store
 	sandboxStore   *sandboxstore.Store
@ -189,6 +195,9 @@ func (em *eventMonitor) stop() {
 // handleEvent handles a containerd event.
 func (em *eventMonitor) handleEvent(any interface{}) error {
 	ctx := ctrdutil.NamespacedContext()
+	ctx, cancel := context.WithTimeout(ctx, handleEventTimeout)
+	defer cancel()
+
 	switch any.(type) {
 	// If containerd-shim exits unexpectedly, there will be no corresponding event.
 	// However, containerd could not retrieve container state in that case, so it's
--- a/pkg/server/restart.go
+++ b/pkg/server/restart.go
@ -136,8 +136,23 @@ func (c *criService) recover(ctx context.Context) error {
 	return nil
 }

+// loadContainerTimeout is the default timeout for loading a container/sandbox.
+// A single hanging container/sandbox (e.g. containerd#2438) should not affect other
+// containers/sandboxes.
+// Most CRI container/sandbox related operations are per container, the ones
+// which handle multiple containers at a time are:
+// * ListPodSandboxes: Don't talk with containerd services.
+// * ListContainers: Don't talk with containerd services.
+// * ListContainerStats: Not in critical code path, a default timeout will
+// be applied at CRI level.
+// * Recovery logic: We should set a timeout for each container/sandbox recovery.
+// * Event monitor: We should set a timeout for each container/sandbox event handling.
+const loadContainerTimeout = 10 * time.Second
+
 // loadContainer loads container from containerd and status checkpoint.
 func (c *criService) loadContainer(ctx context.Context, cntr containerd.Container) (containerstore.Container, error) {
+	ctx, cancel := context.WithTimeout(ctx, loadContainerTimeout)
+	defer cancel()
 	id := cntr.ID()
 	containerDir := c.getContainerRootDir(id)
 	volatileContainerDir := c.getVolatileContainerRootDir(id)
@ -290,9 +305,9 @@ const (
 // unknownContainerStatus returns the default container status when its status is unknown.
 func unknownContainerStatus() containerstore.Status {
 	return containerstore.Status{
-		CreatedAt:  time.Now().UnixNano(),
-		StartedAt:  time.Now().UnixNano(),
-		FinishedAt: time.Now().UnixNano(),
+		CreatedAt:  0,
+		StartedAt:  0,
+		FinishedAt: 0,
 		ExitCode:   unknownExitCode,
 		Reason:     unknownExitReason,
 	}
@ -300,6 +315,8 @@ func unknownContainerStatus() containerstore.Status {

 // loadSandbox loads sandbox from containerd.
 func loadSandbox(ctx context.Context, cntr containerd.Container) (sandboxstore.Sandbox, error) {
+	ctx, cancel := context.WithTimeout(ctx, loadContainerTimeout)
+	defer cancel()
 	var sandbox sandboxstore.Sandbox
 	// Load sandbox metadata.
 	exts, err := cntr.Extensions(ctx)
--- a/pkg/store/image/fake_image.go
+++ b/pkg/store/image/fake_image.go
@ -27,7 +27,7 @@ func NewFakeStore(images []Image) (*Store, error) {
 			s.refCache[ref] = i.ID
 		}
 		if err := s.store.add(i); err != nil {
-			return nil, errors.Wrapf(err, "add image %q", i)
+			return nil, errors.Wrapf(err, "add image %+v", i)
 		}
 	}
 	return s, nil