diff --git a/pkg/server/events.go b/pkg/server/events.go index c3ab1dfd5..e6e0321b6 100644 --- a/pkg/server/events.go +++ b/pkg/server/events.go @@ -42,11 +42,17 @@ const ( backOffInitDuration = 1 * time.Second backOffMaxDuration = 5 * time.Minute backOffExpireCheckDuration = 1 * time.Second + + // handleEventTimeout is the timeout for handling 1 event. Event monitor + // handles events in serial, if one event blocks the event monitor, no + // other events can be handled. + // Add a timeout for each event handling, events that timeout will be requeued and + // handled again in the future. + handleEventTimeout = 10 * time.Second ) // eventMonitor monitors containerd event and updates internal state correspondingly. -// TODO(random-liu): [P1] Figure out is it possible to drop event during containerd -// is running. If it is, we should do periodically list to sync state with containerd. +// TODO(random-liu): Handle event for each container in a separate goroutine. type eventMonitor struct { containerStore *containerstore.Store sandboxStore *sandboxstore.Store @@ -189,6 +195,9 @@ func (em *eventMonitor) stop() { // handleEvent handles a containerd event. func (em *eventMonitor) handleEvent(any interface{}) error { ctx := ctrdutil.NamespacedContext() + ctx, cancel := context.WithTimeout(ctx, handleEventTimeout) + defer cancel() + switch any.(type) { // If containerd-shim exits unexpectedly, there will be no corresponding event. // However, containerd could not retrieve container state in that case, so it's diff --git a/pkg/server/restart.go b/pkg/server/restart.go index 560c75f0f..bd25d152a 100644 --- a/pkg/server/restart.go +++ b/pkg/server/restart.go @@ -136,8 +136,23 @@ func (c *criService) recover(ctx context.Context) error { return nil } +// loadContainerTimeout is the default timeout for loading a container/sandbox. +// One container/sandbox hangs (e.g. containerd#2438) should not affect other +// containers/sandboxes. 
+// Most CRI container/sandbox related operations are per container, the ones +// which handle multiple containers at a time are: +// * ListPodSandboxes: Don't talk with containerd services. +// * ListContainers: Don't talk with containerd services. +// * ListContainerStats: Not in critical code path, a default timeout will +// be applied at CRI level. +// * Recovery logic: We should set a timeout for each container/sandbox recovery. +// * Event monitor: We should set a timeout for each container/sandbox event handling. +const loadContainerTimeout = 10 * time.Second + // loadContainer loads container from containerd and status checkpoint. func (c *criService) loadContainer(ctx context.Context, cntr containerd.Container) (containerstore.Container, error) { + ctx, cancel := context.WithTimeout(ctx, loadContainerTimeout) + defer cancel() id := cntr.ID() containerDir := c.getContainerRootDir(id) volatileContainerDir := c.getVolatileContainerRootDir(id) @@ -290,9 +305,9 @@ const ( // unknownContainerStatus returns the default container status when its status is unknown. func unknownContainerStatus() containerstore.Status { return containerstore.Status{ - CreatedAt: time.Now().UnixNano(), - StartedAt: time.Now().UnixNano(), - FinishedAt: time.Now().UnixNano(), + CreatedAt: 0, + StartedAt: 0, + FinishedAt: 0, ExitCode: unknownExitCode, Reason: unknownExitReason, } @@ -300,6 +315,8 @@ func unknownContainerStatus() containerstore.Status { // loadSandbox loads sandbox from containerd. func loadSandbox(ctx context.Context, cntr containerd.Container) (sandboxstore.Sandbox, error) { + ctx, cancel := context.WithTimeout(ctx, loadContainerTimeout) + defer cancel() var sandbox sandboxstore.Sandbox // Load sandbox metadata. 
exts, err := cntr.Extensions(ctx) diff --git a/pkg/store/image/fake_image.go b/pkg/store/image/fake_image.go index b82d5c408..6d74d2417 100644 --- a/pkg/store/image/fake_image.go +++ b/pkg/store/image/fake_image.go @@ -27,7 +27,7 @@ func NewFakeStore(images []Image) (*Store, error) { s.refCache[ref] = i.ID } if err := s.store.add(i); err != nil { - return nil, errors.Wrapf(err, "add image %q", i) + return nil, errors.Wrapf(err, "add image %+v", i) } } return s, nil