Add timeout for container/sandbox recover and event monitor.

Signed-off-by: Lantao Liu <lantaol@google.com>
This commit is contained in:
Lantao Liu 2018-08-23 21:34:59 -07:00
parent 6de38f1f3a
commit 963a01735b
3 changed files with 32 additions and 6 deletions

View File

@ -42,11 +42,17 @@ const (
backOffInitDuration = 1 * time.Second backOffInitDuration = 1 * time.Second
backOffMaxDuration = 5 * time.Minute backOffMaxDuration = 5 * time.Minute
backOffExpireCheckDuration = 1 * time.Second backOffExpireCheckDuration = 1 * time.Second
// handleEventTimeout is the timeout for handling a single event. The event
// monitor handles events serially, so if one event blocks the event monitor,
// no other events can be handled.
// Add a timeout for each event handling; events that time out will be requeued
// and handled again in the future.
handleEventTimeout = 10 * time.Second
) )
// eventMonitor monitors containerd event and updates internal state correspondingly. // eventMonitor monitors containerd event and updates internal state correspondingly.
// TODO(random-liu): [P1] Figure out is it possible to drop event during containerd // TODO(random-liu): Handle event for each container in a separate goroutine.
// is running. If it is, we should do periodically list to sync state with containerd.
type eventMonitor struct { type eventMonitor struct {
containerStore *containerstore.Store containerStore *containerstore.Store
sandboxStore *sandboxstore.Store sandboxStore *sandboxstore.Store
@ -189,6 +195,9 @@ func (em *eventMonitor) stop() {
// handleEvent handles a containerd event. // handleEvent handles a containerd event.
func (em *eventMonitor) handleEvent(any interface{}) error { func (em *eventMonitor) handleEvent(any interface{}) error {
ctx := ctrdutil.NamespacedContext() ctx := ctrdutil.NamespacedContext()
ctx, cancel := context.WithTimeout(ctx, handleEventTimeout)
defer cancel()
switch any.(type) { switch any.(type) {
// If containerd-shim exits unexpectedly, there will be no corresponding event. // If containerd-shim exits unexpectedly, there will be no corresponding event.
// However, containerd could not retrieve container state in that case, so it's // However, containerd could not retrieve container state in that case, so it's

View File

@ -136,8 +136,23 @@ func (c *criService) recover(ctx context.Context) error {
return nil return nil
} }
// loadContainerTimeout is the default timeout for loading a container/sandbox.
// A hang in one container/sandbox (e.g. containerd#2438) should not affect other
// containers/sandboxes.
// Most CRI container/sandbox related operations are per container, the ones
// which handle multiple containers at a time are:
// * ListPodSandboxes: Doesn't talk to containerd services.
// * ListContainers: Doesn't talk to containerd services.
// * ListContainerStats: Not in critical code path, a default timeout will
// be applied at CRI level.
// * Recovery logic: We should set a timeout for each container/sandbox recovery.
// * Event monitor: We should set a timeout for each container/sandbox event handling.
const loadContainerTimeout = 10 * time.Second
// loadContainer loads container from containerd and status checkpoint. // loadContainer loads container from containerd and status checkpoint.
func (c *criService) loadContainer(ctx context.Context, cntr containerd.Container) (containerstore.Container, error) { func (c *criService) loadContainer(ctx context.Context, cntr containerd.Container) (containerstore.Container, error) {
ctx, cancel := context.WithTimeout(ctx, loadContainerTimeout)
defer cancel()
id := cntr.ID() id := cntr.ID()
containerDir := c.getContainerRootDir(id) containerDir := c.getContainerRootDir(id)
volatileContainerDir := c.getVolatileContainerRootDir(id) volatileContainerDir := c.getVolatileContainerRootDir(id)
@ -290,9 +305,9 @@ const (
// unknownContainerStatus returns the default container status when its status is unknown. // unknownContainerStatus returns the default container status when its status is unknown.
func unknownContainerStatus() containerstore.Status { func unknownContainerStatus() containerstore.Status {
return containerstore.Status{ return containerstore.Status{
CreatedAt: time.Now().UnixNano(), CreatedAt: 0,
StartedAt: time.Now().UnixNano(), StartedAt: 0,
FinishedAt: time.Now().UnixNano(), FinishedAt: 0,
ExitCode: unknownExitCode, ExitCode: unknownExitCode,
Reason: unknownExitReason, Reason: unknownExitReason,
} }
@ -300,6 +315,8 @@ func unknownContainerStatus() containerstore.Status {
// loadSandbox loads sandbox from containerd. // loadSandbox loads sandbox from containerd.
func loadSandbox(ctx context.Context, cntr containerd.Container) (sandboxstore.Sandbox, error) { func loadSandbox(ctx context.Context, cntr containerd.Container) (sandboxstore.Sandbox, error) {
ctx, cancel := context.WithTimeout(ctx, loadContainerTimeout)
defer cancel()
var sandbox sandboxstore.Sandbox var sandbox sandboxstore.Sandbox
// Load sandbox metadata. // Load sandbox metadata.
exts, err := cntr.Extensions(ctx) exts, err := cntr.Extensions(ctx)

View File

@ -27,7 +27,7 @@ func NewFakeStore(images []Image) (*Store, error) {
s.refCache[ref] = i.ID s.refCache[ref] = i.ID
} }
if err := s.store.add(i); err != nil { if err := s.store.add(i); err != nil {
return nil, errors.Wrapf(err, "add image %q", i) return nil, errors.Wrapf(err, "add image %+v", i)
} }
} }
return s, nil return s, nil