Add timeout for container/sandbox recover and event monitor.
Signed-off-by: Lantao Liu <lantaol@google.com>
This commit is contained in:
parent
6de38f1f3a
commit
963a01735b
@ -42,11 +42,17 @@ const (
|
|||||||
backOffInitDuration = 1 * time.Second
|
backOffInitDuration = 1 * time.Second
|
||||||
backOffMaxDuration = 5 * time.Minute
|
backOffMaxDuration = 5 * time.Minute
|
||||||
backOffExpireCheckDuration = 1 * time.Second
|
backOffExpireCheckDuration = 1 * time.Second
|
||||||
|
|
||||||
|
// handleEventTimeout is the timeout for handling 1 event. Event monitor
|
||||||
|
// handles events serially; if one event blocks the event monitor, no
|
||||||
|
// other events can be handled.
|
||||||
|
// Add a timeout for each event handling, events that timeout will be requeued and
|
||||||
|
// handled again in the future.
|
||||||
|
handleEventTimeout = 10 * time.Second
|
||||||
)
|
)
|
||||||
|
|
||||||
// eventMonitor monitors containerd event and updates internal state correspondingly.
|
// eventMonitor monitors containerd event and updates internal state correspondingly.
|
||||||
// TODO(random-liu): [P1] Figure out is it possible to drop event during containerd
|
// TODO(random-liu): Handle event for each container in a separate goroutine.
|
||||||
// is running. If it is, we should do periodically list to sync state with containerd.
|
|
||||||
type eventMonitor struct {
|
type eventMonitor struct {
|
||||||
containerStore *containerstore.Store
|
containerStore *containerstore.Store
|
||||||
sandboxStore *sandboxstore.Store
|
sandboxStore *sandboxstore.Store
|
||||||
@ -189,6 +195,9 @@ func (em *eventMonitor) stop() {
|
|||||||
// handleEvent handles a containerd event.
|
// handleEvent handles a containerd event.
|
||||||
func (em *eventMonitor) handleEvent(any interface{}) error {
|
func (em *eventMonitor) handleEvent(any interface{}) error {
|
||||||
ctx := ctrdutil.NamespacedContext()
|
ctx := ctrdutil.NamespacedContext()
|
||||||
|
ctx, cancel := context.WithTimeout(ctx, handleEventTimeout)
|
||||||
|
defer cancel()
|
||||||
|
|
||||||
switch any.(type) {
|
switch any.(type) {
|
||||||
// If containerd-shim exits unexpectedly, there will be no corresponding event.
|
// If containerd-shim exits unexpectedly, there will be no corresponding event.
|
||||||
// However, containerd could not retrieve container state in that case, so it's
|
// However, containerd could not retrieve container state in that case, so it's
|
||||||
|
@ -136,8 +136,23 @@ func (c *criService) recover(ctx context.Context) error {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// loadContainerTimeout is the default timeout for loading a container/sandbox.
|
||||||
|
// One hanging container/sandbox (e.g. containerd#2438) should not affect other
|
||||||
|
// containers/sandboxes.
|
||||||
|
// Most CRI container/sandbox related operations are per container, the ones
|
||||||
|
// which handle multiple containers at a time are:
|
||||||
|
// * ListPodSandboxes: Don't talk with containerd services.
|
||||||
|
// * ListContainers: Don't talk with containerd services.
|
||||||
|
// * ListContainerStats: Not in critical code path, a default timeout will
|
||||||
|
// be applied at CRI level.
|
||||||
|
// * Recovery logic: We should set a timeout for each container/sandbox recovery.
|
||||||
|
// * Event monitor: We should set a timeout for each container/sandbox event handling.
|
||||||
|
const loadContainerTimeout = 10 * time.Second
|
||||||
|
|
||||||
// loadContainer loads container from containerd and status checkpoint.
|
// loadContainer loads container from containerd and status checkpoint.
|
||||||
func (c *criService) loadContainer(ctx context.Context, cntr containerd.Container) (containerstore.Container, error) {
|
func (c *criService) loadContainer(ctx context.Context, cntr containerd.Container) (containerstore.Container, error) {
|
||||||
|
ctx, cancel := context.WithTimeout(ctx, loadContainerTimeout)
|
||||||
|
defer cancel()
|
||||||
id := cntr.ID()
|
id := cntr.ID()
|
||||||
containerDir := c.getContainerRootDir(id)
|
containerDir := c.getContainerRootDir(id)
|
||||||
volatileContainerDir := c.getVolatileContainerRootDir(id)
|
volatileContainerDir := c.getVolatileContainerRootDir(id)
|
||||||
@ -290,9 +305,9 @@ const (
|
|||||||
// unknownContainerStatus returns the default container status when its status is unknown.
|
// unknownContainerStatus returns the default container status when its status is unknown.
|
||||||
func unknownContainerStatus() containerstore.Status {
|
func unknownContainerStatus() containerstore.Status {
|
||||||
return containerstore.Status{
|
return containerstore.Status{
|
||||||
CreatedAt: time.Now().UnixNano(),
|
CreatedAt: 0,
|
||||||
StartedAt: time.Now().UnixNano(),
|
StartedAt: 0,
|
||||||
FinishedAt: time.Now().UnixNano(),
|
FinishedAt: 0,
|
||||||
ExitCode: unknownExitCode,
|
ExitCode: unknownExitCode,
|
||||||
Reason: unknownExitReason,
|
Reason: unknownExitReason,
|
||||||
}
|
}
|
||||||
@ -300,6 +315,8 @@ func unknownContainerStatus() containerstore.Status {
|
|||||||
|
|
||||||
// loadSandbox loads sandbox from containerd.
|
// loadSandbox loads sandbox from containerd.
|
||||||
func loadSandbox(ctx context.Context, cntr containerd.Container) (sandboxstore.Sandbox, error) {
|
func loadSandbox(ctx context.Context, cntr containerd.Container) (sandboxstore.Sandbox, error) {
|
||||||
|
ctx, cancel := context.WithTimeout(ctx, loadContainerTimeout)
|
||||||
|
defer cancel()
|
||||||
var sandbox sandboxstore.Sandbox
|
var sandbox sandboxstore.Sandbox
|
||||||
// Load sandbox metadata.
|
// Load sandbox metadata.
|
||||||
exts, err := cntr.Extensions(ctx)
|
exts, err := cntr.Extensions(ctx)
|
||||||
|
@ -27,7 +27,7 @@ func NewFakeStore(images []Image) (*Store, error) {
|
|||||||
s.refCache[ref] = i.ID
|
s.refCache[ref] = i.ID
|
||||||
}
|
}
|
||||||
if err := s.store.add(i); err != nil {
|
if err := s.store.add(i); err != nil {
|
||||||
return nil, errors.Wrapf(err, "add image %q", i)
|
return nil, errors.Wrapf(err, "add image %+v", i)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return s, nil
|
return s, nil
|
||||||
|
Loading…
Reference in New Issue
Block a user