From d7c3ecd0fb56b06982b59c3a4021775a30a39da9 Mon Sep 17 00:00:00 2001 From: "Justin Terry (VM)" Date: Tue, 21 May 2019 13:49:34 -0700 Subject: [PATCH 1/3] RunPodSandbox should block unless client context is canceled A call to RunPodSandbox should only return timeout if the operation has timed out because the clients context deadline was exceeded. On client cancelation it should return gRPC Canceled otherwise it should block until the sandbox has exited. Signed-off-by: Justin Terry (VM) --- pkg/server/sandbox_stop.go | 13 +++++-------- pkg/server/sandbox_stop_test.go | 7 ++++++- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/pkg/server/sandbox_stop.go b/pkg/server/sandbox_stop.go index 2ca8c9661..a3d5a2f11 100644 --- a/pkg/server/sandbox_stop.go +++ b/pkg/server/sandbox_stop.go @@ -142,18 +142,15 @@ func (c *criService) stopSandboxContainer(ctx context.Context, sandbox sandboxst return errors.Wrap(err, "failed to kill sandbox container") } - return c.waitSandboxStop(ctx, sandbox, killContainerTimeout) + return c.waitSandboxStop(ctx, sandbox) } -// waitSandboxStop waits for sandbox to be stopped until timeout exceeds or context is cancelled. -func (c *criService) waitSandboxStop(ctx context.Context, sandbox sandboxstore.Sandbox, timeout time.Duration) error { - timeoutTimer := time.NewTimer(timeout) - defer timeoutTimer.Stop() +// waitSandboxStop waits for sandbox to be stopped until context is cancelled or +// the context deadline is exceeded. +func (c *criService) waitSandboxStop(ctx context.Context, sandbox sandboxstore.Sandbox) error { select { case <-ctx.Done(): - return errors.Wrapf(ctx.Err(), "wait sandbox container %q is cancelled", sandbox.ID) - case <-timeoutTimer.C: - return errors.Errorf("wait sandbox container %q stop timeout", sandbox.ID) + return errors.Wrapf(ctx.Err(), "wait sandbox container %q", sandbox.ID) case <-sandbox.Stopped(): return nil } diff --git a/pkg/server/sandbox_stop_test.go b/pkg/server/sandbox_stop_test.go index 93a4f29a8..a1ba500bf 100644 --- a/pkg/server/sandbox_stop_test.go +++ b/pkg/server/sandbox_stop_test.go @@ -62,7 +62,12 @@ func TestWaitSandboxStop(t *testing.T) { cancel() ctx = cancelledCtx } - err := c.waitSandboxStop(ctx, sandbox, test.timeout) + if test.timeout > 0 { + timeoutCtx, cancel := context.WithTimeout(ctx, test.timeout) + defer cancel() + ctx = timeoutCtx + } + err := c.waitSandboxStop(ctx, sandbox) assert.Equal(t, test.expectErr, err != nil, desc) } } From 71cecedc4404b0851675de05db321f0a2d37d2d0 Mon Sep 17 00:00:00 2001 From: "Justin Terry (VM)" Date: Tue, 21 May 2019 13:53:46 -0700 Subject: [PATCH 2/3] StopContainer should block unless client context is canceled A call to StopContainer should only return if the client context is canceled or its deadline was exceeded. The Timeout parameter on StopContainerRequest is now used as the time AFTER sending the stop signal before the SIGKILL is delivered. The call will remain until the container has exited or the client context has finished. Signed-off-by: Justin Terry (VM) --- pkg/server/container_stop.go | 43 ++++++++++++++++--------------- pkg/server/container_stop_test.go | 7 ++++- 2 files changed, 28 insertions(+), 22 deletions(-) diff --git a/pkg/server/container_stop.go b/pkg/server/container_stop.go index 0165bcf46..388ea65c7 100644 --- a/pkg/server/container_stop.go +++ b/pkg/server/container_stop.go @@ -33,13 +33,6 @@ import ( containerstore "github.com/containerd/cri/pkg/store/container" ) -// killContainerTimeout is the timeout that we wait for the container to -// be SIGKILLed. -// The timeout is set to 1 min, because the default CRI operation timeout -// for StopContainer is (2 min + stop timeout). Set to 1 min, so that we -// have enough time for kill(all=true) and kill(all=false). -const killContainerTimeout = 1 * time.Minute - // StopContainer stops a running container with a grace period (i.e., timeout). func (c *criService) StopContainer(ctx context.Context, r *runtime.StopContainerRequest) (*runtime.StopContainerResponse, error) { // Get container config from container store. @@ -141,11 +134,21 @@ func (c *criService) stopContainer(ctx context.Context, container containerstore return errors.Wrapf(err, "failed to stop container %q", id) } - if err = c.waitContainerStop(ctx, container, timeout); err == nil || errors.Cause(err) == ctx.Err() { - // Do not SIGKILL container if the context is cancelled. - return err + sigTermCtx, sigTermCtxCancel := context.WithTimeout(ctx, timeout) + err = c.waitContainerStop(sigTermCtx, container) + if err == nil { + // Container stopped on first signal no need for SIGKILL + sigTermCtxCancel() + return nil } - logrus.WithError(err).Errorf("An error occurs during waiting for container %q to be stopped", id) + // If the parent context was cancelled or exceeded return immediately + if ctx.Err() != nil { + sigTermCtxCancel() + return ctx.Err() + } + sigTermCtxCancel() + // sigTermCtx was exceeded. Send SIGKILL + logrus.Debugf("Stop container %q with signal %v timed out", id, sig) } logrus.Infof("Kill container %q", id) @@ -154,21 +157,19 @@ func (c *criService) stopContainer(ctx context.Context, container containerstore } // Wait for a fixed timeout until container stop is observed by event monitor. - if err = c.waitContainerStop(ctx, container, killContainerTimeout); err == nil { - return nil + err = c.waitContainerStop(ctx, container) + if err != nil { + return errors.Wrapf(err, "an error occurs during waiting for container %q to be killed", id) } - return errors.Wrapf(err, "an error occurs during waiting for container %q to be killed", id) + return nil } -// waitContainerStop waits for container to be stopped until timeout exceeds or context is cancelled. -func (c *criService) waitContainerStop(ctx context.Context, container containerstore.Container, timeout time.Duration) error { - timeoutTimer := time.NewTimer(timeout) - defer timeoutTimer.Stop() +// waitContainerStop waits for container to be stopped until context is +// cancelled or the context deadline is exceeded. +func (c *criService) waitContainerStop(ctx context.Context, container containerstore.Container) error { select { case <-ctx.Done(): - return errors.Wrapf(ctx.Err(), "wait container %q is cancelled", container.ID) - case <-timeoutTimer.C: - return errors.Errorf("wait container %q stop timeout", container.ID) + return errors.Wrapf(ctx.Err(), "wait container %q", container.ID) case <-container.Stopped(): return nil } diff --git a/pkg/server/container_stop_test.go b/pkg/server/container_stop_test.go index 9da58a0c4..7274a05f5 100644 --- a/pkg/server/container_stop_test.go +++ b/pkg/server/container_stop_test.go @@ -74,7 +74,12 @@ func TestWaitContainerStop(t *testing.T) { cancel() ctx = cancelledCtx } - err = c.waitContainerStop(ctx, container, test.timeout) + if test.timeout > 0 { + timeoutCtx, cancel := context.WithTimeout(ctx, test.timeout) + defer cancel() + ctx = timeoutCtx + } + err = c.waitContainerStop(ctx, container) assert.Equal(t, test.expectErr, err != nil, desc) } } From 7b0c78bacd477b2c2f7f1dd92e65e6d91d9a51e4 Mon Sep 17 00:00:00 2001 From: "Justin Terry (VM)" Date: Tue, 21 May 2019 13:57:42 -0700 Subject: [PATCH 3/3] ExecSync should block unless client context is canceled A call to ExecSync should only return if the client context was canceled or exceeded. The Timeout parameter to ExecSyncRequest is now used to send SIGKILL if the exec'd process does not exit within Timeout but all paths wait for the exec to exit. Signed-off-by: Justin Terry (VM) --- pkg/server/container_execsync.go | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/pkg/server/container_execsync.go b/pkg/server/container_execsync.go index 6116ee523..2358b9941 100644 --- a/pkg/server/container_execsync.go +++ b/pkg/server/container_execsync.go @@ -164,16 +164,15 @@ func (c *criService) execInContainer(ctx context.Context, id string, opts execOp }, }) - var timeoutCh <-chan time.Time - if opts.timeout == 0 { - // Do not set timeout if it's 0. - timeoutCh = make(chan time.Time) - } else { - timeoutCh = time.After(opts.timeout) + execCtx := ctx + if opts.timeout > 0 { + var execCtxCancel context.CancelFunc + execCtx, execCtxCancel = context.WithTimeout(ctx, opts.timeout) + defer execCtxCancel() } + select { - case <-timeoutCh: - //TODO(Abhi) Use context.WithDeadline instead of timeout. + case <-execCtx.Done(): // Ignore the not found error because the process may exit itself before killing. if err := process.Kill(ctx, syscall.SIGKILL); err != nil && !errdefs.IsNotFound(err) { return nil, errors.Wrapf(err, "failed to kill exec %q", execID) @@ -184,7 +183,7 @@ func (c *criService) execInContainer(ctx context.Context, id string, opts execOp execID, exitRes.ExitCode(), exitRes.Error()) <-attachDone logrus.Debugf("Stream pipe for exec process %q done", execID) - return nil, errors.Errorf("timeout %v exceeded", opts.timeout) + return nil, errors.Wrapf(execCtx.Err(), "timeout %v exceeded", opts.timeout) case exitRes := <-exitCh: code, _, err := exitRes.Result() logrus.Infof("Exec process %q exits with exit code %d and error %v", execID, code, err)