From 3e95727f390b530b91908aedf20a2769435d93bf Mon Sep 17 00:00:00 2001 From: Ashray Jain Date: Wed, 22 Apr 2020 13:47:47 +0100 Subject: [PATCH] Make killing shims more resilient Currently, we send a single SIGKILL to the shim process once and then we spin in a loop where we use kill(pid, 0) to detect when the pid has disappeared completely. Unfortunately, this has a race condition since pids can be reused causing us to spin in an infinite loop when that happens. This adds a timeout to this loop which logs a warning and exits the infinite loop. Signed-off-by: Ashray Jain --- runtime/v1/shim/client/client.go | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/runtime/v1/shim/client/client.go b/runtime/v1/shim/client/client.go index 116e84083..562ee6ca4 100644 --- a/runtime/v1/shim/client/client.go +++ b/runtime/v1/shim/client/client.go @@ -324,21 +324,31 @@ func (c *Client) signalShim(ctx context.Context, sig syscall.Signal) error { select { case <-ctx.Done(): return ctx.Err() - case <-c.waitForExit(pid): + case <-c.waitForExit(ctx, pid): return nil } } -func (c *Client) waitForExit(pid int) <-chan struct{} { - c.exitOnce.Do(func() { +func (c *Client) waitForExit(ctx context.Context, pid int) <-chan struct{} { + go c.exitOnce.Do(func() { + defer close(c.exitCh) + + ticker := time.NewTicker(10 * time.Millisecond) + defer ticker.Stop() + for { // use kill(pid, 0) here because the shim could have been reparented // and we are no longer able to waitpid(pid, ...) on the shim if err := unix.Kill(pid, 0); err == unix.ESRCH { - close(c.exitCh) return } - time.Sleep(10 * time.Millisecond) + + select { + case <-ticker.C: + case <-ctx.Done(): + log.G(ctx).WithField("pid", pid).Warn("timed out while waiting for shim to exit") + return + } } }) return c.exitCh