From 3e95727f390b530b91908aedf20a2769435d93bf Mon Sep 17 00:00:00 2001
From: Ashray Jain <ashrayj@palantir.com>
Date: Wed, 22 Apr 2020 13:47:47 +0100
Subject: [PATCH] Make killing shims more resilient

Currently, we send a single SIGKILL to the shim process and then spin
in a loop, using kill(pid, 0) to detect when the pid has disappeared
completely.

Unfortunately, this is racy: pids can be reused, and when that happens
kill(pid, 0) does not return ESRCH while the reused pid is alive, so we
can spin in the loop indefinitely.

This makes the loop honor the caller's context: once the context is
done (e.g. its timeout expires), we log a warning and exit the loop
instead of spinning forever.

Signed-off-by: Ashray Jain <ashrayj@palantir.com>
---
 runtime/v1/shim/client/client.go | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/runtime/v1/shim/client/client.go b/runtime/v1/shim/client/client.go
index 116e84083..562ee6ca4 100644
--- a/runtime/v1/shim/client/client.go
+++ b/runtime/v1/shim/client/client.go
@@ -324,21 +324,31 @@ func (c *Client) signalShim(ctx context.Context, sig syscall.Signal) error {
 	select {
 	case <-ctx.Done():
 		return ctx.Err()
-	case <-c.waitForExit(pid):
+	case <-c.waitForExit(ctx, pid):
 		return nil
 	}
 }
 
-func (c *Client) waitForExit(pid int) <-chan struct{} {
-	c.exitOnce.Do(func() {
+func (c *Client) waitForExit(ctx context.Context, pid int) <-chan struct{} {
+	go c.exitOnce.Do(func() {
+		defer close(c.exitCh)
+
+		ticker := time.NewTicker(10 * time.Millisecond)
+		defer ticker.Stop()
+
 		for {
 			// use kill(pid, 0) here because the shim could have been reparented
 			// and we are no longer able to waitpid(pid, ...) on the shim
 			if err := unix.Kill(pid, 0); err == unix.ESRCH {
-				close(c.exitCh)
 				return
 			}
-			time.Sleep(10 * time.Millisecond)
+
+			select {
+			case <-ticker.C:
+			case <-ctx.Done():
+				log.G(ctx).WithField("pid", pid).Warn("timed out while waiting for shim to exit")
+				return
+			}
 		}
 	})
 	return c.exitCh