Make killing shims more resilient

Currently, we send a single SIGKILL to the shim process
once and then we spin in a loop where we use kill(pid, 0)
to detect when the pid has disappeared completely.

Unfortunately, this has a race condition since pids can be reused causing us
to spin in an infinite loop when that happens.

This adds a timeout to this loop which logs a warning and exits the
infinite loop.

Signed-off-by: Ashray Jain <ashrayj@palantir.com>
This commit is contained in:
Ashray Jain 2020-04-22 13:47:47 +01:00
parent e094d363ac
commit 3e95727f39

View File

@ -324,21 +324,31 @@ func (c *Client) signalShim(ctx context.Context, sig syscall.Signal) error {
select { select {
case <-ctx.Done(): case <-ctx.Done():
return ctx.Err() return ctx.Err()
case <-c.waitForExit(pid): case <-c.waitForExit(ctx, pid):
return nil return nil
} }
} }
func (c *Client) waitForExit(pid int) <-chan struct{} { func (c *Client) waitForExit(ctx context.Context, pid int) <-chan struct{} {
c.exitOnce.Do(func() { go c.exitOnce.Do(func() {
defer close(c.exitCh)
ticker := time.NewTicker(10 * time.Millisecond)
defer ticker.Stop()
for { for {
// use kill(pid, 0) here because the shim could have been reparented // use kill(pid, 0) here because the shim could have been reparented
// and we are no longer able to waitpid(pid, ...) on the shim // and we are no longer able to waitpid(pid, ...) on the shim
if err := unix.Kill(pid, 0); err == unix.ESRCH { if err := unix.Kill(pid, 0); err == unix.ESRCH {
close(c.exitCh)
return return
} }
time.Sleep(10 * time.Millisecond)
select {
case <-ticker.C:
case <-ctx.Done():
log.G(ctx).WithField("pid", pid).Warn("timed out while waiting for shim to exit")
return
}
} }
}) })
return c.exitCh return c.exitCh