Make killing shims more resilient
Currently, we send a single SIGKILL to the shim process once and then we spin in a loop where we use kill(pid, 0) to detect when the pid has disappeared completely. Unfortunately, this has a race condition since pids can be reused causing us to spin in an infinite loop when that happens. This adds a timeout to this loop which logs a warning and exits the infinite loop. Signed-off-by: Ashray Jain <ashrayj@palantir.com>
This commit is contained in:
parent
e094d363ac
commit
3e95727f39
@ -324,21 +324,31 @@ func (c *Client) signalShim(ctx context.Context, sig syscall.Signal) error {
|
|||||||
select {
|
select {
|
||||||
case <-ctx.Done():
|
case <-ctx.Done():
|
||||||
return ctx.Err()
|
return ctx.Err()
|
||||||
case <-c.waitForExit(pid):
|
case <-c.waitForExit(ctx, pid):
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *Client) waitForExit(pid int) <-chan struct{} {
|
func (c *Client) waitForExit(ctx context.Context, pid int) <-chan struct{} {
|
||||||
c.exitOnce.Do(func() {
|
go c.exitOnce.Do(func() {
|
||||||
|
defer close(c.exitCh)
|
||||||
|
|
||||||
|
ticker := time.NewTicker(10 * time.Millisecond)
|
||||||
|
defer ticker.Stop()
|
||||||
|
|
||||||
for {
|
for {
|
||||||
// use kill(pid, 0) here because the shim could have been reparented
|
// use kill(pid, 0) here because the shim could have been reparented
|
||||||
// and we are no longer able to waitpid(pid, ...) on the shim
|
// and we are no longer able to waitpid(pid, ...) on the shim
|
||||||
if err := unix.Kill(pid, 0); err == unix.ESRCH {
|
if err := unix.Kill(pid, 0); err == unix.ESRCH {
|
||||||
close(c.exitCh)
|
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
time.Sleep(10 * time.Millisecond)
|
|
||||||
|
select {
|
||||||
|
case <-ticker.C:
|
||||||
|
case <-ctx.Done():
|
||||||
|
log.G(ctx).WithField("pid", pid).Warn("timed out while waiting for shim to exit")
|
||||||
|
return
|
||||||
|
}
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
return c.exitCh
|
return c.exitCh
|
||||||
|
Loading…
Reference in New Issue
Block a user