Make killing shims more resilient
Currently, we send a single SIGKILL to the shim process and then spin in a loop, using kill(pid, 0) to detect when the pid has disappeared completely. Unfortunately, this has a race condition: pids can be reused, and when that happens we spin in an infinite loop. This change adds a timeout to the loop; when the timeout expires, we log a warning and exit the loop instead of spinning forever.

Signed-off-by: Ashray Jain <ashrayj@palantir.com>
This commit is contained in:
		| @@ -324,21 +324,31 @@ func (c *Client) signalShim(ctx context.Context, sig syscall.Signal) error { | ||||
| 	select { | ||||
| 	case <-ctx.Done(): | ||||
| 		return ctx.Err() | ||||
| 	case <-c.waitForExit(pid): | ||||
| 	case <-c.waitForExit(ctx, pid): | ||||
| 		return nil | ||||
| 	} | ||||
| } | ||||
|  | ||||
| func (c *Client) waitForExit(pid int) <-chan struct{} { | ||||
| 	c.exitOnce.Do(func() { | ||||
| func (c *Client) waitForExit(ctx context.Context, pid int) <-chan struct{} { | ||||
| 	go c.exitOnce.Do(func() { | ||||
| 		defer close(c.exitCh) | ||||
|  | ||||
| 		ticker := time.NewTicker(10 * time.Millisecond) | ||||
| 		defer ticker.Stop() | ||||
|  | ||||
| 		for { | ||||
| 			// use kill(pid, 0) here because the shim could have been reparented | ||||
| 			// and we are no longer able to waitpid(pid, ...) on the shim | ||||
| 			if err := unix.Kill(pid, 0); err == unix.ESRCH { | ||||
| 				close(c.exitCh) | ||||
| 				return | ||||
| 			} | ||||
| 			time.Sleep(10 * time.Millisecond) | ||||
|  | ||||
| 			select { | ||||
| 			case <-ticker.C: | ||||
| 			case <-ctx.Done(): | ||||
| 				log.G(ctx).WithField("pid", pid).Warn("timed out while waiting for shim to exit") | ||||
| 				return | ||||
| 			} | ||||
| 		} | ||||
| 	}) | ||||
| 	return c.exitCh | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 Ashray Jain
					Ashray Jain