Fix race with task checkpoint

Because runc will delete a container after a successful checkpoint we
need to handle a NotFound error from runc on delete.

There is also a race between SIGKILL'ing the shim and it actually
exiting to unmount the tasks rootfs, we need to loop and wait for the
task to actually be reaped before trying to delete the rootfs+bundle.

Signed-off-by: Michael Crosby <crosbymichael@gmail.com>
This commit is contained in:
Michael Crosby 2017-08-09 15:44:42 -04:00
parent 258c719cbc
commit d513dd2bfd
5 changed files with 43 additions and 21 deletions

View File

@ -132,7 +132,9 @@ func TestMain(m *testing.M) {
}
}
if err := cmd.Wait(); err != nil {
fmt.Fprintln(os.Stderr, "failed to wait for containerd", err)
if _, ok := err.(*exec.ExitError); !ok {
fmt.Fprintln(os.Stderr, "failed to wait for containerd", err)
}
}
if err := os.RemoveAll(defaultRoot); err != nil {
fmt.Fprintln(os.Stderr, "failed to remove test root dir", err)

View File

@ -211,9 +211,14 @@ func (r *Runtime) Delete(ctx context.Context, c runtime.Task) (*runtime.Exit, er
}
r.tasks.Delete(ctx, lc)
bundle := loadBundle(filepath.Join(r.state, namespace, lc.id), filepath.Join(r.root, namespace, lc.id), namespace, r.events)
bundle := loadBundle(
filepath.Join(r.state, namespace, lc.id),
filepath.Join(r.root, namespace, lc.id),
namespace,
r.events,
)
if err := bundle.Delete(); err != nil {
return nil, err
log.G(ctx).WithError(err).Error("failed to delete bundle")
}
return &runtime.Exit{
Status: rsp.ExitStatus,

View File

@ -209,7 +209,18 @@ func (c *Client) KillShim(ctx context.Context) error {
if os.Getpid() == pid {
return nil
}
return unix.Kill(pid, unix.SIGKILL)
if err := unix.Kill(pid, unix.SIGKILL); err != nil {
return err
}
// wait for shim to die after being SIGKILL'd
for {
// use kill(pid, 0) here because the shim could have been reparented
// and we are no longer able to waitpid(pid, ...) on the shim
if err := unix.Kill(pid, 0); err != nil && err == unix.ESRCH {
return nil
}
time.Sleep(10 * time.Millisecond)
}
}
func (c *Client) Close() error {

View File

@ -237,31 +237,33 @@ func (p *initProcess) SetExited(status int) {
}
func (p *initProcess) Delete(context context.Context) error {
status, err := p.Status(context)
if err != nil {
return err
}
if status != "stopped" {
return fmt.Errorf("cannot delete a running container")
}
p.killAll(context)
p.Wait()
err = p.runtime.Delete(context, p.id, nil)
err := p.runtime.Delete(context, p.id, nil)
// ignore errors if a runtime has already deleted the process
// but we still hold metadata and pipes
//
// this is common during a checkpoint, runc will delete the container state
// after a checkpoint and the container will no longer exist within runc
if err != nil {
if strings.Contains(err.Error(), "does not exist") {
err = nil
} else {
err = p.runtimeError(err, "failed to delete task")
}
}
if p.io != nil {
for _, c := range p.closers {
c.Close()
}
p.io.Close()
}
err = p.runtimeError(err, "OCI runtime delete failed")
if err2 := mount.UnmountAll(p.rootfs, 0); err2 != nil {
log.G(context).WithError(err2).Warn("Failed to cleanup rootfs mount")
log.G(context).WithError(err2).Warn("failed to cleanup rootfs mount")
if err == nil {
err = errors.Wrap(err2, "Failed rootfs umount")
err = errors.Wrap(err2, "failed rootfs umount")
}
}
return err
}

View File

@ -147,8 +147,9 @@ func (s *Service) Delete(ctx context.Context, r *google_protobuf.Empty) (*shimap
return nil, errdefs.ToGRPCf(errdefs.ErrFailedPrecondition, "container must be created")
}
p := s.initProcess
// TODO (@crosbymichael): how to handle errors here
p.Delete(ctx)
if err := p.Delete(ctx); err != nil {
return nil, err
}
s.mu.Lock()
delete(s.processes, p.ID())
s.mu.Unlock()
@ -178,8 +179,9 @@ func (s *Service) DeleteProcess(ctx context.Context, r *shimapi.DeleteProcessReq
if !ok {
return nil, errors.Wrapf(errdefs.ErrNotFound, "process %s not found", r.ID)
}
// TODO (@crosbymichael): how to handle errors here
p.Delete(ctx)
if err := p.Delete(ctx); err != nil {
return nil, err
}
s.mu.Lock()
delete(s.processes, p.ID())
s.mu.Unlock()