From d513dd2bfd5e930d1fc8e7bbf332762c706c820d Mon Sep 17 00:00:00 2001 From: Michael Crosby Date: Wed, 9 Aug 2017 15:44:42 -0400 Subject: [PATCH] Fix race with task checkpoint Because runc will delete a container after a successful checkpoint we need to handle a NotFound error from runc on delete. There is also a race between SIGKILL'ing the shim and it actually exiting to unmount the tasks rootfs, we need to loop and wait for the task to actually be reaped before trying to delete the rootfs+bundle. Signed-off-by: Michael Crosby --- client_test.go | 4 +++- linux/runtime.go | 9 +++++++-- linux/shim/client.go | 13 ++++++++++++- linux/shim/init.go | 28 +++++++++++++++------------- linux/shim/service.go | 10 ++++++---- 5 files changed, 43 insertions(+), 21 deletions(-) diff --git a/client_test.go b/client_test.go index 7c277e2d9..22495735b 100644 --- a/client_test.go +++ b/client_test.go @@ -132,7 +132,9 @@ func TestMain(m *testing.M) { } } if err := cmd.Wait(); err != nil { - fmt.Fprintln(os.Stderr, "failed to wait for containerd", err) + if _, ok := err.(*exec.ExitError); !ok { + fmt.Fprintln(os.Stderr, "failed to wait for containerd", err) + } } if err := os.RemoveAll(defaultRoot); err != nil { fmt.Fprintln(os.Stderr, "failed to remove test root dir", err) diff --git a/linux/runtime.go b/linux/runtime.go index 43c0b20ed..0430c6943 100644 --- a/linux/runtime.go +++ b/linux/runtime.go @@ -211,9 +211,14 @@ func (r *Runtime) Delete(ctx context.Context, c runtime.Task) (*runtime.Exit, er } r.tasks.Delete(ctx, lc) - bundle := loadBundle(filepath.Join(r.state, namespace, lc.id), filepath.Join(r.root, namespace, lc.id), namespace, r.events) + bundle := loadBundle( + filepath.Join(r.state, namespace, lc.id), + filepath.Join(r.root, namespace, lc.id), + namespace, + r.events, + ) if err := bundle.Delete(); err != nil { - return nil, err + log.G(ctx).WithError(err).Error("failed to delete bundle") } return &runtime.Exit{ Status: rsp.ExitStatus, diff --git a/linux/shim/client.go b/linux/shim/client.go index ea634a8f8..2b5016fc5 100644 --- a/linux/shim/client.go +++ b/linux/shim/client.go @@ -209,7 +209,18 @@ func (c *Client) KillShim(ctx context.Context) error { if os.Getpid() == pid { return nil } - return unix.Kill(pid, unix.SIGKILL) + if err := unix.Kill(pid, unix.SIGKILL); err != nil { + return err + } + // wait for shim to die after being SIGKILL'd + for { + // use kill(pid, 0) here because the shim could have been reparented + // and we are no longer able to waitpid(pid, ...) on the shim + if err := unix.Kill(pid, 0); err != nil && err == unix.ESRCH { + return nil + } + time.Sleep(10 * time.Millisecond) + } } func (c *Client) Close() error { diff --git a/linux/shim/init.go b/linux/shim/init.go index 4f4f42933..6777940c8 100644 --- a/linux/shim/init.go +++ b/linux/shim/init.go @@ -237,31 +237,33 @@ func (p *initProcess) SetExited(status int) { } func (p *initProcess) Delete(context context.Context) error { - status, err := p.Status(context) - if err != nil { - return err - } - if status != "stopped" { - return fmt.Errorf("cannot delete a running container") - } p.killAll(context) p.Wait() - err = p.runtime.Delete(context, p.id, nil) + err := p.runtime.Delete(context, p.id, nil) + // ignore errors if a runtime has already deleted the process + // but we still hold metadata and pipes + // + // this is common during a checkpoint, runc will delete the container state + // after a checkpoint and the container will no longer exist within runc + if err != nil { + if strings.Contains(err.Error(), "does not exist") { + err = nil + } else { + err = p.runtimeError(err, "failed to delete task") + } + } if p.io != nil { for _, c := range p.closers { c.Close() } p.io.Close() } - err = p.runtimeError(err, "OCI runtime delete failed") - if err2 := mount.UnmountAll(p.rootfs, 0); err2 != nil { - log.G(context).WithError(err2).Warn("Failed to cleanup rootfs mount") + log.G(context).WithError(err2).Warn("failed to cleanup rootfs mount") if err == nil { - err = errors.Wrap(err2, "Failed rootfs umount") + err = errors.Wrap(err2, "failed rootfs umount") } } - return err } diff --git a/linux/shim/service.go b/linux/shim/service.go index be54104dc..7406035a1 100644 --- a/linux/shim/service.go +++ b/linux/shim/service.go @@ -147,8 +147,9 @@ func (s *Service) Delete(ctx context.Context, r *google_protobuf.Empty) (*shimap return nil, errdefs.ToGRPCf(errdefs.ErrFailedPrecondition, "container must be created") } p := s.initProcess - // TODO (@crosbymichael): how to handle errors here - p.Delete(ctx) + if err := p.Delete(ctx); err != nil { + return nil, err + } s.mu.Lock() delete(s.processes, p.ID()) s.mu.Unlock() @@ -178,8 +179,9 @@ func (s *Service) DeleteProcess(ctx context.Context, r *shimapi.DeleteProcessReq if !ok { return nil, errors.Wrapf(errdefs.ErrNotFound, "process %s not found", r.ID) } - // TODO (@crosbymichael): how to handle errors here - p.Delete(ctx) + if err := p.Delete(ctx); err != nil { + return nil, err + } s.mu.Lock() delete(s.processes, p.ID()) s.mu.Unlock()