Fix race with task checkpoint
Because runc will delete a container after a successful checkpoint we need to handle a NotFound error from runc on delete. There is also a race between SIGKILL'ing the shim and it actually exiting to unmount the tasks rootfs, we need to loop and wait for the task to actually be reaped before trying to delete the rootfs+bundle. Signed-off-by: Michael Crosby <crosbymichael@gmail.com>
This commit is contained in:
parent
258c719cbc
commit
d513dd2bfd
@ -132,7 +132,9 @@ func TestMain(m *testing.M) {
|
||||
}
|
||||
}
|
||||
if err := cmd.Wait(); err != nil {
|
||||
fmt.Fprintln(os.Stderr, "failed to wait for containerd", err)
|
||||
if _, ok := err.(*exec.ExitError); !ok {
|
||||
fmt.Fprintln(os.Stderr, "failed to wait for containerd", err)
|
||||
}
|
||||
}
|
||||
if err := os.RemoveAll(defaultRoot); err != nil {
|
||||
fmt.Fprintln(os.Stderr, "failed to remove test root dir", err)
|
||||
|
@ -211,9 +211,14 @@ func (r *Runtime) Delete(ctx context.Context, c runtime.Task) (*runtime.Exit, er
|
||||
}
|
||||
r.tasks.Delete(ctx, lc)
|
||||
|
||||
bundle := loadBundle(filepath.Join(r.state, namespace, lc.id), filepath.Join(r.root, namespace, lc.id), namespace, r.events)
|
||||
bundle := loadBundle(
|
||||
filepath.Join(r.state, namespace, lc.id),
|
||||
filepath.Join(r.root, namespace, lc.id),
|
||||
namespace,
|
||||
r.events,
|
||||
)
|
||||
if err := bundle.Delete(); err != nil {
|
||||
return nil, err
|
||||
log.G(ctx).WithError(err).Error("failed to delete bundle")
|
||||
}
|
||||
return &runtime.Exit{
|
||||
Status: rsp.ExitStatus,
|
||||
|
@ -209,7 +209,18 @@ func (c *Client) KillShim(ctx context.Context) error {
|
||||
if os.Getpid() == pid {
|
||||
return nil
|
||||
}
|
||||
return unix.Kill(pid, unix.SIGKILL)
|
||||
if err := unix.Kill(pid, unix.SIGKILL); err != nil {
|
||||
return err
|
||||
}
|
||||
// wait for shim to die after being SIGKILL'd
|
||||
for {
|
||||
// use kill(pid, 0) here because the shim could have been reparented
|
||||
// and we are no longer able to waitpid(pid, ...) on the shim
|
||||
if err := unix.Kill(pid, 0); err != nil && err == unix.ESRCH {
|
||||
return nil
|
||||
}
|
||||
time.Sleep(10 * time.Millisecond)
|
||||
}
|
||||
}
|
||||
|
||||
func (c *Client) Close() error {
|
||||
|
@ -237,31 +237,33 @@ func (p *initProcess) SetExited(status int) {
|
||||
}
|
||||
|
||||
func (p *initProcess) Delete(context context.Context) error {
|
||||
status, err := p.Status(context)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if status != "stopped" {
|
||||
return fmt.Errorf("cannot delete a running container")
|
||||
}
|
||||
p.killAll(context)
|
||||
p.Wait()
|
||||
err = p.runtime.Delete(context, p.id, nil)
|
||||
err := p.runtime.Delete(context, p.id, nil)
|
||||
// ignore errors if a runtime has already deleted the process
|
||||
// but we still hold metadata and pipes
|
||||
//
|
||||
// this is common during a checkpoint, runc will delete the container state
|
||||
// after a checkpoint and the container will no longer exist within runc
|
||||
if err != nil {
|
||||
if strings.Contains(err.Error(), "does not exist") {
|
||||
err = nil
|
||||
} else {
|
||||
err = p.runtimeError(err, "failed to delete task")
|
||||
}
|
||||
}
|
||||
if p.io != nil {
|
||||
for _, c := range p.closers {
|
||||
c.Close()
|
||||
}
|
||||
p.io.Close()
|
||||
}
|
||||
err = p.runtimeError(err, "OCI runtime delete failed")
|
||||
|
||||
if err2 := mount.UnmountAll(p.rootfs, 0); err2 != nil {
|
||||
log.G(context).WithError(err2).Warn("Failed to cleanup rootfs mount")
|
||||
log.G(context).WithError(err2).Warn("failed to cleanup rootfs mount")
|
||||
if err == nil {
|
||||
err = errors.Wrap(err2, "Failed rootfs umount")
|
||||
err = errors.Wrap(err2, "failed rootfs umount")
|
||||
}
|
||||
}
|
||||
|
||||
return err
|
||||
}
|
||||
|
||||
|
@ -147,8 +147,9 @@ func (s *Service) Delete(ctx context.Context, r *google_protobuf.Empty) (*shimap
|
||||
return nil, errdefs.ToGRPCf(errdefs.ErrFailedPrecondition, "container must be created")
|
||||
}
|
||||
p := s.initProcess
|
||||
// TODO (@crosbymichael): how to handle errors here
|
||||
p.Delete(ctx)
|
||||
if err := p.Delete(ctx); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
s.mu.Lock()
|
||||
delete(s.processes, p.ID())
|
||||
s.mu.Unlock()
|
||||
@ -178,8 +179,9 @@ func (s *Service) DeleteProcess(ctx context.Context, r *shimapi.DeleteProcessReq
|
||||
if !ok {
|
||||
return nil, errors.Wrapf(errdefs.ErrNotFound, "process %s not found", r.ID)
|
||||
}
|
||||
// TODO (@crosbymichael): how to handle errors here
|
||||
p.Delete(ctx)
|
||||
if err := p.Delete(ctx); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
s.mu.Lock()
|
||||
delete(s.processes, p.ID())
|
||||
s.mu.Unlock()
|
||||
|
Loading…
Reference in New Issue
Block a user