// +build linux /* Copyright The containerd Authors. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ package runc import ( "context" "encoding/json" "io/ioutil" "os" "os/exec" "path/filepath" "sync" "syscall" "time" "github.com/containerd/cgroups" "github.com/containerd/console" eventstypes "github.com/containerd/containerd/api/events" "github.com/containerd/containerd/api/types/task" "github.com/containerd/containerd/errdefs" "github.com/containerd/containerd/events" "github.com/containerd/containerd/log" "github.com/containerd/containerd/mount" "github.com/containerd/containerd/namespaces" "github.com/containerd/containerd/runtime" rproc "github.com/containerd/containerd/runtime/proc" "github.com/containerd/containerd/runtime/v1/linux/proc" "github.com/containerd/containerd/runtime/v2/runc/options" "github.com/containerd/containerd/runtime/v2/shim" taskAPI "github.com/containerd/containerd/runtime/v2/task" runcC "github.com/containerd/go-runc" "github.com/containerd/typeurl" ptypes "github.com/gogo/protobuf/types" specs "github.com/opencontainers/runtime-spec/specs-go" "github.com/pkg/errors" "github.com/sirupsen/logrus" "golang.org/x/sys/unix" ) var ( empty = &ptypes.Empty{} bufPool = sync.Pool{ New: func() interface{} { buffer := make([]byte, 32<<10) return &buffer }, } ) var _ = (taskAPI.TaskService)(&service{}) // New returns a new shim service that can be used via GRPC func New(ctx context.Context, id string, publisher events.Publisher) (shim.Shim, error) { ep, err := newOOMEpoller(publisher) if err != nil { return nil, err } ctx, cancel := context.WithCancel(ctx) go ep.run(ctx) s := &service{ id: id, context: ctx, processes: make(map[string]rproc.Process), events: make(chan interface{}, 128), ec: shim.Default.Subscribe(), ep: ep, cancel: cancel, } go s.processExits() runcC.Monitor = shim.Default if err := s.initPlatform(); err != nil { cancel() return nil, errors.Wrap(err, "failed to initialized platform behavior") } go s.forward(publisher) return s, nil } // service is the shim implementation of a remote shim over GRPC type service struct { mu sync.Mutex context context.Context task rproc.Process processes map[string]rproc.Process events chan interface{} platform rproc.Platform ec chan runcC.Exit ep *epoller id string bundle string cg cgroups.Cgroup cancel func() } func newCommand(ctx context.Context, containerdBinary, containerdAddress string) (*exec.Cmd, error) { ns, err := namespaces.NamespaceRequired(ctx) if err != nil { return nil, err } self, err := os.Executable() if err != nil { return nil, err } cwd, err := os.Getwd() if err != nil { return nil, err } args := []string{ "-namespace", ns, "-address", containerdAddress, "-publish-binary", containerdBinary, } cmd := exec.Command(self, args...) cmd.Dir = cwd cmd.Env = append(os.Environ(), "GOMAXPROCS=2") cmd.SysProcAttr = &syscall.SysProcAttr{ Setpgid: true, } return cmd, nil } func (s *service) StartShim(ctx context.Context, id, containerdBinary, containerdAddress string) (string, error) { cmd, err := newCommand(ctx, containerdBinary, containerdAddress) if err != nil { return "", err } address, err := shim.SocketAddress(ctx, id) if err != nil { return "", err } socket, err := shim.NewSocket(address) if err != nil { return "", err } defer socket.Close() f, err := socket.File() if err != nil { return "", err } defer f.Close() cmd.ExtraFiles = append(cmd.ExtraFiles, f) if err := cmd.Start(); err != nil { return "", err } defer func() { if err != nil { cmd.Process.Kill() } }() // make sure to wait after start go cmd.Wait() if err := shim.WritePidFile("shim.pid", cmd.Process.Pid); err != nil { return "", err } if err := shim.WriteAddress("address", address); err != nil { return "", err } if err := shim.SetScore(cmd.Process.Pid); err != nil { return "", errors.Wrap(err, "failed to set OOM Score on shim") } return address, nil } func (s *service) Cleanup(ctx context.Context) (*taskAPI.DeleteResponse, error) { path, err := os.Getwd() if err != nil { return nil, err } ns, err := namespaces.NamespaceRequired(ctx) if err != nil { return nil, err } runtime, err := s.readRuntime(path) if err != nil { return nil, err } r := proc.NewRunc(proc.RuncRoot, path, ns, runtime, "", false) if err := r.Delete(ctx, s.id, &runcC.DeleteOpts{ Force: true, }); err != nil { logrus.WithError(err).Warn("failed to remove runc container") } if err := mount.UnmountAll(filepath.Join(path, "rootfs"), 0); err != nil { logrus.WithError(err).Warn("failed to cleanup rootfs mount") } return &taskAPI.DeleteResponse{ ExitedAt: time.Now(), ExitStatus: 128 + uint32(unix.SIGKILL), }, nil } func (s *service) readRuntime(path string) (string, error) { data, err := ioutil.ReadFile(filepath.Join(path, "runtime")) if err != nil { return "", err } return string(data), nil } func (s *service) writeRuntime(path, runtime string) error { return ioutil.WriteFile(filepath.Join(path, "runtime"), []byte(runtime), 0600) } // Create a new initial process and container with the underlying OCI runtime func (s *service) Create(ctx context.Context, r *taskAPI.CreateTaskRequest) (_ *taskAPI.CreateTaskResponse, err error) { s.mu.Lock() defer s.mu.Unlock() ns, err := namespaces.NamespaceRequired(ctx) if err != nil { return nil, errors.Wrap(err, "create namespace") } var opts options.Options if r.Options != nil { v, err := typeurl.UnmarshalAny(r.Options) if err != nil { return nil, err } opts = *v.(*options.Options) } var mounts []proc.Mount for _, m := range r.Rootfs { mounts = append(mounts, proc.Mount{ Type: m.Type, Source: m.Source, Target: m.Target, Options: m.Options, }) } config := &proc.CreateConfig{ ID: r.ID, Bundle: r.Bundle, Runtime: opts.BinaryName, Rootfs: mounts, Terminal: r.Terminal, Stdin: r.Stdin, Stdout: r.Stdout, Stderr: r.Stderr, Checkpoint: r.Checkpoint, ParentCheckpoint: r.ParentCheckpoint, Options: r.Options, } if err := s.writeRuntime(r.Bundle, opts.BinaryName); err != nil { return nil, err } rootfs := filepath.Join(r.Bundle, "rootfs") defer func() { if err != nil { if err2 := mount.UnmountAll(rootfs, 0); err2 != nil { logrus.WithError(err2).Warn("failed to cleanup rootfs mount") } } }() for _, rm := range mounts { m := &mount.Mount{ Type: rm.Type, Source: rm.Source, Options: rm.Options, } if err := m.Mount(rootfs); err != nil { return nil, errors.Wrapf(err, "failed to mount rootfs component %v", m) } } process, err := newInit( ctx, r.Bundle, filepath.Join(r.Bundle, "work"), ns, s.platform, config, &opts, ) if err != nil { return nil, errdefs.ToGRPC(err) } if err := process.Create(ctx, config); err != nil { return nil, errdefs.ToGRPC(err) } // save the main task id and bundle to the shim for additional requests s.id = r.ID s.bundle = r.Bundle pid := process.Pid() if pid > 0 { cg, err := cgroups.Load(cgroups.V1, cgroups.PidPath(pid)) if err != nil { logrus.WithError(err).Errorf("loading cgroup for %d", pid) } s.cg = cg } s.task = process return &taskAPI.CreateTaskResponse{ Pid: uint32(pid), }, nil } // Start a process func (s *service) Start(ctx context.Context, r *taskAPI.StartRequest) (*taskAPI.StartResponse, error) { p, err := s.getProcess(r.ExecID) if err != nil { return nil, err } if err := p.Start(ctx); err != nil { return nil, err } // case for restore if s.getCgroup() == nil && p.Pid() > 0 { cg, err := cgroups.Load(cgroups.V1, cgroups.PidPath(p.Pid())) if err != nil { logrus.WithError(err).Errorf("loading cgroup for %d", p.Pid()) } s.setCgroup(cg) } return &taskAPI.StartResponse{ Pid: uint32(p.Pid()), }, nil } // Delete the initial process and container func (s *service) Delete(ctx context.Context, r *taskAPI.DeleteRequest) (*taskAPI.DeleteResponse, error) { p, err := s.getProcess(r.ExecID) if err != nil { return nil, err } if p == nil { return nil, errdefs.ToGRPCf(errdefs.ErrFailedPrecondition, "container must be created") } if err := p.Delete(ctx); err != nil { return nil, err } isTask := r.ExecID == "" if !isTask { s.mu.Lock() delete(s.processes, r.ExecID) s.mu.Unlock() } if isTask && s.platform != nil { s.platform.Close() } return &taskAPI.DeleteResponse{ ExitStatus: uint32(p.ExitStatus()), ExitedAt: p.ExitedAt(), Pid: uint32(p.Pid()), }, nil } // Exec an additional process inside the container func (s *service) Exec(ctx context.Context, r *taskAPI.ExecProcessRequest) (*ptypes.Empty, error) { s.mu.Lock() p := s.processes[r.ExecID] s.mu.Unlock() if p != nil { return nil, errdefs.ToGRPCf(errdefs.ErrAlreadyExists, "id %s", r.ExecID) } p = s.task if p == nil { return nil, errdefs.ToGRPCf(errdefs.ErrFailedPrecondition, "container must be created") } process, err := p.(*proc.Init).Exec(ctx, s.bundle, &proc.ExecConfig{ ID: r.ExecID, Terminal: r.Terminal, Stdin: r.Stdin, Stdout: r.Stdout, Stderr: r.Stderr, Spec: r.Spec, }) if err != nil { return nil, errdefs.ToGRPC(err) } s.mu.Lock() s.processes[r.ExecID] = process s.mu.Unlock() return empty, nil } // ResizePty of a process func (s *service) ResizePty(ctx context.Context, r *taskAPI.ResizePtyRequest) (*ptypes.Empty, error) { p, err := s.getProcess(r.ExecID) if err != nil { return nil, err } ws := console.WinSize{ Width: uint16(r.Width), Height: uint16(r.Height), } if err := p.Resize(ws); err != nil { return nil, errdefs.ToGRPC(err) } return empty, nil } // State returns runtime state information for a process func (s *service) State(ctx context.Context, r *taskAPI.StateRequest) (*taskAPI.StateResponse, error) { p, err := s.getProcess(r.ExecID) if err != nil { return nil, err } st, err := p.Status(ctx) if err != nil { return nil, err } status := task.StatusUnknown switch st { case "created": status = task.StatusCreated case "running": status = task.StatusRunning case "stopped": status = task.StatusStopped case "paused": status = task.StatusPaused case "pausing": status = task.StatusPausing } sio := p.Stdio() return &taskAPI.StateResponse{ ID: p.ID(), Bundle: s.bundle, Pid: uint32(p.Pid()), Status: status, Stdin: sio.Stdin, Stdout: sio.Stdout, Stderr: sio.Stderr, Terminal: sio.Terminal, ExitStatus: uint32(p.ExitStatus()), ExitedAt: p.ExitedAt(), }, nil } // Pause the container func (s *service) Pause(ctx context.Context, r *taskAPI.PauseRequest) (*ptypes.Empty, error) { s.mu.Lock() p := s.task s.mu.Unlock() if p == nil { return nil, errdefs.ToGRPCf(errdefs.ErrFailedPrecondition, "container must be created") } if err := p.(*proc.Init).Pause(ctx); err != nil { return nil, err } return empty, nil } // Resume the container func (s *service) Resume(ctx context.Context, r *taskAPI.ResumeRequest) (*ptypes.Empty, error) { s.mu.Lock() p := s.task s.mu.Unlock() if p == nil { return nil, errdefs.ToGRPCf(errdefs.ErrFailedPrecondition, "container must be created") } if err := p.(*proc.Init).Resume(ctx); err != nil { return nil, err } return empty, nil } // Kill a process with the provided signal func (s *service) Kill(ctx context.Context, r *taskAPI.KillRequest) (*ptypes.Empty, error) { p, err := s.getProcess(r.ExecID) if err != nil { return nil, err } if p == nil { return nil, errdefs.ToGRPCf(errdefs.ErrFailedPrecondition, "container must be created") } if err := p.Kill(ctx, r.Signal, r.All); err != nil { return nil, errdefs.ToGRPC(err) } return empty, nil } // Pids returns all pids inside the container func (s *service) Pids(ctx context.Context, r *taskAPI.PidsRequest) (*taskAPI.PidsResponse, error) { pids, err := s.getContainerPids(ctx, r.ID) if err != nil { return nil, errdefs.ToGRPC(err) } var processes []*task.ProcessInfo for _, pid := range pids { pInfo := task.ProcessInfo{ Pid: pid, } for _, p := range s.processes { if p.Pid() == int(pid) { d := &options.ProcessDetails{ ExecID: p.ID(), } a, err := typeurl.MarshalAny(d) if err != nil { return nil, errors.Wrapf(err, "failed to marshal process %d info", pid) } pInfo.Info = a break } } processes = append(processes, &pInfo) } return &taskAPI.PidsResponse{ Processes: processes, }, nil } // CloseIO of a process func (s *service) CloseIO(ctx context.Context, r *taskAPI.CloseIORequest) (*ptypes.Empty, error) { p, err := s.getProcess(r.ExecID) if err != nil { return nil, err } if stdin := p.Stdin(); stdin != nil { if err := stdin.Close(); err != nil { return nil, errors.Wrap(err, "close stdin") } } return empty, nil } // Checkpoint the container func (s *service) Checkpoint(ctx context.Context, r *taskAPI.CheckpointTaskRequest) (*ptypes.Empty, error) { s.mu.Lock() p := s.task s.mu.Unlock() if p == nil { return nil, errdefs.ToGRPCf(errdefs.ErrFailedPrecondition, "container must be created") } var opts options.CheckpointOptions if r.Options != nil { v, err := typeurl.UnmarshalAny(r.Options) if err != nil { return nil, err } opts = *v.(*options.CheckpointOptions) } if err := p.(*proc.Init).Checkpoint(ctx, &proc.CheckpointConfig{ Path: r.Path, Exit: opts.Exit, AllowOpenTCP: opts.OpenTcp, AllowExternalUnixSockets: opts.ExternalUnixSockets, AllowTerminal: opts.Terminal, FileLocks: opts.FileLocks, EmptyNamespaces: opts.EmptyNamespaces, }); err != nil { return nil, errdefs.ToGRPC(err) } return empty, nil } // Connect returns shim information such as the shim's pid func (s *service) Connect(ctx context.Context, r *taskAPI.ConnectRequest) (*taskAPI.ConnectResponse, error) { var pid int if s.task != nil { pid = s.task.Pid() } return &taskAPI.ConnectResponse{ ShimPid: uint32(os.Getpid()), TaskPid: uint32(pid), }, nil } func (s *service) Shutdown(ctx context.Context, r *taskAPI.ShutdownRequest) (*ptypes.Empty, error) { s.cancel() os.Exit(0) return empty, nil } func (s *service) Stats(ctx context.Context, r *taskAPI.StatsRequest) (*taskAPI.StatsResponse, error) { cg := s.getCgroup() if cg == nil { return nil, errdefs.ToGRPCf(errdefs.ErrNotFound, "cgroup does not exist") } stats, err := cg.Stat(cgroups.IgnoreNotExist) if err != nil { return nil, err } data, err := typeurl.MarshalAny(stats) if err != nil { return nil, err } return &taskAPI.StatsResponse{ Stats: data, }, nil } // Update a running container func (s *service) Update(ctx context.Context, r *taskAPI.UpdateTaskRequest) (*ptypes.Empty, error) { s.mu.Lock() p := s.task s.mu.Unlock() if p == nil { return nil, errdefs.ToGRPCf(errdefs.ErrFailedPrecondition, "container must be created") } if err := p.(*proc.Init).Update(ctx, r.Resources); err != nil { return nil, errdefs.ToGRPC(err) } return empty, nil } // Wait for a process to exit func (s *service) Wait(ctx context.Context, r *taskAPI.WaitRequest) (*taskAPI.WaitResponse, error) { p, err := s.getProcess(r.ExecID) if err != nil { return nil, err } if p == nil { return nil, errdefs.ToGRPCf(errdefs.ErrFailedPrecondition, "container must be created") } p.Wait() return &taskAPI.WaitResponse{ ExitStatus: uint32(p.ExitStatus()), ExitedAt: p.ExitedAt(), }, nil } func (s *service) processExits() { for e := range s.ec { s.checkProcesses(e) } } func (s *service) checkProcesses(e runcC.Exit) { shouldKillAll, err := shouldKillAllOnExit(s.bundle) if err != nil { log.G(s.context).WithError(err).Error("failed to check shouldKillAll") } for _, p := range s.allProcesses() { if p.Pid() == e.Pid { if shouldKillAll { if ip, ok := p.(*proc.Init); ok { // Ensure all children are killed if err := ip.KillAll(s.context); err != nil { logrus.WithError(err).WithField("id", ip.ID()). Error("failed to kill init's children") } } } p.SetExited(e.Status) s.events <- &eventstypes.TaskExit{ ContainerID: s.id, ID: p.ID(), Pid: uint32(e.Pid), ExitStatus: uint32(e.Status), ExitedAt: p.ExitedAt(), } return } } } func shouldKillAllOnExit(bundlePath string) (bool, error) { var bundleSpec specs.Spec bundleConfigContents, err := ioutil.ReadFile(filepath.Join(bundlePath, "config.json")) if err != nil { return false, err } json.Unmarshal(bundleConfigContents, &bundleSpec) if bundleSpec.Linux != nil { for _, ns := range bundleSpec.Linux.Namespaces { if ns.Type == specs.PIDNamespace { return false, nil } } } return true, nil } func (s *service) allProcesses() (o []rproc.Process) { s.mu.Lock() defer s.mu.Unlock() for _, p := range s.processes { o = append(o, p) } if s.task != nil { o = append(o, s.task) } return o } func (s *service) getContainerPids(ctx context.Context, id string) ([]uint32, error) { s.mu.Lock() p := s.task s.mu.Unlock() if p == nil { return nil, errors.Wrapf(errdefs.ErrFailedPrecondition, "container must be created") } ps, err := p.(*proc.Init).Runtime().Ps(ctx, id) if err != nil { return nil, err } pids := make([]uint32, 0, len(ps)) for _, pid := range ps { pids = append(pids, uint32(pid)) } return pids, nil } func (s *service) forward(publisher events.Publisher) { for e := range s.events { ctx, cancel := context.WithTimeout(s.context, 5*time.Second) err := publisher.Publish(ctx, getTopic(e), e) cancel() if err != nil { logrus.WithError(err).Error("post event") } } } func (s *service) getProcess(execID string) (rproc.Process, error) { s.mu.Lock() defer s.mu.Unlock() if execID == "" { return s.task, nil } p := s.processes[execID] if p == nil { return nil, errdefs.ToGRPCf(errdefs.ErrNotFound, "process does not exist %s", execID) } return p, nil } func (s *service) getCgroup() cgroups.Cgroup { s.mu.Lock() defer s.mu.Unlock() return s.cg } func (s *service) setCgroup(cg cgroups.Cgroup) { s.mu.Lock() s.cg = cg s.mu.Unlock() if err := s.ep.add(s.id, cg); err != nil { logrus.WithError(err).Error("add cg to OOM monitor") } } func getTopic(e interface{}) string { switch e.(type) { case *eventstypes.TaskCreate: return runtime.TaskCreateEventTopic case *eventstypes.TaskStart: return runtime.TaskStartEventTopic case *eventstypes.TaskOOM: return runtime.TaskOOMEventTopic case *eventstypes.TaskExit: return runtime.TaskExitEventTopic case *eventstypes.TaskDelete: return runtime.TaskDeleteEventTopic case *eventstypes.TaskExecAdded: return runtime.TaskExecAddedEventTopic case *eventstypes.TaskExecStarted: return runtime.TaskExecStartedEventTopic case *eventstypes.TaskPaused: return runtime.TaskPausedEventTopic case *eventstypes.TaskResumed: return runtime.TaskResumedEventTopic case *eventstypes.TaskCheckpointed: return runtime.TaskCheckpointedEventTopic default: logrus.Warnf("no topic for type %#v", e) } return runtime.TaskUnknownTopic } func newInit(ctx context.Context, path, workDir, namespace string, platform rproc.Platform, r *proc.CreateConfig, options *options.Options) (*proc.Init, error) { rootfs := filepath.Join(path, "rootfs") runtime := proc.NewRunc(options.Root, path, namespace, options.BinaryName, options.CriuPath, options.SystemdCgroup) p := proc.New(r.ID, runtime, rproc.Stdio{ Stdin: r.Stdin, Stdout: r.Stdout, Stderr: r.Stderr, Terminal: r.Terminal, }) p.Bundle = r.Bundle p.Platform = platform p.Rootfs = rootfs p.WorkDir = workDir p.IoUID = int(options.IoUid) p.IoGID = int(options.IoGid) p.NoPivotRoot = options.NoPivotRoot p.NoNewKeyring = options.NoNewKeyring return p, nil }