Update libcontainer dependency

This commit is contained in:
Buddha Prakash
2016-07-10 20:29:06 -07:00
parent 710374b65f
commit f6186afe99
60 changed files with 2415 additions and 946 deletions

View File

@@ -15,30 +15,35 @@ import (
"strings"
"sync"
"syscall"
"time"
"github.com/Sirupsen/logrus"
"github.com/golang/protobuf/proto"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/criurpc"
"github.com/opencontainers/runc/libcontainer/system"
"github.com/opencontainers/runc/libcontainer/utils"
"github.com/syndtr/gocapability/capability"
"github.com/vishvananda/netlink/nl"
)
const stdioFdCount = 3
type linuxContainer struct {
id string
root string
config *configs.Config
cgroupManager cgroups.Manager
initPath string
initArgs []string
initProcess parentProcess
criuPath string
m sync.Mutex
criuVersion int
state containerState
id string
root string
config *configs.Config
cgroupManager cgroups.Manager
initPath string
initArgs []string
initProcess parentProcess
initProcessStartTime string
criuPath string
m sync.Mutex
criuVersion int
state containerState
created time.Time
}
// State represents a running container's state
@@ -59,7 +64,7 @@ type State struct {
ExternalDescriptors []string `json:"external_descriptors,omitempty"`
}
// A libcontainer container object.
// Container is a libcontainer container object.
//
// Each container is thread-safe within the same process. Since a container can
// be destroyed by a separate process, any function may return that the container
@@ -75,13 +80,13 @@ type Container interface {
// Systemerror - System error.
Checkpoint(criuOpts *CriuOpts) error
// Restore restores the checkpointed container to a running state using the criu(8) utiity.
// Restore restores the checkpointed container to a running state using the criu(8) utility.
//
// errors:
// Systemerror - System error.
Restore(process *Process, criuOpts *CriuOpts) error
// If the Container state is RUNNING or PAUSING, sets the Container state to PAUSING and pauses
// If the Container state is RUNNING, sets the Container state to PAUSING and pauses
// the execution of any user processes. Asynchronously, when the container finished being paused the
// state is changed to PAUSED.
// If the Container state is PAUSED, do nothing.
@@ -138,7 +143,7 @@ func (c *linuxContainer) State() (*State, error) {
func (c *linuxContainer) Processes() ([]int, error) {
pids, err := c.cgroupManager.GetAllPids()
if err != nil {
return nil, newSystemError(err)
return nil, newSystemErrorWithCause(err, "getting all container pids from cgroups")
}
return pids, nil
}
@@ -149,14 +154,14 @@ func (c *linuxContainer) Stats() (*Stats, error) {
stats = &Stats{}
)
if stats.CgroupStats, err = c.cgroupManager.GetStats(); err != nil {
return stats, newSystemError(err)
return stats, newSystemErrorWithCause(err, "getting container stats from cgroups")
}
for _, iface := range c.config.Networks {
switch iface.Type {
case "veth":
istats, err := getNetworkInterfaceStats(iface.HostInterfaceName)
if err != nil {
return stats, newSystemError(err)
return stats, newSystemErrorWithCausef(err, "getting network stats for interface %q", iface.HostInterfaceName)
}
stats.Interfaces = append(stats.Interfaces, istats)
}
@@ -167,6 +172,13 @@ func (c *linuxContainer) Stats() (*Stats, error) {
func (c *linuxContainer) Set(config configs.Config) error {
c.m.Lock()
defer c.m.Unlock()
status, err := c.currentStatus()
if err != nil {
return err
}
if status == Stopped {
return newGenericError(fmt.Errorf("container not running"), ContainerNotRunning)
}
c.config = &config
return c.cgroupManager.Set(c.config)
}
@@ -178,38 +190,90 @@ func (c *linuxContainer) Start(process *Process) error {
if err != nil {
return err
}
doInit := status == Destroyed
parent, err := c.newParentProcess(process, doInit)
return c.start(process, status == Stopped)
}
func (c *linuxContainer) Run(process *Process) error {
c.m.Lock()
defer c.m.Unlock()
status, err := c.currentStatus()
if err != nil {
return newSystemError(err)
return err
}
if err := c.start(process, status == Stopped); err != nil {
return err
}
if status == Stopped {
return c.exec()
}
return nil
}
func (c *linuxContainer) Exec() error {
c.m.Lock()
defer c.m.Unlock()
return c.exec()
}
func (c *linuxContainer) exec() error {
path := filepath.Join(c.root, execFifoFilename)
f, err := os.OpenFile(path, os.O_RDONLY, 0)
if err != nil {
return newSystemErrorWithCause(err, "open exec fifo for reading")
}
defer f.Close()
data, err := ioutil.ReadAll(f)
if err != nil {
return err
}
if len(data) > 0 {
os.Remove(path)
return nil
}
return fmt.Errorf("cannot start an already running container")
}
func (c *linuxContainer) start(process *Process, isInit bool) error {
parent, err := c.newParentProcess(process, isInit)
if err != nil {
return newSystemErrorWithCause(err, "creating new parent process")
}
if err := parent.start(); err != nil {
// terminate the process to ensure that it properly is reaped.
if err := parent.terminate(); err != nil {
logrus.Warn(err)
}
return newSystemError(err)
return newSystemErrorWithCause(err, "starting container process")
}
// generate a timestamp indicating when the container was started
c.created = time.Now().UTC()
c.state = &runningState{
c: c,
}
if doInit {
if err := c.updateState(parent); err != nil {
if isInit {
c.state = &createdState{
c: c,
}
state, err := c.updateState(parent)
if err != nil {
return err
}
c.initProcessStartTime = state.InitProcessStartTime
if c.config.Hooks != nil {
s := configs.HookState{
Version: c.config.Version,
ID: c.id,
Pid: parent.pid(),
Root: c.config.Rootfs,
Version: c.config.Version,
ID: c.id,
Pid: parent.pid(),
Root: c.config.Rootfs,
BundlePath: utils.SearchLabels(c.config.Labels, "bundle"),
}
for _, hook := range c.config.Hooks.Poststart {
for i, hook := range c.config.Hooks.Poststart {
if err := hook.Run(s); err != nil {
if err := parent.terminate(); err != nil {
logrus.Warn(err)
}
return newSystemError(err)
return newSystemErrorWithCausef(err, "running poststart hook %d", i)
}
}
}
@@ -219,7 +283,7 @@ func (c *linuxContainer) Start(process *Process) error {
func (c *linuxContainer) Signal(s os.Signal) error {
if err := c.initProcess.signal(s); err != nil {
return newSystemError(err)
return newSystemErrorWithCause(err, "signaling init process")
}
return nil
}
@@ -227,19 +291,23 @@ func (c *linuxContainer) Signal(s os.Signal) error {
func (c *linuxContainer) newParentProcess(p *Process, doInit bool) (parentProcess, error) {
parentPipe, childPipe, err := newPipe()
if err != nil {
return nil, newSystemError(err)
return nil, newSystemErrorWithCause(err, "creating new init pipe")
}
cmd, err := c.commandTemplate(p, childPipe)
rootDir, err := os.Open(c.root)
if err != nil {
return nil, newSystemError(err)
return nil, err
}
cmd, err := c.commandTemplate(p, childPipe, rootDir)
if err != nil {
return nil, newSystemErrorWithCause(err, "creating new command template")
}
if !doInit {
return c.newSetnsProcess(p, cmd, parentPipe, childPipe)
return c.newSetnsProcess(p, cmd, parentPipe, childPipe, rootDir)
}
return c.newInitProcess(p, cmd, parentPipe, childPipe)
return c.newInitProcess(p, cmd, parentPipe, childPipe, rootDir)
}
func (c *linuxContainer) commandTemplate(p *Process, childPipe *os.File) (*exec.Cmd, error) {
func (c *linuxContainer) commandTemplate(p *Process, childPipe, rootDir *os.File) (*exec.Cmd, error) {
cmd := &exec.Cmd{
Path: c.initPath,
Args: c.initArgs,
@@ -251,8 +319,10 @@ func (c *linuxContainer) commandTemplate(p *Process, childPipe *os.File) (*exec.
if cmd.SysProcAttr == nil {
cmd.SysProcAttr = &syscall.SysProcAttr{}
}
cmd.ExtraFiles = append(p.ExtraFiles, childPipe)
cmd.Env = append(cmd.Env, fmt.Sprintf("_LIBCONTAINER_INITPIPE=%d", stdioFdCount+len(cmd.ExtraFiles)-1))
cmd.ExtraFiles = append(p.ExtraFiles, childPipe, rootDir)
cmd.Env = append(cmd.Env,
fmt.Sprintf("_LIBCONTAINER_INITPIPE=%d", stdioFdCount+len(cmd.ExtraFiles)-2),
fmt.Sprintf("_LIBCONTAINER_STATEDIR=%d", stdioFdCount+len(cmd.ExtraFiles)-1))
// NOTE: when running a container with no PID namespace and the parent process spawning the container is
// PID1 the pdeathsig is being delivered to the container's init process by the kernel for some reason
// even with the parent still running.
@@ -262,38 +332,42 @@ func (c *linuxContainer) commandTemplate(p *Process, childPipe *os.File) (*exec.
return cmd, nil
}
func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe *os.File) (*initProcess, error) {
t := "_LIBCONTAINER_INITTYPE=" + string(initStandard)
cloneFlags := c.config.Namespaces.CloneFlags()
if cloneFlags&syscall.CLONE_NEWUSER != 0 {
if err := c.addUidGidMappings(cmd.SysProcAttr); err != nil {
// user mappings are not supported
return nil, err
}
enableSetgroups(cmd.SysProcAttr)
// Default to root user when user namespaces are enabled.
if cmd.SysProcAttr.Credential == nil {
cmd.SysProcAttr.Credential = &syscall.Credential{}
func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe, rootDir *os.File) (*initProcess, error) {
cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initStandard))
nsMaps := make(map[configs.NamespaceType]string)
for _, ns := range c.config.Namespaces {
if ns.Path != "" {
nsMaps[ns.Type] = ns.Path
}
}
cmd.Env = append(cmd.Env, t)
cmd.SysProcAttr.Cloneflags = cloneFlags
_, sharePidns := nsMaps[configs.NEWPID]
data, err := c.bootstrapData(c.config.Namespaces.CloneFlags(), nsMaps, "")
if err != nil {
return nil, err
}
return &initProcess{
cmd: cmd,
childPipe: childPipe,
parentPipe: parentPipe,
manager: c.cgroupManager,
config: c.newInitConfig(p),
container: c,
process: p,
cmd: cmd,
childPipe: childPipe,
parentPipe: parentPipe,
manager: c.cgroupManager,
config: c.newInitConfig(p),
container: c,
process: p,
bootstrapData: data,
sharePidns: sharePidns,
rootDir: rootDir,
}, nil
}
func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe *os.File) (*setnsProcess, error) {
func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe, rootDir *os.File) (*setnsProcess, error) {
cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initSetns))
state, err := c.currentState()
if err != nil {
return nil, newSystemErrorWithCause(err, "getting container's current state")
}
// for setns process, we dont have to set cloneflags as the process namespaces
// will only be set via setns syscall
data, err := c.bootstrapData(0, c.initProcess.pid(), p.consolePath)
data, err := c.bootstrapData(0, state.NamespacePaths, p.consolePath)
if err != nil {
return nil, err
}
@@ -306,11 +380,12 @@ func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, parentPipe,
config: c.newInitConfig(p),
process: p,
bootstrapData: data,
rootDir: rootDir,
}, nil
}
func (c *linuxContainer) newInitConfig(process *Process) *initConfig {
return &initConfig{
cfg := &initConfig{
Config: c.config,
Args: process.Args,
Env: process.Env,
@@ -319,7 +394,26 @@ func (c *linuxContainer) newInitConfig(process *Process) *initConfig {
Console: process.consolePath,
Capabilities: process.Capabilities,
PassedFilesCount: len(process.ExtraFiles),
ContainerId: c.ID(),
NoNewPrivileges: c.config.NoNewPrivileges,
AppArmorProfile: c.config.AppArmorProfile,
ProcessLabel: c.config.ProcessLabel,
Rlimits: c.config.Rlimits,
ExecFifoPath: filepath.Join(c.root, execFifoFilename),
}
if process.NoNewPrivileges != nil {
cfg.NoNewPrivileges = *process.NoNewPrivileges
}
if process.AppArmorProfile != "" {
cfg.AppArmorProfile = process.AppArmorProfile
}
if process.Label != "" {
cfg.ProcessLabel = process.Label
}
if len(process.Rlimits) > 0 {
cfg.Rlimits = process.Rlimits
}
return cfg
}
func newPipe() (parent *os.File, child *os.File, err error) {
@@ -343,15 +437,16 @@ func (c *linuxContainer) Pause() error {
if err != nil {
return err
}
if status != Running {
return newGenericError(fmt.Errorf("container not running"), ContainerNotRunning)
switch status {
case Running, Created:
if err := c.cgroupManager.Freeze(configs.Frozen); err != nil {
return err
}
return c.state.transition(&pausedState{
c: c,
})
}
if err := c.cgroupManager.Freeze(configs.Frozen); err != nil {
return err
}
return c.state.transition(&pausedState{
c: c,
})
return newGenericError(fmt.Errorf("container not running: %s", status), ContainerNotRunning)
}
func (c *linuxContainer) Resume() error {
@@ -380,23 +475,13 @@ func (c *linuxContainer) NotifyMemoryPressure(level PressureLevel) (<-chan struc
return notifyMemoryPressure(c.cgroupManager.GetPaths(), level)
}
// XXX debug support, remove when debugging done.
func addArgsFromEnv(evar string, args *[]string) {
if e := os.Getenv(evar); e != "" {
for _, f := range strings.Fields(e) {
*args = append(*args, f)
}
}
fmt.Printf(">>> criu %v\n", *args)
}
// check Criu version greater than or equal to min_version
func (c *linuxContainer) checkCriuVersion(min_version string) error {
// checkCriuVersion checks Criu version greater than or equal to minVersion
func (c *linuxContainer) checkCriuVersion(minVersion string) error {
var x, y, z, versionReq int
_, err := fmt.Sscanf(min_version, "%d.%d.%d\n", &x, &y, &z) // 1.5.2
_, err := fmt.Sscanf(minVersion, "%d.%d.%d\n", &x, &y, &z) // 1.5.2
if err != nil {
_, err = fmt.Sscanf(min_version, "Version: %d.%d\n", &x, &y) // 1.6
_, err = fmt.Sscanf(minVersion, "Version: %d.%d\n", &x, &y) // 1.6
}
versionReq = x*10000 + y*100 + z
@@ -441,7 +526,7 @@ func (c *linuxContainer) checkCriuVersion(min_version string) error {
c.criuVersion = x*10000 + y*100 + z
if c.criuVersion < versionReq {
return fmt.Errorf("CRIU version must be %s or higher", min_version)
return fmt.Errorf("CRIU version must be %s or higher", minVersion)
}
return nil
@@ -514,6 +599,7 @@ func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error {
TcpEstablished: proto.Bool(criuOpts.TcpEstablished),
ExtUnixSk: proto.Bool(criuOpts.ExternalUnixConnections),
FileLocks: proto.Bool(criuOpts.FileLocks),
EmptyNs: proto.Uint32(criuOpts.EmptyNs),
}
// append optional criu opts, e.g., page-server and port
@@ -529,7 +615,8 @@ func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error {
if err := c.checkCriuVersion("1.7"); err != nil {
return err
}
rpcOpts.ManageCgroupsMode = proto.Uint32(uint32(criuOpts.ManageCgroupsMode))
mode := criurpc.CriuCgMode(criuOpts.ManageCgroupsMode)
rpcOpts.ManageCgroupsMode = &mode
}
t := criurpc.CriuReqType_DUMP
@@ -587,6 +674,27 @@ func (c *linuxContainer) addCriuRestoreMount(req *criurpc.CriuReq, m *configs.Mo
req.Opts.ExtMnt = append(req.Opts.ExtMnt, extMnt)
}
func (c *linuxContainer) restoreNetwork(req *criurpc.CriuReq, criuOpts *CriuOpts) {
for _, iface := range c.config.Networks {
switch iface.Type {
case "veth":
veth := new(criurpc.CriuVethPair)
veth.IfOut = proto.String(iface.HostInterfaceName)
veth.IfIn = proto.String(iface.Name)
req.Opts.Veths = append(req.Opts.Veths, veth)
break
case "loopback":
break
}
}
for _, i := range criuOpts.VethPairs {
veth := new(criurpc.CriuVethPair)
veth.IfOut = proto.String(i.HostInterfaceName)
veth.IfIn = proto.String(i.ContainerInterfaceName)
req.Opts.Veths = append(req.Opts.Veths, veth)
}
}
func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
c.m.Lock()
defer c.m.Unlock()
@@ -650,6 +758,7 @@ func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
ExtUnixSk: proto.Bool(criuOpts.ExternalUnixConnections),
TcpEstablished: proto.Bool(criuOpts.TcpEstablished),
FileLocks: proto.Bool(criuOpts.FileLocks),
EmptyNs: proto.Uint32(criuOpts.EmptyNs),
},
}
@@ -669,23 +778,9 @@ func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
break
}
}
for _, iface := range c.config.Networks {
switch iface.Type {
case "veth":
veth := new(criurpc.CriuVethPair)
veth.IfOut = proto.String(iface.HostInterfaceName)
veth.IfIn = proto.String(iface.Name)
req.Opts.Veths = append(req.Opts.Veths, veth)
break
case "loopback":
break
}
}
for _, i := range criuOpts.VethPairs {
veth := new(criurpc.CriuVethPair)
veth.IfOut = proto.String(i.HostInterfaceName)
veth.IfIn = proto.String(i.ContainerInterfaceName)
req.Opts.Veths = append(req.Opts.Veths, veth)
if criuOpts.EmptyNs&syscall.CLONE_NEWNET == 0 {
c.restoreNetwork(req, criuOpts)
}
// append optional manage cgroups mode
@@ -693,7 +788,8 @@ func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
if err := c.checkCriuVersion("1.7"); err != nil {
return err
}
req.Opts.ManageCgroupsMode = proto.Uint32(uint32(criuOpts.ManageCgroupsMode))
mode := criurpc.CriuCgMode(criuOpts.ManageCgroupsMode)
req.Opts.ManageCgroupsMode = &mode
}
var (
@@ -850,7 +946,7 @@ func (c *linuxContainer) criuSwrk(process *Process, req *criurpc.CriuReq, opts *
if err != nil {
return err
}
n, err = criuClient.Write(data)
_, err = criuClient.Write(data)
if err != nil {
return err
}
@@ -925,6 +1021,20 @@ func (c *linuxContainer) criuNotifications(resp *criurpc.CriuResp, process *Proc
if err := lockNetwork(c.config); err != nil {
return err
}
case notify.GetScript() == "setup-namespaces":
if c.config.Hooks != nil {
s := configs.HookState{
Version: c.config.Version,
ID: c.id,
Pid: int(notify.GetPid()),
Root: c.config.Rootfs,
}
for i, hook := range c.config.Hooks.Prestart {
if err := hook.Run(s); err != nil {
return newSystemErrorWithCausef(err, "running prestart hook %d", i)
}
}
}
case notify.GetScript() == "post-restore":
pid := notify.GetPid()
r, err := newRestoredProcess(int(pid), fds)
@@ -938,7 +1048,7 @@ func (c *linuxContainer) criuNotifications(resp *criurpc.CriuResp, process *Proc
}); err != nil {
return err
}
if err := c.updateState(r); err != nil {
if _, err := c.updateState(r); err != nil {
return err
}
if err := os.Remove(filepath.Join(c.root, "checkpoint")); err != nil {
@@ -950,13 +1060,17 @@ func (c *linuxContainer) criuNotifications(resp *criurpc.CriuResp, process *Proc
return nil
}
func (c *linuxContainer) updateState(process parentProcess) error {
func (c *linuxContainer) updateState(process parentProcess) (*State, error) {
c.initProcess = process
state, err := c.currentState()
if err != nil {
return err
return nil, err
}
return c.saveState(state)
err = c.saveState(state)
if err != nil {
return nil, err
}
return state, nil
}
func (c *linuxContainer) saveState(s *State) error {
@@ -991,37 +1105,75 @@ func (c *linuxContainer) refreshState() error {
if paused {
return c.state.transition(&pausedState{c: c})
}
running, err := c.isRunning()
t, err := c.runType()
if err != nil {
return err
}
if running {
switch t {
case Created:
return c.state.transition(&createdState{c: c})
case Running:
return c.state.transition(&runningState{c: c})
}
return c.state.transition(&stoppedState{c: c})
}
func (c *linuxContainer) isRunning() (bool, error) {
if c.initProcess == nil {
// doesInitProcessExist checks if the init process is still the same process
// as the initial one, it could happen that the original process has exited
// and a new process has been created with the same pid, in this case, the
// container would already be stopped.
func (c *linuxContainer) doesInitProcessExist(initPid int) (bool, error) {
startTime, err := system.GetProcessStartTime(initPid)
if err != nil {
return false, newSystemErrorWithCausef(err, "getting init process %d start time", initPid)
}
if c.initProcessStartTime != startTime {
return false, nil
}
// return Running if the init process is alive
if err := syscall.Kill(c.initProcess.pid(), 0); err != nil {
if err == syscall.ESRCH {
return false, nil
}
return false, newSystemError(err)
}
return true, nil
}
func (c *linuxContainer) runType() (Status, error) {
if c.initProcess == nil {
return Stopped, nil
}
pid := c.initProcess.pid()
// return Running if the init process is alive
if err := syscall.Kill(pid, 0); err != nil {
if err == syscall.ESRCH {
// It means the process does not exist anymore, could happen when the
// process exited just when we call the function, we should not return
// error in this case.
return Stopped, nil
}
return Stopped, newSystemErrorWithCausef(err, "sending signal 0 to pid %d", pid)
}
// check if the process is still the original init process.
exist, err := c.doesInitProcessExist(pid)
if !exist || err != nil {
return Stopped, err
}
// check if the process that is running is the init process or the user's process.
// this is the difference between the container Running and Created.
environ, err := ioutil.ReadFile(fmt.Sprintf("/proc/%d/environ", pid))
if err != nil {
return Stopped, newSystemErrorWithCausef(err, "reading /proc/%d/environ", pid)
}
check := []byte("_LIBCONTAINER")
if bytes.Contains(environ, check) {
return Created, nil
}
return Running, nil
}
func (c *linuxContainer) isPaused() (bool, error) {
data, err := ioutil.ReadFile(filepath.Join(c.cgroupManager.GetPaths()["freezer"], "freezer.state"))
if err != nil {
// If freezer cgroup is not mounted, the container would just be not paused.
if os.IsNotExist(err) {
return false, nil
}
return false, newSystemError(err)
return false, newSystemErrorWithCause(err, "checking if container is paused")
}
return bytes.Equal(bytes.TrimSpace(data), []byte("FROZEN")), nil
}
@@ -1043,6 +1195,7 @@ func (c *linuxContainer) currentState() (*State, error) {
Config: *c.config,
InitProcessPid: pid,
InitProcessStartTime: startTime,
Created: c.created,
},
CgroupPaths: c.cgroupManager.GetPaths(),
NamespacePaths: make(map[configs.NamespaceType]string),
@@ -1053,6 +1206,9 @@ func (c *linuxContainer) currentState() (*State, error) {
state.NamespacePaths[ns.Type] = ns.GetPath(pid)
}
for _, nsType := range configs.NamespaceTypes() {
if !configs.IsNamespaceSupported(nsType) {
continue
}
if _, ok := state.NamespacePaths[nsType]; !ok {
ns := configs.Namespace{Type: nsType}
state.NamespacePaths[ns.Type] = ns.GetPath(pid)
@@ -1062,18 +1218,69 @@ func (c *linuxContainer) currentState() (*State, error) {
return state, nil
}
// bootstrapData encodes the necessary data in netlink binary format as a io.Reader.
// Consumer can write the data to a bootstrap program such as one that uses
// nsenter package to bootstrap the container's init process correctly, i.e. with
// correct namespaces, uid/gid mapping etc.
func (c *linuxContainer) bootstrapData(cloneFlags uintptr, pid int, consolePath string) (io.Reader, error) {
// orderNamespacePaths sorts namespace paths into a list of paths that we
// can setns in order.
func (c *linuxContainer) orderNamespacePaths(namespaces map[configs.NamespaceType]string) ([]string, error) {
paths := []string{}
nsTypes := []configs.NamespaceType{
configs.NEWIPC,
configs.NEWUTS,
configs.NEWNET,
configs.NEWPID,
configs.NEWNS,
}
// join userns if the init process explicitly requires NEWUSER
if c.config.Namespaces.Contains(configs.NEWUSER) {
nsTypes = append(nsTypes, configs.NEWUSER)
}
for _, nsType := range nsTypes {
if p, ok := namespaces[nsType]; ok && p != "" {
// check if the requested namespace is supported
if !configs.IsNamespaceSupported(nsType) {
return nil, newSystemError(fmt.Errorf("namespace %s is not supported", nsType))
}
// only set to join this namespace if it exists
if _, err := os.Lstat(p); err != nil {
return nil, newSystemErrorWithCausef(err, "running lstat on namespace path %q", p)
}
// do not allow namespace path with comma as we use it to separate
// the namespace paths
if strings.ContainsRune(p, ',') {
return nil, newSystemError(fmt.Errorf("invalid path %s", p))
}
paths = append(paths, p)
}
}
return paths, nil
}
func encodeIDMapping(idMap []configs.IDMap) ([]byte, error) {
data := bytes.NewBuffer(nil)
for _, im := range idMap {
line := fmt.Sprintf("%d %d %d\n", im.ContainerID, im.HostID, im.Size)
if _, err := data.WriteString(line); err != nil {
return nil, err
}
}
return data.Bytes(), nil
}
// bootstrapData encodes the necessary data in netlink binary format
// as a io.Reader.
// Consumer can write the data to a bootstrap program
// such as one that uses nsenter package to bootstrap the container's
// init process correctly, i.e. with correct namespaces, uid/gid
// mapping etc.
func (c *linuxContainer) bootstrapData(cloneFlags uintptr, nsMaps map[configs.NamespaceType]string, consolePath string) (io.Reader, error) {
// create the netlink message
r := nl.NewNetlinkRequest(int(InitMsg), 0)
// write pid
// write cloneFlags
r.AddData(&Int32msg{
Type: PidAttr,
Value: uint32(pid),
Type: CloneFlagsAttr,
Value: uint32(cloneFlags),
})
// write console path
if consolePath != "" {
r.AddData(&Bytemsg{
@@ -1081,5 +1288,57 @@ func (c *linuxContainer) bootstrapData(cloneFlags uintptr, pid int, consolePath
Value: []byte(consolePath),
})
}
// write custom namespace paths
if len(nsMaps) > 0 {
nsPaths, err := c.orderNamespacePaths(nsMaps)
if err != nil {
return nil, err
}
r.AddData(&Bytemsg{
Type: NsPathsAttr,
Value: []byte(strings.Join(nsPaths, ",")),
})
}
// write namespace paths only when we are not joining an existing user ns
_, joinExistingUser := nsMaps[configs.NEWUSER]
if !joinExistingUser {
// write uid mappings
if len(c.config.UidMappings) > 0 {
b, err := encodeIDMapping(c.config.UidMappings)
if err != nil {
return nil, err
}
r.AddData(&Bytemsg{
Type: UidmapAttr,
Value: b,
})
}
// write gid mappings
if len(c.config.GidMappings) > 0 {
b, err := encodeIDMapping(c.config.GidMappings)
if err != nil {
return nil, err
}
r.AddData(&Bytemsg{
Type: GidmapAttr,
Value: b,
})
// check if we have CAP_SETGID to setgroup properly
pid, err := capability.NewPid(os.Getpid())
if err != nil {
return nil, err
}
if !pid.Get(capability.EFFECTIVE, capability.CAP_SETGID) {
r.AddData(&Boolmsg{
Type: SetgroupAttr,
Value: true,
})
}
}
}
return bytes.NewReader(r.Serialize()), nil
}