Update libcontainer dependency

This commit is contained in:
Buddha Prakash
2016-07-10 20:29:06 -07:00
parent 710374b65f
commit f6186afe99
60 changed files with 2415 additions and 946 deletions

View File

@@ -76,7 +76,7 @@ config := &configs.Config{
Name: "test-container",
Parent: "system",
Resources: &configs.Resources{
MemorySwappiness: -1,
MemorySwappiness: nil,
AllowAllDevices: false,
AllowedDevices: configs.DefaultAllowedDevices,
},
@@ -133,15 +133,15 @@ config := &configs.Config{
UidMappings: []configs.IDMap{
{
ContainerID: 0,
Host: 1000,
size: 65536,
HostID: 1000,
Size: 65536,
},
},
GidMappings: []configs.IDMap{
{
ContainerID: 0,
Host: 1000,
size: 65536,
HostID: 1000,
Size: 65536,
},
},
Networks: []*configs.Network{
@@ -186,8 +186,8 @@ process := &libcontainer.Process{
err := container.Start(process)
if err != nil {
logrus.Fatal(err)
container.Destroy()
logrus.Fatal(err)
return
}
@@ -216,6 +216,12 @@ container.Pause()
// resume all paused processes.
container.Resume()
// send signal to container's init process.
container.Signal(signal)
// update container resource constraints.
container.Set(config)
```

View File

@@ -90,7 +90,7 @@ in tmpfs.
After `/dev/null` has been setup we check for any external links between
the container's io, STDIN, STDOUT, STDERR. If the container's io is pointing
to `/dev/null` outside the container we close and `dup2` the the `/dev/null`
to `/dev/null` outside the container we close and `dup2` the `/dev/null`
that is local to the container's rootfs.
@@ -142,6 +142,7 @@ system resources like cpu, memory, and device access.
| perf_event | 1 |
| freezer | 1 |
| hugetlb | 1 |
| pids | 1 |
All cgroup subsystem are joined so that statistics can be collected from
@@ -199,7 +200,7 @@ provide a good default for security and flexibility for the applications.
| CAP_SYS_BOOT | 0 |
| CAP_LEASE | 0 |
| CAP_WAKE_ALARM | 0 |
| CAP_BLOCK_SUSPE | 0 |
| CAP_BLOCK_SUSPEND | 0 |
Additional security layers like [apparmor](https://wiki.ubuntu.com/AppArmor)
@@ -296,7 +297,7 @@ a container.
| -------------- | ------------------------------------------------------------------ |
| Get processes | Return all the pids for processes running inside a container |
| Get Stats | Return resource statistics for the container as a whole |
| Wait | Wait waits on the container's init process ( pid 1 ) |
| Wait | Waits on the container's init process ( pid 1 ) |
| Wait Process | Wait on any of the container's processes returning the exit status |
| Destroy | Kill the container's init process and remove any filesystem state |
| Signal | Send a signal to the container's init process |

View File

@@ -7,6 +7,7 @@ package apparmor
// #include <stdlib.h>
import "C"
import (
"fmt"
"io/ioutil"
"os"
"unsafe"
@@ -32,7 +33,7 @@ func ApplyProfile(name string) error {
cName := C.CString(name)
defer C.free(unsafe.Pointer(cName))
if _, err := C.aa_change_onexec(cName); err != nil {
return err
return fmt.Errorf("apparmor failed to apply profile: %s", err)
}
return nil
}

View File

@@ -9,7 +9,7 @@ import (
)
type Manager interface {
// Apply cgroup configuration to the process with the specified pid
// Applies cgroup configuration to the process with the specified pid
Apply(pid int) error
// Returns the PIDs inside the cgroup set

View File

@@ -14,6 +14,7 @@ import (
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils"
)
var (
@@ -30,6 +31,7 @@ var (
&NetPrioGroup{},
&PerfEventGroup{},
&FreezerGroup{},
&NameGroup{GroupName: "name=systemd", Join: true},
}
CgroupProcesses = "cgroup.procs"
HugePageSizes, _ = cgroups.GetHugePageSize()
@@ -94,11 +96,10 @@ func getCgroupRoot() (string, error) {
}
type cgroupData struct {
root string
parent string
name string
config *configs.Cgroup
pid int
root string
innerPath string
config *configs.Cgroup
pid int
}
func (m *Manager) Apply(pid int) (err error) {
@@ -129,12 +130,9 @@ func (m *Manager) Apply(pid int) (err error) {
return cgroups.EnterPid(m.Paths, pid)
}
m.mu.Lock()
defer m.mu.Unlock()
paths := make(map[string]string)
defer func() {
if err != nil {
cgroups.RemovePaths(paths)
}
}()
for _, sys := range subsystems {
if err := sys.Apply(d); err != nil {
return err
@@ -144,7 +142,9 @@ func (m *Manager) Apply(pid int) (err error) {
// created then join consists of writing the process pids to cgroup.procs
p, err := d.path(sys.Name())
if err != nil {
if cgroups.IsNotFound(err) {
// The non-presence of the devices subsystem is
// considered fatal for security reasons.
if cgroups.IsNotFound(err) && sys.Name() != "devices" {
continue
}
return err
@@ -267,45 +267,31 @@ func getCgroupPath(c *configs.Cgroup) (string, error) {
return d.path("devices")
}
// pathClean makes a path safe for use with filepath.Join. This is done by not
// only cleaning the path, but also (if the path is relative) adding a leading
// '/' and cleaning it (then removing the leading '/'). This ensures that a
// path resulting from prepending another path will always resolve to lexically
// be a subdirectory of the prefixed path. This is all done lexically, so paths
// that include symlinks won't be safe as a result of using pathClean.
func pathClean(path string) string {
// Ensure that all paths are cleaned (especially problematic ones like
// "/../../../../../" which can cause lots of issues).
path = filepath.Clean(path)
// If the path isn't absolute, we need to do more processing to fix paths
// such as "../../../../<etc>/some/path". We also shouldn't convert absolute
// paths to relative ones.
if !filepath.IsAbs(path) {
path = filepath.Clean(string(os.PathSeparator) + path)
// This can't fail, as (by definition) all paths are relative to root.
path, _ = filepath.Rel(string(os.PathSeparator), path)
}
// Clean the path again for good measure.
return filepath.Clean(path)
}
func getCgroupData(c *configs.Cgroup, pid int) (*cgroupData, error) {
root, err := getCgroupRoot()
if err != nil {
return nil, err
}
// Clean the parent slice path.
c.Parent = pathClean(c.Parent)
if (c.Name != "" || c.Parent != "") && c.Path != "" {
return nil, fmt.Errorf("cgroup: either Path or Name and Parent should be used")
}
// XXX: Do not remove this code. Path safety is important! -- cyphar
cgPath := libcontainerUtils.CleanPath(c.Path)
cgParent := libcontainerUtils.CleanPath(c.Parent)
cgName := libcontainerUtils.CleanPath(c.Name)
innerPath := cgPath
if innerPath == "" {
innerPath = filepath.Join(cgParent, cgName)
}
return &cgroupData{
root: root,
parent: c.Parent,
name: c.Name,
config: c,
pid: pid,
root: root,
innerPath: innerPath,
config: c,
pid: pid,
}, nil
}
@@ -333,11 +319,10 @@ func (raw *cgroupData) path(subsystem string) (string, error) {
return "", err
}
cgPath := filepath.Join(raw.parent, raw.name)
// If the cgroup name/path is absolute do not look relative to the cgroup of the init process.
if filepath.IsAbs(cgPath) {
if filepath.IsAbs(raw.innerPath) {
// Sometimes subsystems can be mounted togethger as 'cpu,cpuacct'.
return filepath.Join(raw.root, filepath.Base(mnt), cgPath), nil
return filepath.Join(raw.root, filepath.Base(mnt), raw.innerPath), nil
}
parentPath, err := raw.parentPath(subsystem, mnt, root)
@@ -345,7 +330,7 @@ func (raw *cgroupData) path(subsystem string) (string, error) {
return "", err
}
return filepath.Join(parentPath, cgPath), nil
return filepath.Join(parentPath, raw.innerPath), nil
}
func (raw *cgroupData) join(subsystem string) (string, error) {
@@ -366,9 +351,12 @@ func writeFile(dir, file, data string) error {
// Normally dir should not be empty, one case is that cgroup subsystem
// is not mounted, we will get empty dir, and we want it fail here.
if dir == "" {
return fmt.Errorf("no such directory for %s.", file)
return fmt.Errorf("no such directory for %s", file)
}
return ioutil.WriteFile(filepath.Join(dir, file), []byte(data), 0700)
if err := ioutil.WriteFile(filepath.Join(dir, file), []byte(data), 0700); err != nil {
return fmt.Errorf("failed to write %v to %v: %v", data, file, err)
}
return nil
}
func readFile(dir, file string) (string, error) {

View File

@@ -12,6 +12,7 @@ import (
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils"
)
type CpusetGroup struct {
@@ -88,7 +89,7 @@ func (s *CpusetGroup) getSubsystemSettings(parent string) (cpus []byte, mems []b
// it's parent.
func (s *CpusetGroup) ensureParent(current, root string) error {
parent := filepath.Dir(current)
if filepath.Clean(parent) == root {
if libcontainerUtils.CleanPath(parent) == root {
return nil
}
// Avoid infinite recursion.

View File

@@ -5,6 +5,7 @@ package fs
import (
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/system"
)
type DevicesGroup struct {
@@ -25,6 +26,23 @@ func (s *DevicesGroup) Apply(d *cgroupData) error {
}
func (s *DevicesGroup) Set(path string, cgroup *configs.Cgroup) error {
if system.RunningInUserNS() {
return nil
}
devices := cgroup.Resources.Devices
if len(devices) > 0 {
for _, dev := range devices {
file := "devices.deny"
if dev.Allow {
file = "devices.allow"
}
if err := writeFile(path, file, dev.CgroupString()); err != nil {
return err
}
}
return nil
}
if !cgroup.Resources.AllowAllDevices {
if err := writeFile(path, "devices.deny", "a"); err != nil {
return err

View File

@@ -9,6 +9,7 @@ import (
"path/filepath"
"strconv"
"strings"
"time"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
@@ -26,38 +27,75 @@ func (s *MemoryGroup) Apply(d *cgroupData) (err error) {
if err != nil && !cgroups.IsNotFound(err) {
return err
}
if memoryAssigned(d.config) {
if path != "" {
if err := os.MkdirAll(path, 0755); err != nil {
return err
}
}
// We have to set kernel memory here, as we can't change it once
// processes have been attached.
if err := s.SetKernelMemory(path, d.config); err != nil {
return err
}
// reset error.
err = nil
if path == "" {
// Invalid input.
return fmt.Errorf("invalid path for memory cgroups: %+v", d)
}
defer func() {
if err != nil {
os.RemoveAll(path)
}
}()
if !cgroups.PathExists(path) {
if err = os.MkdirAll(path, 0755); err != nil {
return err
}
}
if memoryAssigned(d.config) {
// We have to set kernel memory here, as we can't change it once
// processes have been attached to the cgroup.
if err = s.SetKernelMemory(path, d.config); err != nil {
return err
}
}
// We need to join memory cgroup after set memory limits, because
// kmem.limit_in_bytes can only be set when the cgroup is empty.
_, err = d.join("memory")
if err != nil && !cgroups.IsNotFound(err) {
if _, jerr := d.join("memory"); jerr != nil && !cgroups.IsNotFound(jerr) {
err = jerr
return err
}
return nil
}
func getModifyTime(path string) (time.Time, error) {
stat, err := os.Stat(path)
if err != nil {
return time.Time{}, fmt.Errorf("failed to get memory cgroups creation time: %v", err)
}
return stat.ModTime(), nil
}
func (s *MemoryGroup) SetKernelMemory(path string, cgroup *configs.Cgroup) error {
// This has to be done separately because it has special constraints (it
// can't be done after there are processes attached to the cgroup).
if cgroup.Resources.KernelMemory > 0 {
// This has to be done separately because it has special
// constraints (it can only be initialized before setting up a
// hierarchy or adding a task to the cgroups. However, if
// sucessfully initialized, it can be updated anytime afterwards)
if cgroup.Resources.KernelMemory != 0 {
// Is kmem.limit_in_bytes already set?
// memory.kmem.max_usage_in_bytes is a read-only file. Use it to get cgroups creation time.
kmemCreationTime, err := getModifyTime(filepath.Join(path, "memory.kmem.max_usage_in_bytes"))
if err != nil {
return err
}
kmemLimitsUpdateTime, err := getModifyTime(filepath.Join(path, "memory.kmem.limit_in_bytes"))
if err != nil {
return err
}
// kmem.limit_in_bytes has already been set if its update time is after that of creation time.
// We use `!=` op instead of `>` because updates are losing precision compared to creation.
kmemInitialized := !kmemLimitsUpdateTime.Equal(kmemCreationTime)
if !kmemInitialized {
// If there's already tasks in the cgroup, we can't change the limit either
tasks, err := getCgroupParamString(path, "tasks")
if err != nil {
return err
}
if tasks != "" {
return fmt.Errorf("cannot set kmem.limit_in_bytes after task have joined this cgroup")
}
}
if err := writeFile(path, "memory.kmem.limit_in_bytes", strconv.FormatInt(cgroup.Resources.KernelMemory, 10)); err != nil {
return err
}
@@ -65,19 +103,65 @@ func (s *MemoryGroup) SetKernelMemory(path string, cgroup *configs.Cgroup) error
return nil
}
func (s *MemoryGroup) Set(path string, cgroup *configs.Cgroup) error {
if cgroup.Resources.Memory != 0 {
if err := writeFile(path, "memory.limit_in_bytes", strconv.FormatInt(cgroup.Resources.Memory, 10)); err != nil {
func setMemoryAndSwap(path string, cgroup *configs.Cgroup) error {
// When memory and swap memory are both set, we need to handle the cases
// for updating container.
if cgroup.Resources.Memory != 0 && cgroup.Resources.MemorySwap > 0 {
memoryUsage, err := getMemoryData(path, "")
if err != nil {
return err
}
// When update memory limit, we should adapt the write sequence
// for memory and swap memory, so it won't fail because the new
// value and the old value don't fit kernel's validation.
if memoryUsage.Limit < uint64(cgroup.Resources.MemorySwap) {
if err := writeFile(path, "memory.memsw.limit_in_bytes", strconv.FormatInt(cgroup.Resources.MemorySwap, 10)); err != nil {
return err
}
if err := writeFile(path, "memory.limit_in_bytes", strconv.FormatInt(cgroup.Resources.Memory, 10)); err != nil {
return err
}
} else {
if err := writeFile(path, "memory.limit_in_bytes", strconv.FormatInt(cgroup.Resources.Memory, 10)); err != nil {
return err
}
if err := writeFile(path, "memory.memsw.limit_in_bytes", strconv.FormatInt(cgroup.Resources.MemorySwap, 10)); err != nil {
return err
}
}
} else {
if cgroup.Resources.Memory != 0 {
if err := writeFile(path, "memory.limit_in_bytes", strconv.FormatInt(cgroup.Resources.Memory, 10)); err != nil {
return err
}
}
if cgroup.Resources.MemorySwap > 0 {
if err := writeFile(path, "memory.memsw.limit_in_bytes", strconv.FormatInt(cgroup.Resources.MemorySwap, 10)); err != nil {
return err
}
}
}
return nil
}
func (s *MemoryGroup) Set(path string, cgroup *configs.Cgroup) error {
if err := setMemoryAndSwap(path, cgroup); err != nil {
return err
}
if err := s.SetKernelMemory(path, cgroup); err != nil {
return err
}
if cgroup.Resources.MemoryReservation != 0 {
if err := writeFile(path, "memory.soft_limit_in_bytes", strconv.FormatInt(cgroup.Resources.MemoryReservation, 10)); err != nil {
return err
}
}
if cgroup.Resources.MemorySwap > 0 {
if err := writeFile(path, "memory.memsw.limit_in_bytes", strconv.FormatInt(cgroup.Resources.MemorySwap, 10)); err != nil {
if cgroup.Resources.KernelMemoryTCP != 0 {
if err := writeFile(path, "memory.kmem.tcp.limit_in_bytes", strconv.FormatInt(cgroup.Resources.KernelMemoryTCP, 10)); err != nil {
return err
}
}
@@ -86,14 +170,14 @@ func (s *MemoryGroup) Set(path string, cgroup *configs.Cgroup) error {
return err
}
}
if cgroup.Resources.MemorySwappiness >= 0 && cgroup.Resources.MemorySwappiness <= 100 {
if err := writeFile(path, "memory.swappiness", strconv.FormatInt(cgroup.Resources.MemorySwappiness, 10)); err != nil {
if cgroup.Resources.MemorySwappiness == nil || int64(*cgroup.Resources.MemorySwappiness) == -1 {
return nil
} else if int64(*cgroup.Resources.MemorySwappiness) >= 0 && int64(*cgroup.Resources.MemorySwappiness) <= 100 {
if err := writeFile(path, "memory.swappiness", strconv.FormatInt(*cgroup.Resources.MemorySwappiness, 10)); err != nil {
return err
}
} else if cgroup.Resources.MemorySwappiness == -1 {
return nil
} else {
return fmt.Errorf("invalid value:%d. valid memory swappiness range is 0-100", cgroup.Resources.MemorySwappiness)
return fmt.Errorf("invalid value:%d. valid memory swappiness range is 0-100", int64(*cgroup.Resources.MemorySwappiness))
}
return nil
@@ -139,6 +223,11 @@ func (s *MemoryGroup) GetStats(path string, stats *cgroups.Stats) error {
return err
}
stats.MemoryStats.KernelUsage = kernelUsage
kernelTCPUsage, err := getMemoryData(path, "kmem.tcp")
if err != nil {
return err
}
stats.MemoryStats.KernelTCPUsage = kernelTCPUsage
return nil
}
@@ -148,8 +237,9 @@ func memoryAssigned(cgroup *configs.Cgroup) bool {
cgroup.Resources.MemoryReservation != 0 ||
cgroup.Resources.MemorySwap > 0 ||
cgroup.Resources.KernelMemory > 0 ||
cgroup.Resources.KernelMemoryTCP > 0 ||
cgroup.Resources.OomKillDisable ||
cgroup.Resources.MemorySwappiness != -1
(cgroup.Resources.MemorySwappiness != nil && *cgroup.Resources.MemorySwappiness != -1)
}
func getMemoryData(path, name string) (cgroups.MemoryData, error) {
@@ -162,6 +252,7 @@ func getMemoryData(path, name string) (cgroups.MemoryData, error) {
usage := strings.Join([]string{moduleName, "usage_in_bytes"}, ".")
maxUsage := strings.Join([]string{moduleName, "max_usage_in_bytes"}, ".")
failcnt := strings.Join([]string{moduleName, "failcnt"}, ".")
limit := strings.Join([]string{moduleName, "limit_in_bytes"}, ".")
value, err := getCgroupParamUint(path, usage)
if err != nil {
@@ -187,6 +278,14 @@ func getMemoryData(path, name string) (cgroups.MemoryData, error) {
return cgroups.MemoryData{}, fmt.Errorf("failed to parse %s - %v", failcnt, err)
}
memoryData.Failcnt = value
value, err = getCgroupParamUint(path, limit)
if err != nil {
if moduleName != "memory" && os.IsNotExist(err) {
return cgroups.MemoryData{}, nil
}
return cgroups.MemoryData{}, fmt.Errorf("failed to parse %s - %v", limit, err)
}
memoryData.Limit = value
return memoryData, nil
}

View File

@@ -9,6 +9,7 @@ import (
type NameGroup struct {
GroupName string
Join bool
}
func (s *NameGroup) Name() string {
@@ -16,6 +17,10 @@ func (s *NameGroup) Name() string {
}
func (s *NameGroup) Apply(d *cgroupData) error {
if s.Join {
// ignore errors if the named cgroup does not exist
d.join(s.GroupName)
}
return nil
}
@@ -24,6 +29,9 @@ func (s *NameGroup) Set(path string, cgroup *configs.Cgroup) error {
}
func (s *NameGroup) Remove(d *cgroupData) error {
if s.Join {
removePath(d.path(s.GroupName))
}
return nil
}

View File

@@ -4,6 +4,7 @@ package fs
import (
"fmt"
"path/filepath"
"strconv"
"github.com/opencontainers/runc/libcontainer/cgroups"
@@ -47,11 +48,26 @@ func (s *PidsGroup) Remove(d *cgroupData) error {
}
func (s *PidsGroup) GetStats(path string, stats *cgroups.Stats) error {
value, err := getCgroupParamUint(path, "pids.current")
current, err := getCgroupParamUint(path, "pids.current")
if err != nil {
return fmt.Errorf("failed to parse pids.current - %s", err)
}
stats.PidsStats.Current = value
maxString, err := getCgroupParamString(path, "pids.max")
if err != nil {
return fmt.Errorf("failed to parse pids.max - %s", err)
}
// Default if pids.max == "max" is 0 -- which represents "no limit".
var max uint64
if maxString != "max" {
max, err = parseUint(maxString, 10, 64)
if err != nil {
return fmt.Errorf("failed to parse pids.max - unable to parse %q as a uint from Cgroup file %q", maxString, filepath.Join(path, "pids.max"))
}
}
stats.PidsStats.Current = current
stats.PidsStats.Limit = max
return nil
}

View File

@@ -12,7 +12,6 @@ import (
)
var (
ErrNotSupportStat = errors.New("stats are not supported for subsystem")
ErrNotValidFormat = errors.New("line is not a valid key value format")
)

View File

@@ -11,6 +11,7 @@ type ThrottlingData struct {
ThrottledTime uint64 `json:"throttled_time,omitempty"`
}
// CpuUsage denotes the usage of a CPU.
// All CPU stats are aggregate since container inception.
type CpuUsage struct {
// Total CPU time consumed.
@@ -36,7 +37,9 @@ type MemoryData struct {
Usage uint64 `json:"usage,omitempty"`
MaxUsage uint64 `json:"max_usage,omitempty"`
Failcnt uint64 `json:"failcnt"`
Limit uint64 `json:"limit"`
}
type MemoryStats struct {
// memory used for cache
Cache uint64 `json:"cache,omitempty"`
@@ -44,14 +47,19 @@ type MemoryStats struct {
Usage MemoryData `json:"usage,omitempty"`
// usage of memory + swap
SwapUsage MemoryData `json:"swap_usage,omitempty"`
// usafe of kernel memory
KernelUsage MemoryData `json:"kernel_usage,omitempty"`
Stats map[string]uint64 `json:"stats,omitempty"`
// usage of kernel memory
KernelUsage MemoryData `json:"kernel_usage,omitempty"`
// usage of kernel TCP memory
KernelTCPUsage MemoryData `json:"kernel_tcp_usage,omitempty"`
Stats map[string]uint64 `json:"stats,omitempty"`
}
type PidsStats struct {
// number of pids in the cgroup
Current uint64 `json:"current,omitempty"`
// active pids hard limit
Limit uint64 `json:"limit,omitempty"`
}
type BlkioStatEntry struct {
@@ -78,7 +86,7 @@ type HugetlbStats struct {
Usage uint64 `json:"usage,omitempty"`
// maximum usage ever recorded.
MaxUsage uint64 `json:"max_usage,omitempty"`
// number of times htgetlb usage allocation failure.
// number of times hugetlb usage allocation failure.
Failcnt uint64 `json:"failcnt"`
}

View File

@@ -74,6 +74,7 @@ var (
theConn *systemdDbus.Conn
hasStartTransientUnit bool
hasTransientDefaultDependencies bool
hasDelegate bool
)
func newProp(name string, units interface{}) systemdDbus.Property {
@@ -146,20 +147,24 @@ func UseSystemd() bool {
// Not critical because of the stop unit logic above.
theConn.StopUnit(scope, "replace", nil)
// Assume StartTransientUnit on a scope allows Delegate
hasDelegate = true
dl := newProp("Delegate", true)
if _, err := theConn.StartTransientUnit(scope, "replace", []systemdDbus.Property{dl}, nil); err != nil {
if dbusError, ok := err.(dbus.Error); ok {
if strings.Contains(dbusError.Name, "org.freedesktop.DBus.Error.PropertyReadOnly") {
hasDelegate = false
}
}
}
// Not critical because of the stop unit logic above.
theConn.StopUnit(scope, "replace", nil)
}
return hasStartTransientUnit
}
func getIfaceForUnit(unitName string) string {
if strings.HasSuffix(unitName, ".scope") {
return "Scope"
}
if strings.HasSuffix(unitName, ".service") {
return "Service"
}
return "Unit"
}
func (m *Manager) Apply(pid int) error {
var (
c = m.Cgroups
@@ -195,6 +200,11 @@ func (m *Manager) Apply(pid int) error {
newProp("PIDs", []uint32{uint32(pid)}),
)
if hasDelegate {
// This is only supported on systemd versions 218 and above.
properties = append(properties, newProp("Delegate", true))
}
// Always enable accounting, this gets us the same behaviour as the fs implementation,
// plus the kernel has some problems with joining the memory cgroup at a later time.
properties = append(properties,
@@ -222,11 +232,9 @@ func (m *Manager) Apply(pid int) error {
newProp("BlockIOWeight", uint64(c.Resources.BlkioWeight)))
}
// We need to set kernel memory before processes join cgroup because
// kmem.limit_in_bytes can only be set when the cgroup is empty.
// And swap memory limit needs to be set after memory limit, only
// memory limit is handled by systemd, so it's kind of ugly here.
if c.Resources.KernelMemory > 0 {
// We have to set kernel memory here, as we can't change it once
// processes have been attached to the cgroup.
if c.Resources.KernelMemory != 0 {
if err := setKernelMemory(c); err != nil {
return err
}
@@ -236,53 +244,7 @@ func (m *Manager) Apply(pid int) error {
return err
}
if err := joinDevices(c, pid); err != nil {
return err
}
// TODO: CpuQuota and CpuPeriod not available in systemd
// we need to manually join the cpu.cfs_quota_us and cpu.cfs_period_us
if err := joinCpu(c, pid); err != nil {
return err
}
// TODO: MemoryReservation and MemorySwap not available in systemd
if err := joinMemory(c, pid); err != nil {
return err
}
// we need to manually join the freezer, net_cls, net_prio, pids and cpuset cgroup in systemd
// because it does not currently support it via the dbus api.
if err := joinFreezer(c, pid); err != nil {
return err
}
if err := joinNetPrio(c, pid); err != nil {
return err
}
if err := joinNetCls(c, pid); err != nil {
return err
}
if err := joinPids(c, pid); err != nil {
return err
}
if err := joinCpuset(c, pid); err != nil {
return err
}
if err := joinHugetlb(c, pid); err != nil {
return err
}
if err := joinPerfEvent(c, pid); err != nil {
return err
}
// FIXME: Systemd does have `BlockIODeviceWeight` property, but we got problem
// using that (at least on systemd 208, see https://github.com/opencontainers/runc/libcontainer/pull/354),
// so use fs work around for now.
if err := joinBlkio(c, pid); err != nil {
if err := joinCgroups(c, pid); err != nil {
return err
}
@@ -327,7 +289,7 @@ func writeFile(dir, file, data string) error {
// Normally dir should not be empty, one case is that cgroup subsystem
// is not mounted, we will get empty dir, and we want it fail here.
if dir == "" {
return fmt.Errorf("no such directory for %s.", file)
return fmt.Errorf("no such directory for %s", file)
}
return ioutil.WriteFile(filepath.Join(dir, file), []byte(data), 0700)
}
@@ -347,43 +309,41 @@ func join(c *configs.Cgroup, subsystem string, pid int) (string, error) {
return path, nil
}
func joinCpu(c *configs.Cgroup, pid int) error {
_, err := join(c, "cpu", pid)
if err != nil && !cgroups.IsNotFound(err) {
return err
func joinCgroups(c *configs.Cgroup, pid int) error {
for _, sys := range subsystems {
name := sys.Name()
switch name {
case "name=systemd":
// let systemd handle this
break
case "cpuset":
path, err := getSubsystemPath(c, name)
if err != nil && !cgroups.IsNotFound(err) {
return err
}
s := &fs.CpusetGroup{}
if err := s.ApplyDir(path, c, pid); err != nil {
return err
}
break
default:
_, err := join(c, name, pid)
if err != nil {
// Even if it's `not found` error, we'll return err
// because devices cgroup is hard requirement for
// container security.
if name == "devices" {
return err
}
// For other subsystems, omit the `not found` error
// because they are optional.
if !cgroups.IsNotFound(err) {
return err
}
}
}
}
return nil
}
func joinFreezer(c *configs.Cgroup, pid int) error {
_, err := join(c, "freezer", pid)
if err != nil && !cgroups.IsNotFound(err) {
return err
}
return nil
}
func joinNetPrio(c *configs.Cgroup, pid int) error {
_, err := join(c, "net_prio", pid)
if err != nil && !cgroups.IsNotFound(err) {
return err
}
return nil
}
func joinNetCls(c *configs.Cgroup, pid int) error {
_, err := join(c, "net_cls", pid)
if err != nil && !cgroups.IsNotFound(err) {
return err
}
return nil
}
func joinPids(c *configs.Cgroup, pid int) error {
_, err := join(c, "pids", pid)
if err != nil && !cgroups.IsNotFound(err) {
return err
}
return nil
}
@@ -392,9 +352,18 @@ func joinPids(c *configs.Cgroup, pid int) error {
// test.slice/test-a.slice/test-a-b.slice.
func expandSlice(slice string) (string, error) {
suffix := ".slice"
sliceName := strings.TrimSuffix(slice, suffix)
// Name has to end with ".slice", but can't be just ".slice".
if len(slice) < len(suffix) || !strings.HasSuffix(slice, suffix) {
return "", fmt.Errorf("invalid slice name: %s", slice)
}
// Path-separators are not allowed.
if strings.Contains(slice, "/") {
return "", fmt.Errorf("invalid slice name: %s", slice)
}
var path, prefix string
sliceName := strings.TrimSuffix(slice, suffix)
for _, component := range strings.Split(sliceName, "-") {
// test--a.slice isn't permitted, nor is -test.slice.
if component == "" {
@@ -510,87 +479,11 @@ func getUnitName(c *configs.Cgroup) string {
return fmt.Sprintf("%s-%s.scope", c.ScopePrefix, c.Name)
}
// Atm we can't use the systemd device support because of two missing things:
// * Support for wildcards to allow mknod on any device
// * Support for wildcards to allow /dev/pts support
//
// The second is available in more recent systemd as "char-pts", but not in e.g. v208 which is
// in wide use. When both these are available we will be able to switch, but need to keep the old
// implementation for backwards compat.
//
// Note: we can't use systemd to set up the initial limits, and then change the cgroup
// because systemd will re-write the device settings if it needs to re-apply the cgroup context.
// This happens at least for v208 when any sibling unit is started.
func joinDevices(c *configs.Cgroup, pid int) error {
_, err := join(c, "devices", pid)
// Even if it's `not found` error, we'll return err because devices cgroup
// is hard requirement for container security.
if err != nil {
return err
}
return nil
}
func setKernelMemory(c *configs.Cgroup) error {
path, err := getSubsystemPath(c, "memory")
if err != nil && !cgroups.IsNotFound(err) {
return err
}
if err := os.MkdirAll(path, 0755); err != nil {
return err
}
// This doesn't get called by manager.Set, so we need to do it here.
s := &fs.MemoryGroup{}
return s.SetKernelMemory(path, c)
}
func joinMemory(c *configs.Cgroup, pid int) error {
_, err := join(c, "memory", pid)
if err != nil && !cgroups.IsNotFound(err) {
return err
}
return nil
}
// systemd does not atm set up the cpuset controller, so we must manually
// join it. Additionally that is a very finicky controller where each
// level must have a full setup as the default for a new directory is "no cpus"
func joinCpuset(c *configs.Cgroup, pid int) error {
path, err := getSubsystemPath(c, "cpuset")
if err != nil && !cgroups.IsNotFound(err) {
return err
}
s := &fs.CpusetGroup{}
return s.ApplyDir(path, c, pid)
}
// `BlockIODeviceWeight` property of systemd does not work properly, and systemd
// expects device path instead of major minor numbers, which is also confusing
// for users. So we use fs work around for now.
func joinBlkio(c *configs.Cgroup, pid int) error {
_, err := join(c, "blkio", pid)
if err != nil {
return err
}
return nil
}
func joinHugetlb(c *configs.Cgroup, pid int) error {
_, err := join(c, "hugetlb", pid)
if err != nil && !cgroups.IsNotFound(err) {
return err
}
return nil
}
func joinPerfEvent(c *configs.Cgroup, pid int) error {
_, err := join(c, "perf_event", pid)
if err != nil && !cgroups.IsNotFound(err) {
return err
}
return nil
return os.MkdirAll(path, 0755)
}

View File

@@ -5,6 +5,7 @@ package cgroups
import (
"bufio"
"fmt"
"io"
"io/ioutil"
"os"
"path/filepath"
@@ -12,17 +13,19 @@ import (
"strings"
"time"
"github.com/docker/docker/pkg/mount"
"github.com/docker/go-units"
)
const cgroupNamePrefix = "name="
// https://www.kernel.org/doc/Documentation/cgroups/cgroups.txt
// https://www.kernel.org/doc/Documentation/cgroup-v1/cgroups.txt
func FindCgroupMountpoint(subsystem string) (string, error) {
// We are not using mount.GetMounts() because it's super-inefficient,
// parsing it directly sped up x10 times because of not using Sscanf.
// It was one of two major performance drawbacks in container start.
if !isSubsystemAvailable(subsystem) {
return "", NewNotFoundError(subsystem)
}
f, err := os.Open("/proc/self/mountinfo")
if err != nil {
return "", err
@@ -47,6 +50,9 @@ func FindCgroupMountpoint(subsystem string) (string, error) {
}
func FindCgroupMountpointAndRoot(subsystem string) (string, string, error) {
if !isSubsystemAvailable(subsystem) {
return "", "", NewNotFoundError(subsystem)
}
f, err := os.Open("/proc/self/mountinfo")
if err != nil {
return "", "", err
@@ -70,6 +76,15 @@ func FindCgroupMountpointAndRoot(subsystem string) (string, string, error) {
return "", "", NewNotFoundError(subsystem)
}
func isSubsystemAvailable(subsystem string) bool {
cgroups, err := ParseCgroupFile("/proc/self/cgroup")
if err != nil {
return false
}
_, avail := cgroups[subsystem]
return avail
}
func FindCgroupMountpointDir() (string, error) {
f, err := os.Open("/proc/self/mountinfo")
if err != nil {
@@ -121,42 +136,63 @@ func (m Mount) GetThisCgroupDir(cgroups map[string]string) (string, error) {
return getControllerPath(m.Subsystems[0], cgroups)
}
func getCgroupMountsHelper(ss map[string]bool, mi io.Reader) ([]Mount, error) {
res := make([]Mount, 0, len(ss))
scanner := bufio.NewScanner(mi)
numFound := 0
for scanner.Scan() && numFound < len(ss) {
txt := scanner.Text()
sepIdx := strings.Index(txt, " - ")
if sepIdx == -1 {
return nil, fmt.Errorf("invalid mountinfo format")
}
if txt[sepIdx+3:sepIdx+9] != "cgroup" {
continue
}
fields := strings.Split(txt, " ")
m := Mount{
Mountpoint: fields[4],
Root: fields[3],
}
for _, opt := range strings.Split(fields[len(fields)-1], ",") {
if !ss[opt] {
continue
}
if strings.HasPrefix(opt, cgroupNamePrefix) {
m.Subsystems = append(m.Subsystems, opt[len(cgroupNamePrefix):])
} else {
m.Subsystems = append(m.Subsystems, opt)
}
numFound++
}
res = append(res, m)
}
if err := scanner.Err(); err != nil {
return nil, err
}
return res, nil
}
func GetCgroupMounts() ([]Mount, error) {
mounts, err := mount.GetMounts()
f, err := os.Open("/proc/self/mountinfo")
if err != nil {
return nil, err
}
defer f.Close()
all, err := GetAllSubsystems()
all, err := ParseCgroupFile("/proc/self/cgroup")
if err != nil {
return nil, err
}
allMap := make(map[string]bool)
for _, s := range all {
for s := range all {
allMap[s] = true
}
res := []Mount{}
for _, mount := range mounts {
if mount.Fstype == "cgroup" {
m := Mount{Mountpoint: mount.Mountpoint, Root: mount.Root}
for _, opt := range strings.Split(mount.VfsOpts, ",") {
if strings.HasPrefix(opt, cgroupNamePrefix) {
m.Subsystems = append(m.Subsystems, opt[len(cgroupNamePrefix):])
}
if allMap[opt] {
m.Subsystems = append(m.Subsystems, opt)
}
}
res = append(res, m)
}
}
return res, nil
return getCgroupMountsHelper(allMap, f)
}
// Returns all the cgroup subsystems supported by the kernel
// GetAllSubsystems returns all the cgroup subsystems supported by the kernel
func GetAllSubsystems() ([]string, error) {
f, err := os.Open("/proc/cgroups")
if err != nil {
@@ -182,7 +218,7 @@ func GetAllSubsystems() ([]string, error) {
return subsystems, nil
}
// Returns the relative path to the cgroup docker is running in.
// GetThisCgroupDir returns the relative path to the cgroup docker is running in.
func GetThisCgroupDir(subsystem string) (string, error) {
cgroups, err := ParseCgroupFile("/proc/self/cgroup")
if err != nil {
@@ -226,6 +262,8 @@ func readProcsFile(dir string) ([]int, error) {
return out, nil
}
// ParseCgroupFile parses the given cgroup file, typically from
// /proc/<pid>/cgroup, into a map of subgroups to cgroup names.
func ParseCgroupFile(path string) (map[string]string, error) {
f, err := os.Open(path)
if err != nil {
@@ -233,7 +271,12 @@ func ParseCgroupFile(path string) (map[string]string, error) {
}
defer f.Close()
s := bufio.NewScanner(f)
return parseCgroupFromReader(f)
}
// helper function for ParseCgroupFile to make testing easier
func parseCgroupFromReader(r io.Reader) (map[string]string, error) {
s := bufio.NewScanner(r)
cgroups := make(map[string]string)
for s.Scan() {
@@ -242,7 +285,16 @@ func ParseCgroupFile(path string) (map[string]string, error) {
}
text := s.Text()
parts := strings.Split(text, ":")
// from cgroups(7):
// /proc/[pid]/cgroup
// ...
// For each cgroup hierarchy ... there is one entry
// containing three colon-separated fields of the form:
// hierarchy-ID:subsystem-list:cgroup-path
parts := strings.SplitN(text, ":", 3)
if len(parts) < 3 {
return nil, fmt.Errorf("invalid cgroup entry: must contain at least two colons: %v", text)
}
for _, subs := range strings.Split(parts[1], ",") {
cgroups[subs] = parts[2]
@@ -309,7 +361,7 @@ func RemovePaths(paths map[string]string) (err error) {
return nil
}
}
return fmt.Errorf("Failed to remove paths: %s", paths)
return fmt.Errorf("Failed to remove paths: %v", paths)
}
func GetHugePageSize() ([]string, error) {

View File

@@ -11,15 +11,22 @@ const (
)
type Cgroup struct {
Name string `json:"name"`
// Deprecated, use Path instead
Name string `json:"name,omitempty"`
// name of parent cgroup or slice
Parent string `json:"parent"`
// name of parent of cgroup or slice
// Deprecated, use Path instead
Parent string `json:"parent,omitempty"`
// Path specifies the path to cgroups that are created and/or joined by the container.
// The path is assumed to be relative to the host system cgroup mountpoint.
Path string `json:"path"`
// ScopePrefix decribes prefix for the scope name
ScopePrefix string `json:"scope_prefix"`
// Paths represent the cgroups paths to join
// Paths represent the absolute cgroups paths to join.
// This takes precedence over Path.
Paths map[string]string
// Resources contains various cgroups settings to apply
@@ -28,11 +35,14 @@ type Cgroup struct {
type Resources struct {
// If this is true allow access to any kind of device within the container. If false, allow access only to devices explicitly listed in the allowed_devices list.
AllowAllDevices bool `json:"allow_all_devices"`
// Deprecated
AllowAllDevices bool `json:"allow_all_devices,omitempty"`
// Deprecated
AllowedDevices []*Device `json:"allowed_devices,omitempty"`
// Deprecated
DeniedDevices []*Device `json:"denied_devices,omitempty"`
AllowedDevices []*Device `json:"allowed_devices"`
DeniedDevices []*Device `json:"denied_devices"`
Devices []*Device `json:"devices"`
// Memory limit (in bytes)
Memory int64 `json:"memory"`
@@ -46,6 +56,9 @@ type Resources struct {
// Kernel memory limit (in bytes)
KernelMemory int64 `json:"kernel_memory"`
// Kernel memory limit for TCP use (in bytes)
KernelMemoryTCP int64 `json:"kernel_memory_tcp"`
// CPU shares (relative weight vs. other containers)
CpuShares int64 `json:"cpu_shares"`
@@ -101,7 +114,7 @@ type Resources struct {
OomKillDisable bool `json:"oom_kill_disable"`
// Tuning swappiness behaviour per cgroup
MemorySwappiness int64 `json:"memory_swappiness"`
MemorySwappiness *int64 `json:"memory_swappiness"`
// Set priority of network traffic for container
NetPrioIfpriomap []*IfPrioMap `json:"net_prio_ifpriomap"`

View File

@@ -3,7 +3,11 @@ package configs
import (
"bytes"
"encoding/json"
"fmt"
"os/exec"
"time"
"github.com/Sirupsen/logrus"
)
type Rlimit struct {
@@ -29,7 +33,7 @@ type Seccomp struct {
Syscalls []*Syscall `json:"syscalls"`
}
// An action to be taken upon rule match in Seccomp
// Action is taken upon rule match in Seccomp
type Action int
const (
@@ -40,7 +44,7 @@ const (
Trace
)
// A comparison operator to be used when matching syscall arguments in Seccomp
// Operator is a comparison operator to be used when matching syscall arguments in Seccomp
type Operator int
const (
@@ -53,7 +57,7 @@ const (
MaskEqualTo
)
// A rule to match a specific syscall argument in Seccomp
// Arg is a rule to match a specific syscall argument in Seccomp
type Arg struct {
Index uint `json:"index"`
Value uint64 `json:"value"`
@@ -61,7 +65,7 @@ type Arg struct {
Op Operator `json:"op"`
}
// An rule to match a syscall in Seccomp
// Syscall is a rule to match a syscall in Seccomp
type Syscall struct {
Name string `json:"name"`
Action Action `json:"action"`
@@ -128,15 +132,15 @@ type Config struct {
// AppArmorProfile specifies the profile to apply to the process running in the container and is
// change at the time the process is execed
AppArmorProfile string `json:"apparmor_profile"`
AppArmorProfile string `json:"apparmor_profile,omitempty"`
// ProcessLabel specifies the label to apply to the process running in the container. It is
// commonly used by selinux
ProcessLabel string `json:"process_label"`
ProcessLabel string `json:"process_label,omitempty"`
// Rlimits specifies the resource limits, such as max open files, to set in the container
// If Rlimits are not set, the container will inherit rlimits from the parent process
Rlimits []Rlimit `json:"rlimits"`
Rlimits []Rlimit `json:"rlimits,omitempty"`
// OomScoreAdj specifies the adjustment to be made by the kernel when calculating oom scores
// for a process. Valid values are between the range [-1000, '1000'], where processes with
@@ -171,12 +175,22 @@ type Config struct {
// A default action to be taken if no rules match is also given.
Seccomp *Seccomp `json:"seccomp"`
// NoNewPrivileges controls whether processes in the container can gain additional privileges.
NoNewPrivileges bool `json:"no_new_privileges,omitempty"`
// Hooks are a collection of actions to perform at various container lifecycle events.
// Hooks are not able to be marshaled to json but they are also not needed to.
Hooks *Hooks `json:"-"`
// CommandHooks are serialized to JSON, but other hooks are not.
Hooks *Hooks
// Version is the version of opencontainer specification that is supported.
Version string `json:"version"`
// Labels are user defined metadata that is stored in the config and populated on the state
Labels []string `json:"labels"`
// NoNewKeyring will not allocated a new session keyring for the container. It will use the
// callers keyring in this case.
NoNewKeyring bool `json:"no_new_keyring"`
}
type Hooks struct {
@@ -191,12 +205,59 @@ type Hooks struct {
Poststop []Hook
}
func (hooks *Hooks) UnmarshalJSON(b []byte) error {
var state struct {
Prestart []CommandHook
Poststart []CommandHook
Poststop []CommandHook
}
if err := json.Unmarshal(b, &state); err != nil {
return err
}
deserialize := func(shooks []CommandHook) (hooks []Hook) {
for _, shook := range shooks {
hooks = append(hooks, shook)
}
return hooks
}
hooks.Prestart = deserialize(state.Prestart)
hooks.Poststart = deserialize(state.Poststart)
hooks.Poststop = deserialize(state.Poststop)
return nil
}
func (hooks Hooks) MarshalJSON() ([]byte, error) {
serialize := func(hooks []Hook) (serializableHooks []CommandHook) {
for _, hook := range hooks {
switch chook := hook.(type) {
case CommandHook:
serializableHooks = append(serializableHooks, chook)
default:
logrus.Warnf("cannot serialize hook of type %T, skipping", hook)
}
}
return serializableHooks
}
return json.Marshal(map[string]interface{}{
"prestart": serialize(hooks.Prestart),
"poststart": serialize(hooks.Poststart),
"poststop": serialize(hooks.Poststop),
})
}
// HookState is the payload provided to a hook on execution.
type HookState struct {
Version string `json:"version"`
ID string `json:"id"`
Pid int `json:"pid"`
Root string `json:"root"`
Version string `json:"ociVersion"`
ID string `json:"id"`
Pid int `json:"pid"`
Root string `json:"root"`
BundlePath string `json:"bundlePath"`
}
type Hook interface {
@@ -204,7 +265,7 @@ type Hook interface {
Run(HookState) error
}
// NewFunctionHooks will call the provided function when the hook is run.
// NewFunctionHook will call the provided function when the hook is run.
func NewFunctionHook(f func(HookState) error) FuncHook {
return FuncHook{
run: f,
@@ -220,13 +281,14 @@ func (f FuncHook) Run(s HookState) error {
}
type Command struct {
Path string `json:"path"`
Args []string `json:"args"`
Env []string `json:"env"`
Dir string `json:"dir"`
Path string `json:"path"`
Args []string `json:"args"`
Env []string `json:"env"`
Dir string `json:"dir"`
Timeout *time.Duration `json:"timeout"`
}
// NewCommandHooks will execute the provided command when the hook is run.
// NewCommandHook will execute the provided command when the hook is run.
func NewCommandHook(cmd Command) CommandHook {
return CommandHook{
Command: cmd,
@@ -248,5 +310,23 @@ func (c Command) Run(s HookState) error {
Env: c.Env,
Stdin: bytes.NewReader(b),
}
return cmd.Run()
errC := make(chan error, 1)
go func() {
out, err := cmd.CombinedOutput()
if err != nil {
err = fmt.Errorf("%s: %s", err, out)
}
errC <- err
}()
if c.Timeout != nil {
select {
case err := <-errC:
return err
case <-time.After(*c.Timeout):
cmd.Process.Kill()
cmd.Wait()
return fmt.Errorf("hook ran past specified timeout of %.1fs", c.Timeout.Seconds())
}
}
return <-errC
}

View File

@@ -4,7 +4,7 @@ package configs
import "fmt"
// Gets the root uid for the process on host which could be non-zero
// HostUID gets the root uid for the process on host which could be non-zero
// when user namespaces are enabled.
func (c Config) HostUID() (int, error) {
if c.Namespaces.Contains(NEWUSER) {
@@ -21,7 +21,7 @@ func (c Config) HostUID() (int, error) {
return 0, nil
}
// Gets the root gid for the process on host which could be non-zero
// HostGID gets the root gid for the process on host which could be non-zero
// when user namespaces are enabled.
func (c Config) HostGID() (int, error) {
if c.Namespaces.Contains(NEWUSER) {

View File

@@ -35,6 +35,9 @@ type Device struct {
// Gid of the device.
Gid uint32 `json:"gid"`
// Write the file to the allowed list
Allow bool `json:"allow"`
}
func (d *Device) CgroupString() string {

View File

@@ -3,7 +3,7 @@
package configs
var (
// These are devices that are to be both allowed and created.
// DefaultSimpleDevices are devices that are to be both allowed and created.
DefaultSimpleDevices = []*Device{
// /dev/null and zero
{

View File

@@ -18,7 +18,7 @@ var namespaceInfo = map[NamespaceType]int{
}
// CloneFlags parses the container's Namespaces options to set the correct
// flags on clone, unshare. This functions returns flags only for new namespaces.
// flags on clone, unshare. This function returns flags only for new namespaces.
func (n *Namespaces) CloneFlags() uintptr {
var flag int
for _, v := range *n {

View File

@@ -8,7 +8,7 @@ func (n *Namespace) Syscall() int {
}
// CloneFlags parses the container's Namespaces options to set the correct
// flags on clone, unshare. This functions returns flags only for new namespaces.
// flags on clone, unshare. This function returns flags only for new namespaces.
func (n *Namespaces) CloneFlags() uintptr {
panic("No namespace syscall support")
return uintptr(0)

View File

@@ -2,7 +2,11 @@
package configs
import "fmt"
import (
"fmt"
"os"
"sync"
)
const (
NEWNET NamespaceType = "NEWNET"
@@ -13,6 +17,51 @@ const (
NEWUSER NamespaceType = "NEWUSER"
)
var (
nsLock sync.Mutex
supportedNamespaces = make(map[NamespaceType]bool)
)
// nsToFile converts the namespace type to its filename
func nsToFile(ns NamespaceType) string {
switch ns {
case NEWNET:
return "net"
case NEWNS:
return "mnt"
case NEWPID:
return "pid"
case NEWIPC:
return "ipc"
case NEWUSER:
return "user"
case NEWUTS:
return "uts"
}
return ""
}
// IsNamespaceSupported returns whether a namespace is available or
// not
func IsNamespaceSupported(ns NamespaceType) bool {
nsLock.Lock()
defer nsLock.Unlock()
supported, ok := supportedNamespaces[ns]
if ok {
return supported
}
nsFile := nsToFile(ns)
// if the namespace type is unknown, just return false
if nsFile == "" {
return false
}
_, err := os.Stat(fmt.Sprintf("/proc/self/ns/%s", nsFile))
// a namespace is supported if it exists and we have permissions to read it
supported = err == nil
supportedNamespaces[ns] = supported
return supported
}
func NamespaceTypes() []NamespaceType {
return []NamespaceType{
NEWNET,
@@ -35,26 +84,7 @@ func (n *Namespace) GetPath(pid int) string {
if n.Path != "" {
return n.Path
}
return fmt.Sprintf("/proc/%d/ns/%s", pid, n.file())
}
func (n *Namespace) file() string {
file := ""
switch n.Type {
case NEWNET:
file = "net"
case NEWNS:
file = "mnt"
case NEWPID:
file = "pid"
case NEWIPC:
file = "ipc"
case NEWUSER:
file = "user"
case NEWUTS:
file = "uts"
}
return file
return fmt.Sprintf("/proc/%d/ns/%s", pid, nsToFile(n.Type))
}
func (n *Namespaces) Remove(t NamespaceType) bool {
@@ -87,3 +117,11 @@ func (n *Namespaces) index(t NamespaceType) int {
func (n *Namespaces) Contains(t NamespaceType) bool {
return n.index(t) != -1
}
func (n *Namespaces) PathOf(t NamespaceType) string {
i := n.index(t)
if i == -1 {
return ""
}
return (*n)[i].Path
}

View File

@@ -4,8 +4,10 @@ import (
"fmt"
"os"
"path/filepath"
"strings"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/selinux"
)
type Validator interface {
@@ -35,10 +37,13 @@ func (v *ConfigValidator) Validate(config *configs.Config) error {
if err := v.usernamespace(config); err != nil {
return err
}
if err := v.sysctl(config); err != nil {
return err
}
return nil
}
// rootfs validates the the rootfs is an absolute path and is not a symlink
// rootfs validates if the rootfs is an absolute path and is not a symlink
// to the container's root filesystem.
func (v *ConfigValidator) rootfs(config *configs.Config) error {
cleaned, err := filepath.Abs(config.Rootfs)
@@ -48,7 +53,7 @@ func (v *ConfigValidator) rootfs(config *configs.Config) error {
if cleaned, err = filepath.EvalSymlinks(cleaned); err != nil {
return err
}
if config.Rootfs != cleaned {
if filepath.Clean(config.Rootfs) != cleaned {
return fmt.Errorf("%s is not an absolute path or is a symlink", config.Rootfs)
}
return nil
@@ -76,6 +81,10 @@ func (v *ConfigValidator) security(config *configs.Config) error {
!config.Namespaces.Contains(configs.NEWNS) {
return fmt.Errorf("unable to restrict sys entries without a private MNT namespace")
}
if config.ProcessLabel != "" && !selinux.SelinuxEnabled() {
return fmt.Errorf("selinux label is specified in config, but selinux is disabled or not supported")
}
return nil
}
@@ -91,3 +100,39 @@ func (v *ConfigValidator) usernamespace(config *configs.Config) error {
}
return nil
}
// sysctl validates that the specified sysctl keys are valid or not.
// /proc/sys isn't completely namespaced and depending on which namespaces
// are specified, a subset of sysctls are permitted.
func (v *ConfigValidator) sysctl(config *configs.Config) error {
validSysctlMap := map[string]bool{
"kernel.msgmax": true,
"kernel.msgmnb": true,
"kernel.msgmni": true,
"kernel.sem": true,
"kernel.shmall": true,
"kernel.shmmax": true,
"kernel.shmmni": true,
"kernel.shm_rmid_forced": true,
}
for s := range config.Sysctl {
if validSysctlMap[s] || strings.HasPrefix(s, "fs.mqueue.") {
if config.Namespaces.Contains(configs.NEWIPC) {
continue
} else {
return fmt.Errorf("sysctl %q is not allowed in the hosts ipc namespace", s)
}
}
if strings.HasPrefix(s, "net.") {
if config.Namespaces.Contains(configs.NEWNET) {
continue
} else {
return fmt.Errorf("sysctl %q is not allowed in the hosts network namespace", s)
}
}
return fmt.Errorf("sysctl %q is not in a separate kernel namespace", s)
}
return nil
}

View File

@@ -0,0 +1,11 @@
package libcontainer
import (
"errors"
)
// NewConsole returns an initalized console that can be used within a container by copying bytes
// from the master side to the slave that is attached as the tty for the container's init process.
func NewConsole(uid, gid int) (Console, error) {
return nil, errors.New("libcontainer console is not supported on Solaris")
}

View File

@@ -1,4 +1,4 @@
// Libcontainer provides a native Go implementation for creating containers
// Package libcontainer provides a native Go implementation for creating containers
// with namespaces, cgroups, capabilities, and filesystem access controls.
// It allows you to manage the lifecycle of the container performing additional operations
// after the container is created.
@@ -6,31 +6,25 @@ package libcontainer
import (
"os"
"time"
"github.com/opencontainers/runc/libcontainer/configs"
)
// The status of a container.
// Status is the status of a container.
type Status int
const (
// The container exists but has not been run yet
// Created is the status that denotes the container exists but has not been run yet.
Created Status = iota
// The container exists and is running.
// Running is the status that denotes the container exists and is running.
Running
// The container exists, it is in the process of being paused.
// Pausing is the status that denotes the container exists, it is in the process of being paused.
Pausing
// The container exists, but all its processes are paused.
// Paused is the status that denotes the container exists, but all its processes are paused.
Paused
// The container exists, but its state is saved on disk
Checkpointed
// The container does not exist.
Destroyed
// Stopped is the status that denotes the container does not have a created or running process.
Stopped
)
func (s Status) String() string {
@@ -43,10 +37,8 @@ func (s Status) String() string {
return "pausing"
case Paused:
return "paused"
case Checkpointed:
return "checkpointed"
case Destroyed:
return "destroyed"
case Stopped:
return "stopped"
default:
return "unknown"
}
@@ -61,14 +53,17 @@ type BaseState struct {
// InitProcessPid is the init process id in the parent namespace.
InitProcessPid int `json:"init_process_pid"`
// InitProcessStartTime is the init process start time.
// InitProcessStartTime is the init process start time in clock cycles since boot time.
InitProcessStartTime string `json:"init_process_start"`
// Created is the unix timestamp for the creation time of the container in UTC
Created time.Time `json:"created"`
// Config is the container's configuration.
Config configs.Config `json:"config"`
}
// A libcontainer container object.
// BaseContainer is a libcontainer container object.
//
// Each container is thread-safe within the same process. Since a container can
// be destroyed by a separate process, any function may return that the container
@@ -81,13 +76,13 @@ type BaseContainer interface {
//
// errors:
// ContainerDestroyed - Container no longer exists,
// Systemerror - System error.
// SystemError - System error.
Status() (Status, error)
// State returns the current container's state information.
//
// errors:
// Systemerror - System error.
// SystemError - System error.
State() (*State, error)
// Returns the current config of the container.
@@ -97,7 +92,7 @@ type BaseContainer interface {
//
// errors:
// ContainerDestroyed - Container no longer exists,
// Systemerror - System error.
// SystemError - System error.
//
// Some of the returned PIDs may no longer refer to processes in the Container, unless
// the Container state is PAUSED in which case every PID in the slice is valid.
@@ -107,7 +102,7 @@ type BaseContainer interface {
//
// errors:
// ContainerDestroyed - Container no longer exists,
// Systemerror - System error.
// SystemError - System error.
Stats() (*Stats, error)
// Set resources of container as configured
@@ -115,7 +110,7 @@ type BaseContainer interface {
// We can use this to change resources when containers are running.
//
// errors:
// Systemerror - System error.
// SystemError - System error.
Set(config configs.Config) error
// Start a process inside the container. Returns error if process fails to
@@ -125,21 +120,38 @@ type BaseContainer interface {
// ContainerDestroyed - Container no longer exists,
// ConfigInvalid - config is invalid,
// ContainerPaused - Container is paused,
// Systemerror - System error.
// SystemError - System error.
Start(process *Process) (err error)
// Run immediatly starts the process inside the conatiner. Returns error if process
// fails to start. It does not block waiting for the exec fifo after start returns but
// opens the fifo after start returns.
//
// errors:
// ContainerDestroyed - Container no longer exists,
// ConfigInvalid - config is invalid,
// ContainerPaused - Container is paused,
// SystemError - System error.
Run(process *Process) (err error)
// Destroys the container after killing all running processes.
//
// Any event registrations are removed before the container is destroyed.
// No error is returned if the container is already destroyed.
//
// errors:
// Systemerror - System error.
// SystemError - System error.
Destroy() error
// Signal sends the provided signal code to the container's initial process.
//
// errors:
// Systemerror - System error.
// SystemError - System error.
Signal(s os.Signal) error
// Exec signals the container to exec the users process at the end of the init.
//
// errors:
// SystemError - System error.
Exec() error
}

View File

@@ -15,30 +15,35 @@ import (
"strings"
"sync"
"syscall"
"time"
"github.com/Sirupsen/logrus"
"github.com/golang/protobuf/proto"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/criurpc"
"github.com/opencontainers/runc/libcontainer/system"
"github.com/opencontainers/runc/libcontainer/utils"
"github.com/syndtr/gocapability/capability"
"github.com/vishvananda/netlink/nl"
)
const stdioFdCount = 3
type linuxContainer struct {
id string
root string
config *configs.Config
cgroupManager cgroups.Manager
initPath string
initArgs []string
initProcess parentProcess
criuPath string
m sync.Mutex
criuVersion int
state containerState
id string
root string
config *configs.Config
cgroupManager cgroups.Manager
initPath string
initArgs []string
initProcess parentProcess
initProcessStartTime string
criuPath string
m sync.Mutex
criuVersion int
state containerState
created time.Time
}
// State represents a running container's state
@@ -59,7 +64,7 @@ type State struct {
ExternalDescriptors []string `json:"external_descriptors,omitempty"`
}
// A libcontainer container object.
// Container is a libcontainer container object.
//
// Each container is thread-safe within the same process. Since a container can
// be destroyed by a separate process, any function may return that the container
@@ -75,13 +80,13 @@ type Container interface {
// Systemerror - System error.
Checkpoint(criuOpts *CriuOpts) error
// Restore restores the checkpointed container to a running state using the criu(8) utiity.
// Restore restores the checkpointed container to a running state using the criu(8) utility.
//
// errors:
// Systemerror - System error.
Restore(process *Process, criuOpts *CriuOpts) error
// If the Container state is RUNNING or PAUSING, sets the Container state to PAUSING and pauses
// If the Container state is RUNNING, sets the Container state to PAUSING and pauses
// the execution of any user processes. Asynchronously, when the container finished being paused the
// state is changed to PAUSED.
// If the Container state is PAUSED, do nothing.
@@ -138,7 +143,7 @@ func (c *linuxContainer) State() (*State, error) {
func (c *linuxContainer) Processes() ([]int, error) {
pids, err := c.cgroupManager.GetAllPids()
if err != nil {
return nil, newSystemError(err)
return nil, newSystemErrorWithCause(err, "getting all container pids from cgroups")
}
return pids, nil
}
@@ -149,14 +154,14 @@ func (c *linuxContainer) Stats() (*Stats, error) {
stats = &Stats{}
)
if stats.CgroupStats, err = c.cgroupManager.GetStats(); err != nil {
return stats, newSystemError(err)
return stats, newSystemErrorWithCause(err, "getting container stats from cgroups")
}
for _, iface := range c.config.Networks {
switch iface.Type {
case "veth":
istats, err := getNetworkInterfaceStats(iface.HostInterfaceName)
if err != nil {
return stats, newSystemError(err)
return stats, newSystemErrorWithCausef(err, "getting network stats for interface %q", iface.HostInterfaceName)
}
stats.Interfaces = append(stats.Interfaces, istats)
}
@@ -167,6 +172,13 @@ func (c *linuxContainer) Stats() (*Stats, error) {
func (c *linuxContainer) Set(config configs.Config) error {
c.m.Lock()
defer c.m.Unlock()
status, err := c.currentStatus()
if err != nil {
return err
}
if status == Stopped {
return newGenericError(fmt.Errorf("container not running"), ContainerNotRunning)
}
c.config = &config
return c.cgroupManager.Set(c.config)
}
@@ -178,38 +190,90 @@ func (c *linuxContainer) Start(process *Process) error {
if err != nil {
return err
}
doInit := status == Destroyed
parent, err := c.newParentProcess(process, doInit)
return c.start(process, status == Stopped)
}
func (c *linuxContainer) Run(process *Process) error {
c.m.Lock()
defer c.m.Unlock()
status, err := c.currentStatus()
if err != nil {
return newSystemError(err)
return err
}
if err := c.start(process, status == Stopped); err != nil {
return err
}
if status == Stopped {
return c.exec()
}
return nil
}
func (c *linuxContainer) Exec() error {
c.m.Lock()
defer c.m.Unlock()
return c.exec()
}
func (c *linuxContainer) exec() error {
path := filepath.Join(c.root, execFifoFilename)
f, err := os.OpenFile(path, os.O_RDONLY, 0)
if err != nil {
return newSystemErrorWithCause(err, "open exec fifo for reading")
}
defer f.Close()
data, err := ioutil.ReadAll(f)
if err != nil {
return err
}
if len(data) > 0 {
os.Remove(path)
return nil
}
return fmt.Errorf("cannot start an already running container")
}
func (c *linuxContainer) start(process *Process, isInit bool) error {
parent, err := c.newParentProcess(process, isInit)
if err != nil {
return newSystemErrorWithCause(err, "creating new parent process")
}
if err := parent.start(); err != nil {
// terminate the process to ensure that it properly is reaped.
if err := parent.terminate(); err != nil {
logrus.Warn(err)
}
return newSystemError(err)
return newSystemErrorWithCause(err, "starting container process")
}
// generate a timestamp indicating when the container was started
c.created = time.Now().UTC()
c.state = &runningState{
c: c,
}
if doInit {
if err := c.updateState(parent); err != nil {
if isInit {
c.state = &createdState{
c: c,
}
state, err := c.updateState(parent)
if err != nil {
return err
}
c.initProcessStartTime = state.InitProcessStartTime
if c.config.Hooks != nil {
s := configs.HookState{
Version: c.config.Version,
ID: c.id,
Pid: parent.pid(),
Root: c.config.Rootfs,
Version: c.config.Version,
ID: c.id,
Pid: parent.pid(),
Root: c.config.Rootfs,
BundlePath: utils.SearchLabels(c.config.Labels, "bundle"),
}
for _, hook := range c.config.Hooks.Poststart {
for i, hook := range c.config.Hooks.Poststart {
if err := hook.Run(s); err != nil {
if err := parent.terminate(); err != nil {
logrus.Warn(err)
}
return newSystemError(err)
return newSystemErrorWithCausef(err, "running poststart hook %d", i)
}
}
}
@@ -219,7 +283,7 @@ func (c *linuxContainer) Start(process *Process) error {
func (c *linuxContainer) Signal(s os.Signal) error {
if err := c.initProcess.signal(s); err != nil {
return newSystemError(err)
return newSystemErrorWithCause(err, "signaling init process")
}
return nil
}
@@ -227,19 +291,23 @@ func (c *linuxContainer) Signal(s os.Signal) error {
func (c *linuxContainer) newParentProcess(p *Process, doInit bool) (parentProcess, error) {
parentPipe, childPipe, err := newPipe()
if err != nil {
return nil, newSystemError(err)
return nil, newSystemErrorWithCause(err, "creating new init pipe")
}
cmd, err := c.commandTemplate(p, childPipe)
rootDir, err := os.Open(c.root)
if err != nil {
return nil, newSystemError(err)
return nil, err
}
cmd, err := c.commandTemplate(p, childPipe, rootDir)
if err != nil {
return nil, newSystemErrorWithCause(err, "creating new command template")
}
if !doInit {
return c.newSetnsProcess(p, cmd, parentPipe, childPipe)
return c.newSetnsProcess(p, cmd, parentPipe, childPipe, rootDir)
}
return c.newInitProcess(p, cmd, parentPipe, childPipe)
return c.newInitProcess(p, cmd, parentPipe, childPipe, rootDir)
}
func (c *linuxContainer) commandTemplate(p *Process, childPipe *os.File) (*exec.Cmd, error) {
func (c *linuxContainer) commandTemplate(p *Process, childPipe, rootDir *os.File) (*exec.Cmd, error) {
cmd := &exec.Cmd{
Path: c.initPath,
Args: c.initArgs,
@@ -251,8 +319,10 @@ func (c *linuxContainer) commandTemplate(p *Process, childPipe *os.File) (*exec.
if cmd.SysProcAttr == nil {
cmd.SysProcAttr = &syscall.SysProcAttr{}
}
cmd.ExtraFiles = append(p.ExtraFiles, childPipe)
cmd.Env = append(cmd.Env, fmt.Sprintf("_LIBCONTAINER_INITPIPE=%d", stdioFdCount+len(cmd.ExtraFiles)-1))
cmd.ExtraFiles = append(p.ExtraFiles, childPipe, rootDir)
cmd.Env = append(cmd.Env,
fmt.Sprintf("_LIBCONTAINER_INITPIPE=%d", stdioFdCount+len(cmd.ExtraFiles)-2),
fmt.Sprintf("_LIBCONTAINER_STATEDIR=%d", stdioFdCount+len(cmd.ExtraFiles)-1))
// NOTE: when running a container with no PID namespace and the parent process spawning the container is
// PID1 the pdeathsig is being delivered to the container's init process by the kernel for some reason
// even with the parent still running.
@@ -262,38 +332,42 @@ func (c *linuxContainer) commandTemplate(p *Process, childPipe *os.File) (*exec.
return cmd, nil
}
func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe *os.File) (*initProcess, error) {
t := "_LIBCONTAINER_INITTYPE=" + string(initStandard)
cloneFlags := c.config.Namespaces.CloneFlags()
if cloneFlags&syscall.CLONE_NEWUSER != 0 {
if err := c.addUidGidMappings(cmd.SysProcAttr); err != nil {
// user mappings are not supported
return nil, err
}
enableSetgroups(cmd.SysProcAttr)
// Default to root user when user namespaces are enabled.
if cmd.SysProcAttr.Credential == nil {
cmd.SysProcAttr.Credential = &syscall.Credential{}
func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe, rootDir *os.File) (*initProcess, error) {
cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initStandard))
nsMaps := make(map[configs.NamespaceType]string)
for _, ns := range c.config.Namespaces {
if ns.Path != "" {
nsMaps[ns.Type] = ns.Path
}
}
cmd.Env = append(cmd.Env, t)
cmd.SysProcAttr.Cloneflags = cloneFlags
_, sharePidns := nsMaps[configs.NEWPID]
data, err := c.bootstrapData(c.config.Namespaces.CloneFlags(), nsMaps, "")
if err != nil {
return nil, err
}
return &initProcess{
cmd: cmd,
childPipe: childPipe,
parentPipe: parentPipe,
manager: c.cgroupManager,
config: c.newInitConfig(p),
container: c,
process: p,
cmd: cmd,
childPipe: childPipe,
parentPipe: parentPipe,
manager: c.cgroupManager,
config: c.newInitConfig(p),
container: c,
process: p,
bootstrapData: data,
sharePidns: sharePidns,
rootDir: rootDir,
}, nil
}
func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe *os.File) (*setnsProcess, error) {
func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe, rootDir *os.File) (*setnsProcess, error) {
cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initSetns))
state, err := c.currentState()
if err != nil {
return nil, newSystemErrorWithCause(err, "getting container's current state")
}
// for setns process, we dont have to set cloneflags as the process namespaces
// will only be set via setns syscall
data, err := c.bootstrapData(0, c.initProcess.pid(), p.consolePath)
data, err := c.bootstrapData(0, state.NamespacePaths, p.consolePath)
if err != nil {
return nil, err
}
@@ -306,11 +380,12 @@ func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, parentPipe,
config: c.newInitConfig(p),
process: p,
bootstrapData: data,
rootDir: rootDir,
}, nil
}
func (c *linuxContainer) newInitConfig(process *Process) *initConfig {
return &initConfig{
cfg := &initConfig{
Config: c.config,
Args: process.Args,
Env: process.Env,
@@ -319,7 +394,26 @@ func (c *linuxContainer) newInitConfig(process *Process) *initConfig {
Console: process.consolePath,
Capabilities: process.Capabilities,
PassedFilesCount: len(process.ExtraFiles),
ContainerId: c.ID(),
NoNewPrivileges: c.config.NoNewPrivileges,
AppArmorProfile: c.config.AppArmorProfile,
ProcessLabel: c.config.ProcessLabel,
Rlimits: c.config.Rlimits,
ExecFifoPath: filepath.Join(c.root, execFifoFilename),
}
if process.NoNewPrivileges != nil {
cfg.NoNewPrivileges = *process.NoNewPrivileges
}
if process.AppArmorProfile != "" {
cfg.AppArmorProfile = process.AppArmorProfile
}
if process.Label != "" {
cfg.ProcessLabel = process.Label
}
if len(process.Rlimits) > 0 {
cfg.Rlimits = process.Rlimits
}
return cfg
}
func newPipe() (parent *os.File, child *os.File, err error) {
@@ -343,15 +437,16 @@ func (c *linuxContainer) Pause() error {
if err != nil {
return err
}
if status != Running {
return newGenericError(fmt.Errorf("container not running"), ContainerNotRunning)
switch status {
case Running, Created:
if err := c.cgroupManager.Freeze(configs.Frozen); err != nil {
return err
}
return c.state.transition(&pausedState{
c: c,
})
}
if err := c.cgroupManager.Freeze(configs.Frozen); err != nil {
return err
}
return c.state.transition(&pausedState{
c: c,
})
return newGenericError(fmt.Errorf("container not running: %s", status), ContainerNotRunning)
}
func (c *linuxContainer) Resume() error {
@@ -380,23 +475,13 @@ func (c *linuxContainer) NotifyMemoryPressure(level PressureLevel) (<-chan struc
return notifyMemoryPressure(c.cgroupManager.GetPaths(), level)
}
// XXX debug support, remove when debugging done.
func addArgsFromEnv(evar string, args *[]string) {
if e := os.Getenv(evar); e != "" {
for _, f := range strings.Fields(e) {
*args = append(*args, f)
}
}
fmt.Printf(">>> criu %v\n", *args)
}
// check Criu version greater than or equal to min_version
func (c *linuxContainer) checkCriuVersion(min_version string) error {
// checkCriuVersion checks Criu version greater than or equal to minVersion
func (c *linuxContainer) checkCriuVersion(minVersion string) error {
var x, y, z, versionReq int
_, err := fmt.Sscanf(min_version, "%d.%d.%d\n", &x, &y, &z) // 1.5.2
_, err := fmt.Sscanf(minVersion, "%d.%d.%d\n", &x, &y, &z) // 1.5.2
if err != nil {
_, err = fmt.Sscanf(min_version, "Version: %d.%d\n", &x, &y) // 1.6
_, err = fmt.Sscanf(minVersion, "Version: %d.%d\n", &x, &y) // 1.6
}
versionReq = x*10000 + y*100 + z
@@ -441,7 +526,7 @@ func (c *linuxContainer) checkCriuVersion(min_version string) error {
c.criuVersion = x*10000 + y*100 + z
if c.criuVersion < versionReq {
return fmt.Errorf("CRIU version must be %s or higher", min_version)
return fmt.Errorf("CRIU version must be %s or higher", minVersion)
}
return nil
@@ -514,6 +599,7 @@ func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error {
TcpEstablished: proto.Bool(criuOpts.TcpEstablished),
ExtUnixSk: proto.Bool(criuOpts.ExternalUnixConnections),
FileLocks: proto.Bool(criuOpts.FileLocks),
EmptyNs: proto.Uint32(criuOpts.EmptyNs),
}
// append optional criu opts, e.g., page-server and port
@@ -529,7 +615,8 @@ func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error {
if err := c.checkCriuVersion("1.7"); err != nil {
return err
}
rpcOpts.ManageCgroupsMode = proto.Uint32(uint32(criuOpts.ManageCgroupsMode))
mode := criurpc.CriuCgMode(criuOpts.ManageCgroupsMode)
rpcOpts.ManageCgroupsMode = &mode
}
t := criurpc.CriuReqType_DUMP
@@ -587,6 +674,27 @@ func (c *linuxContainer) addCriuRestoreMount(req *criurpc.CriuReq, m *configs.Mo
req.Opts.ExtMnt = append(req.Opts.ExtMnt, extMnt)
}
func (c *linuxContainer) restoreNetwork(req *criurpc.CriuReq, criuOpts *CriuOpts) {
for _, iface := range c.config.Networks {
switch iface.Type {
case "veth":
veth := new(criurpc.CriuVethPair)
veth.IfOut = proto.String(iface.HostInterfaceName)
veth.IfIn = proto.String(iface.Name)
req.Opts.Veths = append(req.Opts.Veths, veth)
break
case "loopback":
break
}
}
for _, i := range criuOpts.VethPairs {
veth := new(criurpc.CriuVethPair)
veth.IfOut = proto.String(i.HostInterfaceName)
veth.IfIn = proto.String(i.ContainerInterfaceName)
req.Opts.Veths = append(req.Opts.Veths, veth)
}
}
func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
c.m.Lock()
defer c.m.Unlock()
@@ -650,6 +758,7 @@ func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
ExtUnixSk: proto.Bool(criuOpts.ExternalUnixConnections),
TcpEstablished: proto.Bool(criuOpts.TcpEstablished),
FileLocks: proto.Bool(criuOpts.FileLocks),
EmptyNs: proto.Uint32(criuOpts.EmptyNs),
},
}
@@ -669,23 +778,9 @@ func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
break
}
}
for _, iface := range c.config.Networks {
switch iface.Type {
case "veth":
veth := new(criurpc.CriuVethPair)
veth.IfOut = proto.String(iface.HostInterfaceName)
veth.IfIn = proto.String(iface.Name)
req.Opts.Veths = append(req.Opts.Veths, veth)
break
case "loopback":
break
}
}
for _, i := range criuOpts.VethPairs {
veth := new(criurpc.CriuVethPair)
veth.IfOut = proto.String(i.HostInterfaceName)
veth.IfIn = proto.String(i.ContainerInterfaceName)
req.Opts.Veths = append(req.Opts.Veths, veth)
if criuOpts.EmptyNs&syscall.CLONE_NEWNET == 0 {
c.restoreNetwork(req, criuOpts)
}
// append optional manage cgroups mode
@@ -693,7 +788,8 @@ func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
if err := c.checkCriuVersion("1.7"); err != nil {
return err
}
req.Opts.ManageCgroupsMode = proto.Uint32(uint32(criuOpts.ManageCgroupsMode))
mode := criurpc.CriuCgMode(criuOpts.ManageCgroupsMode)
req.Opts.ManageCgroupsMode = &mode
}
var (
@@ -850,7 +946,7 @@ func (c *linuxContainer) criuSwrk(process *Process, req *criurpc.CriuReq, opts *
if err != nil {
return err
}
n, err = criuClient.Write(data)
_, err = criuClient.Write(data)
if err != nil {
return err
}
@@ -925,6 +1021,20 @@ func (c *linuxContainer) criuNotifications(resp *criurpc.CriuResp, process *Proc
if err := lockNetwork(c.config); err != nil {
return err
}
case notify.GetScript() == "setup-namespaces":
if c.config.Hooks != nil {
s := configs.HookState{
Version: c.config.Version,
ID: c.id,
Pid: int(notify.GetPid()),
Root: c.config.Rootfs,
}
for i, hook := range c.config.Hooks.Prestart {
if err := hook.Run(s); err != nil {
return newSystemErrorWithCausef(err, "running prestart hook %d", i)
}
}
}
case notify.GetScript() == "post-restore":
pid := notify.GetPid()
r, err := newRestoredProcess(int(pid), fds)
@@ -938,7 +1048,7 @@ func (c *linuxContainer) criuNotifications(resp *criurpc.CriuResp, process *Proc
}); err != nil {
return err
}
if err := c.updateState(r); err != nil {
if _, err := c.updateState(r); err != nil {
return err
}
if err := os.Remove(filepath.Join(c.root, "checkpoint")); err != nil {
@@ -950,13 +1060,17 @@ func (c *linuxContainer) criuNotifications(resp *criurpc.CriuResp, process *Proc
return nil
}
func (c *linuxContainer) updateState(process parentProcess) error {
func (c *linuxContainer) updateState(process parentProcess) (*State, error) {
c.initProcess = process
state, err := c.currentState()
if err != nil {
return err
return nil, err
}
return c.saveState(state)
err = c.saveState(state)
if err != nil {
return nil, err
}
return state, nil
}
func (c *linuxContainer) saveState(s *State) error {
@@ -991,37 +1105,75 @@ func (c *linuxContainer) refreshState() error {
if paused {
return c.state.transition(&pausedState{c: c})
}
running, err := c.isRunning()
t, err := c.runType()
if err != nil {
return err
}
if running {
switch t {
case Created:
return c.state.transition(&createdState{c: c})
case Running:
return c.state.transition(&runningState{c: c})
}
return c.state.transition(&stoppedState{c: c})
}
func (c *linuxContainer) isRunning() (bool, error) {
if c.initProcess == nil {
// doesInitProcessExist checks if the init process is still the same process
// as the initial one, it could happen that the original process has exited
// and a new process has been created with the same pid, in this case, the
// container would already be stopped.
func (c *linuxContainer) doesInitProcessExist(initPid int) (bool, error) {
startTime, err := system.GetProcessStartTime(initPid)
if err != nil {
return false, newSystemErrorWithCausef(err, "getting init process %d start time", initPid)
}
if c.initProcessStartTime != startTime {
return false, nil
}
// return Running if the init process is alive
if err := syscall.Kill(c.initProcess.pid(), 0); err != nil {
if err == syscall.ESRCH {
return false, nil
}
return false, newSystemError(err)
}
return true, nil
}
func (c *linuxContainer) runType() (Status, error) {
if c.initProcess == nil {
return Stopped, nil
}
pid := c.initProcess.pid()
// return Running if the init process is alive
if err := syscall.Kill(pid, 0); err != nil {
if err == syscall.ESRCH {
// It means the process does not exist anymore, could happen when the
// process exited just when we call the function, we should not return
// error in this case.
return Stopped, nil
}
return Stopped, newSystemErrorWithCausef(err, "sending signal 0 to pid %d", pid)
}
// check if the process is still the original init process.
exist, err := c.doesInitProcessExist(pid)
if !exist || err != nil {
return Stopped, err
}
// check if the process that is running is the init process or the user's process.
// this is the difference between the container Running and Created.
environ, err := ioutil.ReadFile(fmt.Sprintf("/proc/%d/environ", pid))
if err != nil {
return Stopped, newSystemErrorWithCausef(err, "reading /proc/%d/environ", pid)
}
check := []byte("_LIBCONTAINER")
if bytes.Contains(environ, check) {
return Created, nil
}
return Running, nil
}
func (c *linuxContainer) isPaused() (bool, error) {
data, err := ioutil.ReadFile(filepath.Join(c.cgroupManager.GetPaths()["freezer"], "freezer.state"))
if err != nil {
// If freezer cgroup is not mounted, the container would just be not paused.
if os.IsNotExist(err) {
return false, nil
}
return false, newSystemError(err)
return false, newSystemErrorWithCause(err, "checking if container is paused")
}
return bytes.Equal(bytes.TrimSpace(data), []byte("FROZEN")), nil
}
@@ -1043,6 +1195,7 @@ func (c *linuxContainer) currentState() (*State, error) {
Config: *c.config,
InitProcessPid: pid,
InitProcessStartTime: startTime,
Created: c.created,
},
CgroupPaths: c.cgroupManager.GetPaths(),
NamespacePaths: make(map[configs.NamespaceType]string),
@@ -1053,6 +1206,9 @@ func (c *linuxContainer) currentState() (*State, error) {
state.NamespacePaths[ns.Type] = ns.GetPath(pid)
}
for _, nsType := range configs.NamespaceTypes() {
if !configs.IsNamespaceSupported(nsType) {
continue
}
if _, ok := state.NamespacePaths[nsType]; !ok {
ns := configs.Namespace{Type: nsType}
state.NamespacePaths[ns.Type] = ns.GetPath(pid)
@@ -1062,18 +1218,69 @@ func (c *linuxContainer) currentState() (*State, error) {
return state, nil
}
// bootstrapData encodes the necessary data in netlink binary format as a io.Reader.
// Consumer can write the data to a bootstrap program such as one that uses
// nsenter package to bootstrap the container's init process correctly, i.e. with
// correct namespaces, uid/gid mapping etc.
func (c *linuxContainer) bootstrapData(cloneFlags uintptr, pid int, consolePath string) (io.Reader, error) {
// orderNamespacePaths sorts namespace paths into a list of paths that we
// can setns in order.
func (c *linuxContainer) orderNamespacePaths(namespaces map[configs.NamespaceType]string) ([]string, error) {
paths := []string{}
nsTypes := []configs.NamespaceType{
configs.NEWIPC,
configs.NEWUTS,
configs.NEWNET,
configs.NEWPID,
configs.NEWNS,
}
// join userns if the init process explicitly requires NEWUSER
if c.config.Namespaces.Contains(configs.NEWUSER) {
nsTypes = append(nsTypes, configs.NEWUSER)
}
for _, nsType := range nsTypes {
if p, ok := namespaces[nsType]; ok && p != "" {
// check if the requested namespace is supported
if !configs.IsNamespaceSupported(nsType) {
return nil, newSystemError(fmt.Errorf("namespace %s is not supported", nsType))
}
// only set to join this namespace if it exists
if _, err := os.Lstat(p); err != nil {
return nil, newSystemErrorWithCausef(err, "running lstat on namespace path %q", p)
}
// do not allow namespace path with comma as we use it to separate
// the namespace paths
if strings.ContainsRune(p, ',') {
return nil, newSystemError(fmt.Errorf("invalid path %s", p))
}
paths = append(paths, p)
}
}
return paths, nil
}
func encodeIDMapping(idMap []configs.IDMap) ([]byte, error) {
data := bytes.NewBuffer(nil)
for _, im := range idMap {
line := fmt.Sprintf("%d %d %d\n", im.ContainerID, im.HostID, im.Size)
if _, err := data.WriteString(line); err != nil {
return nil, err
}
}
return data.Bytes(), nil
}
// bootstrapData encodes the necessary data in netlink binary format
// as a io.Reader.
// Consumer can write the data to a bootstrap program
// such as one that uses nsenter package to bootstrap the container's
// init process correctly, i.e. with correct namespaces, uid/gid
// mapping etc.
func (c *linuxContainer) bootstrapData(cloneFlags uintptr, nsMaps map[configs.NamespaceType]string, consolePath string) (io.Reader, error) {
// create the netlink message
r := nl.NewNetlinkRequest(int(InitMsg), 0)
// write pid
// write cloneFlags
r.AddData(&Int32msg{
Type: PidAttr,
Value: uint32(pid),
Type: CloneFlagsAttr,
Value: uint32(cloneFlags),
})
// write console path
if consolePath != "" {
r.AddData(&Bytemsg{
@@ -1081,5 +1288,57 @@ func (c *linuxContainer) bootstrapData(cloneFlags uintptr, pid int, consolePath
Value: []byte(consolePath),
})
}
// write custom namespace paths
if len(nsMaps) > 0 {
nsPaths, err := c.orderNamespacePaths(nsMaps)
if err != nil {
return nil, err
}
r.AddData(&Bytemsg{
Type: NsPathsAttr,
Value: []byte(strings.Join(nsPaths, ",")),
})
}
// write namespace paths only when we are not joining an existing user ns
_, joinExistingUser := nsMaps[configs.NEWUSER]
if !joinExistingUser {
// write uid mappings
if len(c.config.UidMappings) > 0 {
b, err := encodeIDMapping(c.config.UidMappings)
if err != nil {
return nil, err
}
r.AddData(&Bytemsg{
Type: UidmapAttr,
Value: b,
})
}
// write gid mappings
if len(c.config.GidMappings) > 0 {
b, err := encodeIDMapping(c.config.GidMappings)
if err != nil {
return nil, err
}
r.AddData(&Bytemsg{
Type: GidmapAttr,
Value: b,
})
// check if we have CAP_SETGID to setgroup properly
pid, err := capability.NewPid(os.Getpid())
if err != nil {
return nil, err
}
if !pid.Get(capability.EFFECTIVE, capability.CAP_SETGID) {
r.AddData(&Boolmsg{
Type: SetgroupAttr,
Value: true,
})
}
}
}
return bytes.NewReader(r.Serialize()), nil
}

View File

@@ -1,13 +0,0 @@
// +build !go1.4
package libcontainer
import (
"fmt"
"syscall"
)
// not available before go 1.4
func (c *linuxContainer) addUidGidMappings(sys *syscall.SysProcAttr) error {
return fmt.Errorf("User namespace is not supported in golang < 1.4")
}

View File

@@ -0,0 +1,20 @@
package libcontainer
// State represents a running container's state
type State struct {
BaseState
// Platform specific fields below here
}
// A libcontainer container object.
//
// Each container is thread-safe within the same process. Since a container can
// be destroyed by a separate process, any function may return that the container
// was not found.
type Container interface {
BaseContainer
// Methods below here are platform specific
}

View File

@@ -1,26 +0,0 @@
// +build go1.4
package libcontainer
import "syscall"
// Converts IDMap to SysProcIDMap array and adds it to SysProcAttr.
func (c *linuxContainer) addUidGidMappings(sys *syscall.SysProcAttr) error {
if c.config.UidMappings != nil {
sys.UidMappings = make([]syscall.SysProcIDMap, len(c.config.UidMappings))
for i, um := range c.config.UidMappings {
sys.UidMappings[i].ContainerID = um.ContainerID
sys.UidMappings[i].HostID = um.HostID
sys.UidMappings[i].Size = um.Size
}
}
if c.config.GidMappings != nil {
sys.GidMappings = make([]syscall.SysProcIDMap, len(c.config.GidMappings))
for i, gm := range c.config.GidMappings {
sys.GidMappings[i].ContainerID = gm.ContainerID
sys.GidMappings[i].HostID = gm.HostID
sys.GidMappings[i].Size = gm.Size
}
}
return nil
}

View File

@@ -3,13 +3,13 @@
package libcontainer
// cgroup restoring strategy provided by criu
type cg_mode uint32
type cgMode uint32
const (
CRIU_CG_MODE_SOFT cg_mode = 3 + iota // restore cgroup properties if only dir created by criu
CRIU_CG_MODE_FULL // always restore all cgroups and their properties
CRIU_CG_MODE_STRICT // restore all, requiring them to not present in the system
CRIU_CG_MODE_DEFAULT // the same as CRIU_CG_MODE_SOFT
CRIU_CG_MODE_SOFT cgMode = 3 + iota // restore cgroup properties if only dir created by criu
CRIU_CG_MODE_FULL // always restore all cgroups and their properties
CRIU_CG_MODE_STRICT // restore all, requiring them to not present in the system
CRIU_CG_MODE_DEFAULT // the same as CRIU_CG_MODE_SOFT
)
type CriuPageServerInfo struct {
@@ -32,5 +32,6 @@ type CriuOpts struct {
FileLocks bool // handle file locks, for safety
PageServer CriuPageServerInfo // allow to dump to criu page server
VethPairs []VethPairName // pass the veth to criu when restore
ManageCgroupsMode cg_mode // dump or restore cgroup mode
ManageCgroupsMode cgMode // dump or restore cgroup mode
EmptyNs uint32 // don't c/r properties for namespace from this mask
}

View File

@@ -19,6 +19,7 @@ It has these top-level messages:
CriuDumpResp
CriuRestoreResp
CriuNotify
CriuFeatures
CriuReq
CriuResp
*/
@@ -31,6 +32,54 @@ import math "math"
var _ = proto.Marshal
var _ = math.Inf
type CriuCgMode int32
const (
CriuCgMode_IGNORE CriuCgMode = 0
CriuCgMode_NONE CriuCgMode = 1
CriuCgMode_PROPS CriuCgMode = 2
CriuCgMode_SOFT CriuCgMode = 3
CriuCgMode_FULL CriuCgMode = 4
CriuCgMode_STRICT CriuCgMode = 5
CriuCgMode_DEFAULT CriuCgMode = 6
)
var CriuCgMode_name = map[int32]string{
0: "IGNORE",
1: "NONE",
2: "PROPS",
3: "SOFT",
4: "FULL",
5: "STRICT",
6: "DEFAULT",
}
var CriuCgMode_value = map[string]int32{
"IGNORE": 0,
"NONE": 1,
"PROPS": 2,
"SOFT": 3,
"FULL": 4,
"STRICT": 5,
"DEFAULT": 6,
}
func (x CriuCgMode) Enum() *CriuCgMode {
p := new(CriuCgMode)
*p = x
return p
}
func (x CriuCgMode) String() string {
return proto.EnumName(CriuCgMode_name, int32(x))
}
func (x *CriuCgMode) UnmarshalJSON(data []byte) error {
value, err := proto.UnmarshalJSONEnum(CriuCgMode_value, data, "CriuCgMode")
if err != nil {
return err
}
*x = CriuCgMode(value)
return nil
}
type CriuReqType int32
const (
@@ -43,6 +92,7 @@ const (
CriuReqType_NOTIFY CriuReqType = 6
CriuReqType_CPUINFO_DUMP CriuReqType = 7
CriuReqType_CPUINFO_CHECK CriuReqType = 8
CriuReqType_FEATURE_CHECK CriuReqType = 9
)
var CriuReqType_name = map[int32]string{
@@ -55,6 +105,7 @@ var CriuReqType_name = map[int32]string{
6: "NOTIFY",
7: "CPUINFO_DUMP",
8: "CPUINFO_CHECK",
9: "FEATURE_CHECK",
}
var CriuReqType_value = map[string]int32{
"EMPTY": 0,
@@ -66,6 +117,7 @@ var CriuReqType_value = map[string]int32{
"NOTIFY": 6,
"CPUINFO_DUMP": 7,
"CPUINFO_CHECK": 8,
"FEATURE_CHECK": 9,
}
func (x CriuReqType) Enum() *CriuReqType {
@@ -271,7 +323,12 @@ type CriuOpts struct {
SkipMnt []string `protobuf:"bytes,31,rep,name=skip_mnt" json:"skip_mnt,omitempty"`
EnableFs []string `protobuf:"bytes,32,rep,name=enable_fs" json:"enable_fs,omitempty"`
UnixSkIno []*UnixSk `protobuf:"bytes,33,rep,name=unix_sk_ino" json:"unix_sk_ino,omitempty"`
ManageCgroupsMode *uint32 `protobuf:"varint,34,opt,name=manage_cgroups_mode" json:"manage_cgroups_mode,omitempty"`
ManageCgroupsMode *CriuCgMode `protobuf:"varint,34,opt,name=manage_cgroups_mode,enum=CriuCgMode" json:"manage_cgroups_mode,omitempty"`
GhostLimit *uint32 `protobuf:"varint,35,opt,name=ghost_limit,def=1048576" json:"ghost_limit,omitempty"`
IrmapScanPaths []string `protobuf:"bytes,36,rep,name=irmap_scan_paths" json:"irmap_scan_paths,omitempty"`
External []string `protobuf:"bytes,37,rep,name=external" json:"external,omitempty"`
EmptyNs *uint32 `protobuf:"varint,38,opt,name=empty_ns" json:"empty_ns,omitempty"`
NoSeccomp *bool `protobuf:"varint,39,opt,name=no_seccomp" json:"no_seccomp,omitempty"`
XXX_unrecognized []byte `json:"-"`
}
@@ -281,6 +338,7 @@ func (*CriuOpts) ProtoMessage() {}
const Default_CriuOpts_LogLevel int32 = 2
const Default_CriuOpts_CpuCap uint32 = 4294967295
const Default_CriuOpts_GhostLimit uint32 = 1048576
func (m *CriuOpts) GetImagesDirFd() int32 {
if m != nil && m.ImagesDirFd != nil {
@@ -513,13 +571,48 @@ func (m *CriuOpts) GetUnixSkIno() []*UnixSk {
return nil
}
func (m *CriuOpts) GetManageCgroupsMode() uint32 {
func (m *CriuOpts) GetManageCgroupsMode() CriuCgMode {
if m != nil && m.ManageCgroupsMode != nil {
return *m.ManageCgroupsMode
}
return CriuCgMode_IGNORE
}
func (m *CriuOpts) GetGhostLimit() uint32 {
if m != nil && m.GhostLimit != nil {
return *m.GhostLimit
}
return Default_CriuOpts_GhostLimit
}
func (m *CriuOpts) GetIrmapScanPaths() []string {
if m != nil {
return m.IrmapScanPaths
}
return nil
}
func (m *CriuOpts) GetExternal() []string {
if m != nil {
return m.External
}
return nil
}
func (m *CriuOpts) GetEmptyNs() uint32 {
if m != nil && m.EmptyNs != nil {
return *m.EmptyNs
}
return 0
}
func (m *CriuOpts) GetNoSeccomp() bool {
if m != nil && m.NoSeccomp != nil {
return *m.NoSeccomp
}
return false
}
type CriuDumpResp struct {
Restored *bool `protobuf:"varint,1,opt,name=restored" json:"restored,omitempty"`
XXX_unrecognized []byte `json:"-"`
@@ -576,6 +669,25 @@ func (m *CriuNotify) GetPid() int32 {
return 0
}
//
// List of features which can queried via
// CRIU_REQ_TYPE__FEATURE_CHECK
type CriuFeatures struct {
MemTrack *bool `protobuf:"varint,1,opt,name=mem_track" json:"mem_track,omitempty"`
XXX_unrecognized []byte `json:"-"`
}
func (m *CriuFeatures) Reset() { *m = CriuFeatures{} }
func (m *CriuFeatures) String() string { return proto.CompactTextString(m) }
func (*CriuFeatures) ProtoMessage() {}
func (m *CriuFeatures) GetMemTrack() bool {
if m != nil && m.MemTrack != nil {
return *m.MemTrack
}
return false
}
type CriuReq struct {
Type *CriuReqType `protobuf:"varint,1,req,name=type,enum=CriuReqType" json:"type,omitempty"`
Opts *CriuOpts `protobuf:"bytes,2,opt,name=opts" json:"opts,omitempty"`
@@ -584,8 +696,13 @@ type CriuReq struct {
// When set service won't close the connection but
// will wait for more req-s to appear. Works not
// for all request types.
KeepOpen *bool `protobuf:"varint,4,opt,name=keep_open" json:"keep_open,omitempty"`
XXX_unrecognized []byte `json:"-"`
KeepOpen *bool `protobuf:"varint,4,opt,name=keep_open" json:"keep_open,omitempty"`
//
// 'features' can be used to query which features
// are supported by the installed criu/kernel
// via RPC.
Features *CriuFeatures `protobuf:"bytes,5,opt,name=features" json:"features,omitempty"`
XXX_unrecognized []byte `json:"-"`
}
func (m *CriuReq) Reset() { *m = CriuReq{} }
@@ -620,6 +737,13 @@ func (m *CriuReq) GetKeepOpen() bool {
return false
}
func (m *CriuReq) GetFeatures() *CriuFeatures {
if m != nil {
return m.Features
}
return nil
}
type CriuResp struct {
Type *CriuReqType `protobuf:"varint,1,req,name=type,enum=CriuReqType" json:"type,omitempty"`
Success *bool `protobuf:"varint,2,req,name=success" json:"success,omitempty"`
@@ -628,6 +752,7 @@ type CriuResp struct {
Notify *CriuNotify `protobuf:"bytes,5,opt,name=notify" json:"notify,omitempty"`
Ps *CriuPageServerInfo `protobuf:"bytes,6,opt,name=ps" json:"ps,omitempty"`
CrErrno *int32 `protobuf:"varint,7,opt,name=cr_errno" json:"cr_errno,omitempty"`
Features *CriuFeatures `protobuf:"bytes,8,opt,name=features" json:"features,omitempty"`
XXX_unrecognized []byte `json:"-"`
}
@@ -684,6 +809,14 @@ func (m *CriuResp) GetCrErrno() int32 {
return 0
}
func (m *CriuResp) GetFeatures() *CriuFeatures {
if m != nil {
return m.Features
}
return nil
}
func init() {
proto.RegisterEnum("CriuCgMode", CriuCgMode_name, CriuCgMode_value)
proto.RegisterEnum("CriuReqType", CriuReqType_name, CriuReqType_value)
}

View File

@@ -29,6 +29,16 @@ message unix_sk {
required uint32 inode = 1;
};
enum criu_cg_mode {
IGNORE = 0;
NONE = 1;
PROPS = 2;
SOFT = 3;
FULL = 4;
STRICT = 5;
DEFAULT = 6;
};
message criu_opts {
required int32 images_dir_fd = 1;
optional int32 pid = 2; /* if not set on dump, will dump requesting process */
@@ -75,7 +85,12 @@ message criu_opts {
repeated unix_sk unix_sk_ino = 33;
optional uint32 manage_cgroups_mode = 34;
optional criu_cg_mode manage_cgroups_mode = 34;
optional uint32 ghost_limit = 35 [default = 0x100000];
repeated string irmap_scan_paths = 36;
repeated string external = 37;
optional uint32 empty_ns = 38;
optional bool no_seccomp = 39;
}
message criu_dump_resp {
@@ -103,6 +118,16 @@ enum criu_req_type {
CPUINFO_DUMP = 7;
CPUINFO_CHECK = 8;
FEATURE_CHECK = 9;
}
/*
* List of features which can queried via
* CRIU_REQ_TYPE__FEATURE_CHECK
*/
message criu_features {
optional bool mem_track = 1;
}
/*
@@ -122,11 +147,17 @@ message criu_req {
* for all request types.
*/
optional bool keep_open = 4;
/*
* 'features' can be used to query which features
* are supported by the installed criu/kernel
* via RPC.
*/
optional criu_features features = 5;
}
/*
* Responce -- it states whether the request was served
* and additional request-specific informarion
* Response -- it states whether the request was served
* and additional request-specific information
*/
message criu_resp {
@@ -139,4 +170,5 @@ message criu_resp {
optional criu_page_server_info ps = 6;
optional int32 cr_errno = 7;
optional criu_features features = 8;
}

View File

@@ -2,7 +2,7 @@ package libcontainer
import "io"
// API error code type.
// ErrorCode is the API error code type.
type ErrorCode int
// API error codes.
@@ -19,7 +19,7 @@ const (
ContainerNotPaused
// Process errors
ProcessNotExecuted
NoProcessOps
// Common errors
ConfigInvalid
@@ -49,12 +49,14 @@ func (c ErrorCode) String() string {
return "Console exists for process"
case ContainerNotPaused:
return "Container is not paused"
case NoProcessOps:
return "No process operations"
default:
return "Unknown error"
}
}
// API Error type.
// Error is the API error type.
type Error interface {
error

View File

@@ -9,6 +9,7 @@ import (
"os/exec"
"path/filepath"
"regexp"
"runtime/debug"
"strconv"
"syscall"
@@ -22,11 +23,12 @@ import (
)
const (
stateFilename = "state.json"
stateFilename = "state.json"
execFifoFilename = "exec.fifo"
)
var (
idRegex = regexp.MustCompile(`^[\w_-]+$`)
idRegex = regexp.MustCompile(`^[\w+-\.]+$`)
maxIdLen = 1024
)
@@ -157,13 +159,34 @@ func (l *LinuxFactory) Create(id string, config *configs.Config) (Container, err
if err := l.Validator.Validate(config); err != nil {
return nil, newGenericError(err, ConfigInvalid)
}
uid, err := config.HostUID()
if err != nil {
return nil, newGenericError(err, SystemError)
}
gid, err := config.HostGID()
if err != nil {
return nil, newGenericError(err, SystemError)
}
containerRoot := filepath.Join(l.Root, id)
if _, err := os.Stat(containerRoot); err == nil {
return nil, newGenericError(fmt.Errorf("container with id exists: %v", id), IdInUse)
} else if !os.IsNotExist(err) {
return nil, newGenericError(err, SystemError)
}
if err := os.MkdirAll(containerRoot, 0700); err != nil {
if err := os.MkdirAll(containerRoot, 0711); err != nil {
return nil, newGenericError(err, SystemError)
}
if err := os.Chown(containerRoot, uid, gid); err != nil {
return nil, newGenericError(err, SystemError)
}
fifoName := filepath.Join(containerRoot, execFifoFilename)
oldMask := syscall.Umask(0000)
if err := syscall.Mkfifo(fifoName, 0622); err != nil {
syscall.Umask(oldMask)
return nil, newGenericError(err, SystemError)
}
syscall.Umask(oldMask)
if err := os.Chown(fifoName, uid, gid); err != nil {
return nil, newGenericError(err, SystemError)
}
c := &linuxContainer{
@@ -194,16 +217,18 @@ func (l *LinuxFactory) Load(id string) (Container, error) {
fds: state.ExternalDescriptors,
}
c := &linuxContainer{
initProcess: r,
id: id,
config: &state.Config,
initPath: l.InitPath,
initArgs: l.InitArgs,
criuPath: l.CriuPath,
cgroupManager: l.NewCgroupsManager(state.Config.Cgroups, state.CgroupPaths),
root: containerRoot,
initProcess: r,
initProcessStartTime: state.InitProcessStartTime,
id: id,
config: &state.Config,
initPath: l.InitPath,
initArgs: l.InitArgs,
criuPath: l.CriuPath,
cgroupManager: l.NewCgroupsManager(state.Config.Cgroups, state.CgroupPaths),
root: containerRoot,
created: state.Created,
}
c.state = &createdState{c: c, s: Created}
c.state = &loadedState{c: c}
if err := c.refreshState(); err != nil {
return nil, err
}
@@ -217,10 +242,18 @@ func (l *LinuxFactory) Type() string {
// StartInitialization loads a container by opening the pipe fd from the parent to read the configuration and state
// This is a low level implementation detail of the reexec and should not be consumed externally
func (l *LinuxFactory) StartInitialization() (err error) {
fdStr := os.Getenv("_LIBCONTAINER_INITPIPE")
pipefd, err := strconv.Atoi(fdStr)
if err != nil {
return fmt.Errorf("error converting env var _LIBCONTAINER_INITPIPE(%q) to an int: %s", fdStr, err)
var pipefd, rootfd int
for k, v := range map[string]*int{
"_LIBCONTAINER_INITPIPE": &pipefd,
"_LIBCONTAINER_STATEDIR": &rootfd,
} {
s := os.Getenv(k)
i, err := strconv.Atoi(s)
if err != nil {
return fmt.Errorf("unable to convert %s=%s to int", k, s)
}
*v = i
}
var (
pipe = os.NewFile(uintptr(pipefd), "pipe")
@@ -229,29 +262,31 @@ func (l *LinuxFactory) StartInitialization() (err error) {
// clear the current process's environment to clean any libcontainer
// specific env vars.
os.Clearenv()
var i initer
defer func() {
// if we have an error during the initialization of the container's init then send it back to the
// parent process in the form of an initError.
if err != nil {
if _, ok := i.(*linuxStandardInit); ok {
// Synchronisation only necessary for standard init.
if err := utils.WriteJSON(pipe, syncT{procError}); err != nil {
panic(err)
}
}
if err := utils.WriteJSON(pipe, newSystemError(err)); err != nil {
panic(err)
}
} else {
if err := utils.WriteJSON(pipe, syncT{procStart}); err != nil {
// We have an error during the initialization of the container's init,
// send it back to the parent process in the form of an initError.
// If container's init successed, syscall.Exec will not return, hence
// this defer function will never be called.
if _, ok := i.(*linuxStandardInit); ok {
// Synchronisation only necessary for standard init.
if werr := utils.WriteJSON(pipe, syncT{procError}); werr != nil {
panic(err)
}
}
if werr := utils.WriteJSON(pipe, newSystemError(err)); werr != nil {
panic(err)
}
// ensure that this pipe is always closed
pipe.Close()
}()
i, err = newContainerInit(it, pipe)
defer func() {
if e := recover(); e != nil {
err = fmt.Errorf("panic from initialization: %v, %v", e, string(debug.Stack()))
}
}()
i, err = newContainerInit(it, pipe, rootfd)
if err != nil {
return err
}

View File

@@ -14,8 +14,9 @@ type syncType uint8
const (
procReady syncType = iota
procError
procStart
procRun
procHooks
procResume
)
type syncT struct {
@@ -51,6 +52,21 @@ func newGenericError(err error, c ErrorCode) Error {
}
func newSystemError(err error) Error {
return createSystemError(err, "")
}
func newSystemErrorWithCausef(err error, cause string, v ...interface{}) Error {
return createSystemError(err, fmt.Sprintf(cause, v...))
}
func newSystemErrorWithCause(err error, cause string) Error {
return createSystemError(err, cause)
}
// createSystemError creates the specified error with the correct number of
// stack frames skipped. This is only to be called by the other functions for
// formatting the error.
func createSystemError(err error, cause string) Error {
if le, ok := err.(Error); ok {
return le
}
@@ -58,7 +74,8 @@ func newSystemError(err error) Error {
Timestamp: time.Now(),
Err: err,
ECode: SystemError,
Stack: stacktrace.Capture(1),
Cause: cause,
Stack: stacktrace.Capture(2),
}
if err != nil {
gerr.Message = err.Error()
@@ -70,12 +87,17 @@ type genericError struct {
Timestamp time.Time
ECode ErrorCode
Err error `json:"-"`
Cause string
Message string
Stack stacktrace.Stacktrace
}
func (e *genericError) Error() string {
return fmt.Sprintf("[%d] %s: %s", e.ECode, e.ECode, e.Message)
if e.Cause == "" {
return e.Message
}
frame := e.Stack.Frames[0]
return fmt.Sprintf("%s:%d: %s caused %q", frame.File, frame.Line, e.Cause, e.Message)
}
func (e *genericError) Code() ErrorCode {

View File

@@ -44,22 +44,28 @@ type network struct {
// initConfig is used for transferring parameters from Exec() to Init()
type initConfig struct {
Args []string `json:"args"`
Env []string `json:"env"`
Cwd string `json:"cwd"`
Capabilities []string `json:"capabilities"`
User string `json:"user"`
Config *configs.Config `json:"config"`
Console string `json:"console"`
Networks []*network `json:"network"`
PassedFilesCount int `json:"passed_files_count"`
Args []string `json:"args"`
Env []string `json:"env"`
Cwd string `json:"cwd"`
Capabilities []string `json:"capabilities"`
ProcessLabel string `json:"process_label"`
AppArmorProfile string `json:"apparmor_profile"`
NoNewPrivileges bool `json:"no_new_privileges"`
User string `json:"user"`
Config *configs.Config `json:"config"`
Console string `json:"console"`
Networks []*network `json:"network"`
PassedFilesCount int `json:"passed_files_count"`
ContainerId string `json:"containerid"`
Rlimits []configs.Rlimit `json:"rlimits"`
ExecFifoPath string `json:"start_pipe_path"`
}
type initer interface {
Init() error
}
func newContainerInit(t initType, pipe *os.File) (initer, error) {
func newContainerInit(t initType, pipe *os.File, stateDirFD int) (initer, error) {
var config *initConfig
if err := json.NewDecoder(pipe).Decode(&config); err != nil {
return nil, err
@@ -74,9 +80,10 @@ func newContainerInit(t initType, pipe *os.File) (initer, error) {
}, nil
case initStandard:
return &linuxStandardInit{
pipe: pipe,
parentPid: syscall.Getppid(),
config: config,
pipe: pipe,
parentPid: syscall.Getppid(),
config: config,
stateDirFD: stateDirFD,
}, nil
}
return nil, fmt.Errorf("unknown init type %q", t)
@@ -163,20 +170,22 @@ func syncParentReady(pipe io.ReadWriter) error {
return nil
}
// joinExistingNamespaces gets all the namespace paths specified for the container and
// does a setns on the namespace fd so that the current process joins the namespace.
func joinExistingNamespaces(namespaces []configs.Namespace) error {
for _, ns := range namespaces {
if ns.Path != "" {
f, err := os.OpenFile(ns.Path, os.O_RDONLY, 0)
if err != nil {
return err
}
err = system.Setns(f.Fd(), uintptr(ns.Syscall()))
f.Close()
if err != nil {
return err
}
// syncParentHooks sends to the given pipe a JSON payload which indicates that
// the parent should execute pre-start hooks. It then waits for the parent to
// indicate that it is cleared to resume.
func syncParentHooks(pipe io.ReadWriter) error {
// Tell parent.
if err := utils.WriteJSON(pipe, syncT{procHooks}); err != nil {
return err
}
// Wait for parent to give the all-clear.
var procSync syncT
if err := json.NewDecoder(pipe).Decode(&procSync); err != nil {
if err == io.EOF {
return fmt.Errorf("parent closed synchronisation channel")
}
if procSync.Type != procResume {
return fmt.Errorf("invalid synchronisation flag from parent")
}
}
return nil
@@ -309,19 +318,19 @@ func setupRoute(config *configs.Config) error {
return nil
}
func setupRlimits(config *configs.Config) error {
for _, rlimit := range config.Rlimits {
l := &syscall.Rlimit{Max: rlimit.Hard, Cur: rlimit.Soft}
if err := syscall.Setrlimit(rlimit.Type, l); err != nil {
func setupRlimits(limits []configs.Rlimit, pid int) error {
for _, rlimit := range limits {
if err := system.Prlimit(pid, rlimit.Type, syscall.Rlimit{Max: rlimit.Hard, Cur: rlimit.Soft}); err != nil {
return fmt.Errorf("error setting rlimit type %v: %v", rlimit.Type, err)
}
}
return nil
}
func setOomScoreAdj(oomScoreAdj int) error {
path := "/proc/self/oom_score_adj"
return ioutil.WriteFile(path, []byte(strconv.Itoa(oomScoreAdj)), 0700)
func setOomScoreAdj(oomScoreAdj int, pid int) error {
path := fmt.Sprintf("/proc/%d/oom_score_adj", pid)
return ioutil.WriteFile(path, []byte(strconv.Itoa(oomScoreAdj)), 0600)
}
// killCgroupProcesses freezes then iterates over all the processes inside the
@@ -338,11 +347,14 @@ func killCgroupProcesses(m cgroups.Manager) error {
return err
}
for _, pid := range pids {
if p, err := os.FindProcess(pid); err == nil {
procs = append(procs, p)
if err := p.Kill(); err != nil {
logrus.Warn(err)
}
p, err := os.FindProcess(pid)
if err != nil {
logrus.Warn(err)
continue
}
procs = append(procs, p)
if err := p.Kill(); err != nil {
logrus.Warn(err)
}
}
if err := m.Freeze(configs.Thawed); err != nil {

View File

@@ -0,0 +1,66 @@
// +build linux
package keyctl
import (
"fmt"
"strconv"
"strings"
"syscall"
"unsafe"
)
const KEYCTL_JOIN_SESSION_KEYRING = 1
const KEYCTL_SETPERM = 5
const KEYCTL_DESCRIBE = 6
type KeySerial uint32
func JoinSessionKeyring(name string) (KeySerial, error) {
var _name *byte
var err error
if len(name) > 0 {
_name, err = syscall.BytePtrFromString(name)
if err != nil {
return KeySerial(0), err
}
}
sessKeyId, _, errn := syscall.Syscall(syscall.SYS_KEYCTL, KEYCTL_JOIN_SESSION_KEYRING, uintptr(unsafe.Pointer(_name)), 0)
if errn != 0 {
return 0, fmt.Errorf("could not create session key: %v", errn)
}
return KeySerial(sessKeyId), nil
}
// ModKeyringPerm modifies permissions on a keyring by reading the current permissions,
// anding the bits with the given mask (clearing permissions) and setting
// additional permission bits
func ModKeyringPerm(ringId KeySerial, mask, setbits uint32) error {
dest := make([]byte, 1024)
destBytes := unsafe.Pointer(&dest[0])
if _, _, err := syscall.Syscall6(syscall.SYS_KEYCTL, uintptr(KEYCTL_DESCRIBE), uintptr(ringId), uintptr(destBytes), uintptr(len(dest)), 0, 0); err != 0 {
return err
}
res := strings.Split(string(dest), ";")
if len(res) < 5 {
return fmt.Errorf("Destination buffer for key description is too small")
}
// parse permissions
perm64, err := strconv.ParseUint(res[3], 16, 32)
if err != nil {
return err
}
perm := (uint32(perm64) & mask) | setbits
if _, _, err := syscall.Syscall(syscall.SYS_KEYCTL, uintptr(KEYCTL_SETPERM), uintptr(ringId), uintptr(perm)); err != 0 {
return err
}
return nil
}

View File

@@ -21,6 +21,10 @@ func SetProcessLabel(processLabel string) error {
return nil
}
func GetFileLabel(path string) (string, error) {
return "", nil
}
func SetFileLabel(path string, fileLabel string) error {
return nil
}
@@ -48,7 +52,7 @@ func UnreserveLabel(label string) error {
return nil
}
// DupSecOpt takes an process label and returns security options that
// DupSecOpt takes a process label and returns security options that
// can be used to set duplicate labels on future container processes
func DupSecOpt(src string) []string {
return nil

View File

@@ -94,6 +94,11 @@ func GetProcessLabel() (string, error) {
return selinux.Getexeccon()
}
// GetFileLabel returns the label for specified path
func GetFileLabel(path string) (string, error) {
return selinux.Getfilecon(path)
}
// SetFileLabel modifies the "path" label to the specified file label
func SetFileLabel(path string, fileLabel string) error {
if selinux.SelinuxEnabled() && fileLabel != "" {
@@ -102,7 +107,7 @@ func SetFileLabel(path string, fileLabel string) error {
return nil
}
// Tell the kernel the label for all files to be created
// SetFileCreateLabel tells the kernel the label for all files to be created
func SetFileCreateLabel(fileLabel string) error {
if selinux.SelinuxEnabled() {
return selinux.Setfscreatecon(fileLabel)
@@ -110,7 +115,7 @@ func SetFileCreateLabel(fileLabel string) error {
return nil
}
// Change the label of path to the filelabel string.
// Relabel changes the label of path to the filelabel string.
// It changes the MCS label to s0 if shared is true.
// This will allow all containers to share the content.
func Relabel(path string, fileLabel string, shared bool) error {

View File

@@ -12,8 +12,12 @@ import (
// The number is randomly chosen to not conflict with known netlink types
const (
InitMsg uint16 = 62000
PidAttr uint16 = 27281
CloneFlagsAttr uint16 = 27281
ConsolePathAttr uint16 = 27282
NsPathsAttr uint16 = 27283
UidmapAttr uint16 = 27284
GidmapAttr uint16 = 27285
SetgroupAttr uint16 = 27286
// When syscall.NLA_HDRLEN is in gccgo, take this out.
syscall_NLA_HDRLEN = (syscall.SizeofNlAttr + syscall.NLA_ALIGNTO - 1) & ^(syscall.NLA_ALIGNTO - 1)
)
@@ -23,7 +27,8 @@ type Int32msg struct {
Value uint32
}
// int32msg has the following representation
// Serialize serializes the message.
// Int32msg has the following representation
// | nlattr len | nlattr type |
// | uint32 value |
func (msg *Int32msg) Serialize() []byte {
@@ -39,7 +44,7 @@ func (msg *Int32msg) Len() int {
return syscall_NLA_HDRLEN + 4
}
// bytemsg has the following representation
// Bytemsg has the following representation
// | nlattr len | nlattr type |
// | value | pad |
type Bytemsg struct {
@@ -60,3 +65,25 @@ func (msg *Bytemsg) Serialize() []byte {
func (msg *Bytemsg) Len() int {
return syscall_NLA_HDRLEN + len(msg.Value) + 1 // null-terminated
}
type Boolmsg struct {
Type uint16
Value bool
}
func (msg *Boolmsg) Serialize() []byte {
buf := make([]byte, msg.Len())
native := nl.NativeEndian()
native.PutUint16(buf[0:2], uint16(msg.Len()))
native.PutUint16(buf[2:4], msg.Type)
if msg.Value {
buf[4] = 1
} else {
buf[4] = 0
}
return buf
}
func (msg *Boolmsg) Len() int {
return syscall_NLA_HDRLEN + 1
}

View File

@@ -5,6 +5,8 @@ import (
"io"
"math"
"os"
"github.com/opencontainers/runc/libcontainer/configs"
)
type processOperations interface {
@@ -48,6 +50,20 @@ type Process struct {
// All capabilities not specified will be dropped from the processes capability mask
Capabilities []string
// AppArmorProfile specifies the profile to apply to the process and is
// changed at the time the process is execed
AppArmorProfile string
// Label specifies the label to apply to the process. It is commonly used by selinux
Label string
// NoNewPrivileges controls whether processes can gain additional privileges.
NoNewPrivileges *bool
// Rlimits specifies the resource limits, such as max open files, to set in the container
// If Rlimits are not set, the container will inherit rlimits from the parent process
Rlimits []configs.Rlimit
ops processOperations
}
@@ -55,7 +71,7 @@ type Process struct {
// Wait releases any resources associated with the Process
func (p Process) Wait() (*os.ProcessState, error) {
if p.ops == nil {
return nil, newGenericError(fmt.Errorf("invalid process"), ProcessNotExecuted)
return nil, newGenericError(fmt.Errorf("invalid process"), NoProcessOps)
}
return p.ops.wait()
}
@@ -65,7 +81,7 @@ func (p Process) Pid() (int, error) {
// math.MinInt32 is returned here, because it's invalid value
// for the kill() system call.
if p.ops == nil {
return math.MinInt32, newGenericError(fmt.Errorf("invalid process"), ProcessNotExecuted)
return math.MinInt32, newGenericError(fmt.Errorf("invalid process"), NoProcessOps)
}
return p.ops.pid(), nil
}
@@ -73,7 +89,7 @@ func (p Process) Pid() (int, error) {
// Signal sends a signal to the Process.
func (p Process) Signal(sig os.Signal) error {
if p.ops == nil {
return newGenericError(fmt.Errorf("invalid process"), ProcessNotExecuted)
return newGenericError(fmt.Errorf("invalid process"), NoProcessOps)
}
return p.ops.signal(sig)
}
@@ -86,8 +102,8 @@ type IO struct {
}
// NewConsole creates new console for process and returns it
func (p *Process) NewConsole(rootuid int) (Console, error) {
console, err := NewConsole(rootuid, rootuid)
func (p *Process) NewConsole(rootuid, rootgid int) (Console, error) {
console, err := NewConsole(rootuid, rootgid)
if err != nil {
return nil, err
}

View File

@@ -51,6 +51,7 @@ type setnsProcess struct {
fds []string
process *Process
bootstrapData io.Reader
rootDir *os.File
}
func (p *setnsProcess) startTime() (string, error) {
@@ -69,39 +70,49 @@ func (p *setnsProcess) start() (err error) {
defer p.parentPipe.Close()
err = p.cmd.Start()
p.childPipe.Close()
p.rootDir.Close()
if err != nil {
return newSystemError(err)
return newSystemErrorWithCause(err, "starting setns process")
}
if p.bootstrapData != nil {
if _, err := io.Copy(p.parentPipe, p.bootstrapData); err != nil {
return newSystemError(err)
return newSystemErrorWithCause(err, "copying bootstrap data to pipe")
}
}
if err = p.execSetns(); err != nil {
return newSystemError(err)
return newSystemErrorWithCause(err, "executing setns process")
}
if len(p.cgroupPaths) > 0 {
if err := cgroups.EnterPid(p.cgroupPaths, p.pid()); err != nil {
return newSystemError(err)
return newSystemErrorWithCausef(err, "adding pid %d to cgroups", p.pid())
}
}
// set oom_score_adj
if err := setOomScoreAdj(p.config.Config.OomScoreAdj, p.pid()); err != nil {
return newSystemErrorWithCause(err, "setting oom score")
}
// set rlimits, this has to be done here because we lose permissions
// to raise the limits once we enter a user-namespace
if err := setupRlimits(p.config.Rlimits, p.pid()); err != nil {
return newSystemErrorWithCause(err, "setting rlimits for process")
}
if err := utils.WriteJSON(p.parentPipe, p.config); err != nil {
return newSystemError(err)
return newSystemErrorWithCause(err, "writing config to pipe")
}
if err := syscall.Shutdown(int(p.parentPipe.Fd()), syscall.SHUT_WR); err != nil {
return newSystemError(err)
return newSystemErrorWithCause(err, "calling shutdown on init pipe")
}
// wait for the child process to fully complete and receive an error message
// if one was encoutered
var ierr *genericError
if err := json.NewDecoder(p.parentPipe).Decode(&ierr); err != nil && err != io.EOF {
return newSystemError(err)
return newSystemErrorWithCause(err, "decoding init error from pipe")
}
// Must be done after Shutdown so the child will exit and we can wait for it.
if ierr != nil {
p.wait()
return newSystemError(ierr)
return ierr
}
return nil
}
@@ -114,7 +125,7 @@ func (p *setnsProcess) execSetns() error {
status, err := p.cmd.Process.Wait()
if err != nil {
p.cmd.Wait()
return newSystemError(err)
return newSystemErrorWithCause(err, "waiting on setns process to finish")
}
if !status.Success() {
p.cmd.Wait()
@@ -123,7 +134,7 @@ func (p *setnsProcess) execSetns() error {
var pid *pid
if err := json.NewDecoder(p.parentPipe).Decode(&pid); err != nil {
p.cmd.Wait()
return newSystemError(err)
return newSystemErrorWithCause(err, "reading pid from init pipe")
}
process, err := os.FindProcess(pid.Pid)
if err != nil {
@@ -167,14 +178,17 @@ func (p *setnsProcess) setExternalDescriptors(newFds []string) {
}
type initProcess struct {
cmd *exec.Cmd
parentPipe *os.File
childPipe *os.File
config *initConfig
manager cgroups.Manager
container *linuxContainer
fds []string
process *Process
cmd *exec.Cmd
parentPipe *os.File
childPipe *os.File
config *initConfig
manager cgroups.Manager
container *linuxContainer
fds []string
process *Process
bootstrapData io.Reader
sharePidns bool
rootDir *os.File
}
func (p *initProcess) pid() int {
@@ -185,27 +199,63 @@ func (p *initProcess) externalDescriptors() []string {
return p.fds
}
func (p *initProcess) start() (err error) {
// execSetns runs the process that executes C code to perform the setns calls
// because setns support requires the C process to fork off a child and perform the setns
// before the go runtime boots, we wait on the process to die and receive the child's pid
// over the provided pipe.
// This is called by initProcess.start function
func (p *initProcess) execSetns() error {
status, err := p.cmd.Process.Wait()
if err != nil {
p.cmd.Wait()
return err
}
if !status.Success() {
p.cmd.Wait()
return &exec.ExitError{ProcessState: status}
}
var pid *pid
if err := json.NewDecoder(p.parentPipe).Decode(&pid); err != nil {
p.cmd.Wait()
return err
}
process, err := os.FindProcess(pid.Pid)
if err != nil {
return err
}
p.cmd.Process = process
p.process.ops = p
return nil
}
func (p *initProcess) start() error {
defer p.parentPipe.Close()
err = p.cmd.Start()
err := p.cmd.Start()
p.process.ops = p
p.childPipe.Close()
p.rootDir.Close()
if err != nil {
p.process.ops = nil
return newSystemError(err)
return newSystemErrorWithCause(err, "starting init process command")
}
if _, err := io.Copy(p.parentPipe, p.bootstrapData); err != nil {
return err
}
if err := p.execSetns(); err != nil {
return newSystemErrorWithCause(err, "running exec setns process for init")
}
// Save the standard descriptor names before the container process
// can potentially move them (e.g., via dup2()). If we don't do this now,
// we won't know at checkpoint time which file descriptor to look up.
fds, err := getPipeFds(p.pid())
if err != nil {
return newSystemError(err)
return newSystemErrorWithCausef(err, "getting pipe fds for pid %d", p.pid())
}
p.setExternalDescriptors(fds)
// Do this before syncing with child so that no children
// can escape the cgroup
if err := p.manager.Apply(p.pid()); err != nil {
return newSystemError(err)
return newSystemErrorWithCause(err, "applying cgroup configuration for process")
}
defer func() {
if err != nil {
@@ -213,56 +263,88 @@ func (p *initProcess) start() (err error) {
p.manager.Destroy()
}
}()
if p.config.Config.Hooks != nil {
s := configs.HookState{
Version: p.container.config.Version,
ID: p.container.id,
Pid: p.pid(),
Root: p.config.Config.Rootfs,
}
for _, hook := range p.config.Config.Hooks.Prestart {
if err := hook.Run(s); err != nil {
return newSystemError(err)
}
}
}
if err := p.createNetworkInterfaces(); err != nil {
return newSystemError(err)
return newSystemErrorWithCause(err, "creating nework interfaces")
}
if err := p.sendConfig(); err != nil {
return newSystemError(err)
return newSystemErrorWithCause(err, "sending config to init process")
}
var (
procSync syncT
sentRun bool
ierr *genericError
procSync syncT
sentRun bool
sentResume bool
ierr *genericError
)
dec := json.NewDecoder(p.parentPipe)
loop:
for {
if err := json.NewDecoder(p.parentPipe).Decode(&procSync); err != nil {
if err := dec.Decode(&procSync); err != nil {
if err == io.EOF {
break loop
}
return newSystemError(err)
return newSystemErrorWithCause(err, "decoding sync type from init pipe")
}
switch procSync.Type {
case procStart:
break loop
case procReady:
if err := p.manager.Set(p.config.Config); err != nil {
return newSystemError(err)
return newSystemErrorWithCause(err, "setting cgroup config for ready process")
}
// set oom_score_adj
if err := setOomScoreAdj(p.config.Config.OomScoreAdj, p.pid()); err != nil {
return newSystemErrorWithCause(err, "setting oom score for ready process")
}
// set rlimits, this has to be done here because we lose permissions
// to raise the limits once we enter a user-namespace
if err := setupRlimits(p.config.Rlimits, p.pid()); err != nil {
return newSystemErrorWithCause(err, "setting rlimits for ready process")
}
// call prestart hooks
if !p.config.Config.Namespaces.Contains(configs.NEWNS) {
if p.config.Config.Hooks != nil {
s := configs.HookState{
Version: p.container.config.Version,
ID: p.container.id,
Pid: p.pid(),
Root: p.config.Config.Rootfs,
}
for i, hook := range p.config.Config.Hooks.Prestart {
if err := hook.Run(s); err != nil {
return newSystemErrorWithCausef(err, "running prestart hook %d", i)
}
}
}
}
// Sync with child.
if err := utils.WriteJSON(p.parentPipe, syncT{procRun}); err != nil {
return newSystemError(err)
return newSystemErrorWithCause(err, "reading syncT run type")
}
sentRun = true
case procHooks:
if p.config.Config.Hooks != nil {
s := configs.HookState{
Version: p.container.config.Version,
ID: p.container.id,
Pid: p.pid(),
Root: p.config.Config.Rootfs,
BundlePath: utils.SearchLabels(p.config.Config.Labels, "bundle"),
}
for i, hook := range p.config.Config.Hooks.Prestart {
if err := hook.Run(s); err != nil {
return newSystemErrorWithCausef(err, "running prestart hook %d", i)
}
}
}
// Sync with child.
if err := utils.WriteJSON(p.parentPipe, syncT{procResume}); err != nil {
return newSystemErrorWithCause(err, "reading syncT resume type")
}
sentResume = true
case procError:
// wait for the child process to fully complete and receive an error message
// if one was encoutered
if err := json.NewDecoder(p.parentPipe).Decode(&ierr); err != nil && err != io.EOF {
return newSystemError(err)
if err := dec.Decode(&ierr); err != nil && err != io.EOF {
return newSystemErrorWithCause(err, "decoding proc error from init")
}
if ierr != nil {
break loop
@@ -270,19 +352,22 @@ loop:
// Programmer error.
panic("No error following JSON procError payload.")
default:
return newSystemError(fmt.Errorf("invalid JSON synchronisation payload from child"))
return newSystemError(fmt.Errorf("invalid JSON payload from child"))
}
}
if !sentRun {
return newSystemError(fmt.Errorf("could not synchronise with container process"))
return newSystemErrorWithCause(ierr, "container init failed")
}
if p.config.Config.Namespaces.Contains(configs.NEWNS) && !sentResume {
return newSystemError(fmt.Errorf("could not synchronise after executing prestart hooks with container process"))
}
if err := syscall.Shutdown(int(p.parentPipe.Fd()), syscall.SHUT_WR); err != nil {
return newSystemError(err)
return newSystemErrorWithCause(err, "shutting down init pipe")
}
// Must be done after Shutdown so the child will exit and we can wait for it.
if ierr != nil {
p.wait()
return newSystemError(ierr)
return ierr
}
return nil
}
@@ -293,7 +378,7 @@ func (p *initProcess) wait() (*os.ProcessState, error) {
return p.cmd.ProcessState, err
}
// we should kill all processes in cgroup when init is died if we use host PID namespace
if p.cmd.SysProcAttr.Cloneflags&syscall.CLONE_NEWPID == 0 {
if p.sharePidns {
killCgroupProcesses(p.manager)
}
return p.cmd.ProcessState, nil
@@ -315,7 +400,9 @@ func (p *initProcess) startTime() (string, error) {
}
func (p *initProcess) sendConfig() error {
// send the state to the container's init process then shutdown writes for the parent
// send the config to the container's init process, we don't use JSON Encode
// here because there might be a problem in JSON decoder in some cases, see:
// https://github.com/docker/docker/issues/14203#issuecomment-174177790
return utils.WriteJSON(p.parentPipe, p.config)
}
@@ -365,7 +452,7 @@ func getPipeFds(pid int) ([]string, error) {
// InitializeIO creates pipes for use with the process's STDIO
// and returns the opposite side for each
func (p *Process) InitializeIO(rootuid int) (i *IO, err error) {
func (p *Process) InitializeIO(rootuid, rootgid int) (i *IO, err error) {
var fds []uintptr
i = &IO{}
// cleanup in case of an error
@@ -397,7 +484,7 @@ func (p *Process) InitializeIO(rootuid int) (i *IO, err error) {
p.Stderr, i.Stderr = w, r
// change ownership of the pipes incase we are in a user namespace
for _, fd := range fds {
if err := syscall.Fchown(int(fd), rootuid, rootuid); err != nil {
if err := syscall.Fchown(int(fd), rootuid, rootgid); err != nil {
return nil, err
}
}

View File

@@ -4,6 +4,7 @@ package libcontainer
import (
"fmt"
"io"
"io/ioutil"
"os"
"os/exec"
@@ -19,47 +20,65 @@ import (
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/label"
"github.com/opencontainers/runc/libcontainer/system"
libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils"
)
const defaultMountFlags = syscall.MS_NOEXEC | syscall.MS_NOSUID | syscall.MS_NODEV
// needsSetupDev returns true if /dev needs to be set up.
func needsSetupDev(config *configs.Config) bool {
for _, m := range config.Mounts {
if m.Device == "bind" && libcontainerUtils.CleanPath(m.Destination) == "/dev" {
return false
}
}
return true
}
// setupRootfs sets up the devices, mount points, and filesystems for use inside a
// new mount namespace.
func setupRootfs(config *configs.Config, console *linuxConsole) (err error) {
func setupRootfs(config *configs.Config, console *linuxConsole, pipe io.ReadWriter) (err error) {
if err := prepareRoot(config); err != nil {
return newSystemError(err)
return newSystemErrorWithCause(err, "preparing rootfs")
}
setupDev := len(config.Devices) != 0
setupDev := needsSetupDev(config)
for _, m := range config.Mounts {
for _, precmd := range m.PremountCmds {
if err := mountCmd(precmd); err != nil {
return newSystemError(err)
return newSystemErrorWithCause(err, "running premount command")
}
}
if err := mountToRootfs(m, config.Rootfs, config.MountLabel); err != nil {
return newSystemError(err)
return newSystemErrorWithCausef(err, "mounting %q to rootfs %q", m.Destination, config.Rootfs)
}
for _, postcmd := range m.PostmountCmds {
if err := mountCmd(postcmd); err != nil {
return newSystemError(err)
return newSystemErrorWithCause(err, "running postmount command")
}
}
}
if setupDev {
if err := createDevices(config); err != nil {
return newSystemError(err)
return newSystemErrorWithCause(err, "creating device nodes")
}
if err := setupPtmx(config, console); err != nil {
return newSystemError(err)
return newSystemErrorWithCause(err, "setting up ptmx")
}
if err := setupDevSymlinks(config.Rootfs); err != nil {
return newSystemError(err)
return newSystemErrorWithCause(err, "setting up /dev symlinks")
}
}
// Signal the parent to run the pre-start hooks.
// The hooks are run after the mounts are setup, but before we switch to the new
// root, so that the old root is still available in the hooks for any mount
// manipulations.
if err := syncParentHooks(pipe); err != nil {
return err
}
if err := syscall.Chdir(config.Rootfs); err != nil {
return newSystemError(err)
return newSystemErrorWithCausef(err, "changing dir to %q", config.Rootfs)
}
if config.NoPivotRoot {
err = msMoveRoot(config.Rootfs)
@@ -67,16 +86,28 @@ func setupRootfs(config *configs.Config, console *linuxConsole) (err error) {
err = pivotRoot(config.Rootfs, config.PivotDir)
}
if err != nil {
return newSystemError(err)
return newSystemErrorWithCause(err, "jailing process inside rootfs")
}
if setupDev {
if err := reOpenDevNull(); err != nil {
return newSystemError(err)
return newSystemErrorWithCause(err, "reopening /dev/null inside container")
}
}
// remount dev as ro if specifed
for _, m := range config.Mounts {
if libcontainerUtils.CleanPath(m.Destination) == "/dev" {
if m.Flags&syscall.MS_RDONLY != 0 {
if err := remountReadonly(m.Destination); err != nil {
return newSystemErrorWithCausef(err, "remounting %q as readonly", m.Destination)
}
}
break
}
}
// set rootfs ( / ) as readonly
if config.Readonlyfs {
if err := setReadonly(); err != nil {
return newSystemError(err)
return newSystemErrorWithCause(err, "setting rootfs as readonly")
}
}
syscall.Umask(0022)
@@ -84,14 +115,12 @@ func setupRootfs(config *configs.Config, console *linuxConsole) (err error) {
}
func mountCmd(cmd configs.Command) error {
command := exec.Command(cmd.Path, cmd.Args[:]...)
command.Env = cmd.Env
command.Dir = cmd.Dir
if out, err := command.CombinedOutput(); err != nil {
return fmt.Errorf("%#v failed: %s: %v", cmd, string(out), err)
}
return nil
}
@@ -119,8 +148,9 @@ func mountToRootfs(m *configs.Mount, rootfs, mountLabel string) error {
if err := mountPropagate(m, rootfs, ""); err != nil {
return err
}
return label.SetFileLabel(dest, mountLabel)
}
return label.SetFileLabel(dest, mountLabel)
return nil
case "tmpfs":
stat, err := os.Stat(dest)
if err != nil {
@@ -137,16 +167,6 @@ func mountToRootfs(m *configs.Mount, rootfs, mountLabel string) error {
}
}
return nil
case "devpts":
if err := os.MkdirAll(dest, 0755); err != nil {
return err
}
return mountPropagate(m, rootfs, mountLabel)
case "securityfs":
if err := os.MkdirAll(dest, 0755); err != nil {
return err
}
return mountPropagate(m, rootfs, mountLabel)
case "bind":
stat, err := os.Stat(m.Source)
if err != nil {
@@ -218,41 +238,33 @@ func mountToRootfs(m *configs.Mount, rootfs, mountLabel string) error {
return err
}
}
// create symlinks for merged cgroups
cwd, err := os.Getwd()
if err != nil {
return err
}
if err := os.Chdir(filepath.Join(rootfs, m.Destination)); err != nil {
return err
}
for _, mc := range merged {
for _, ss := range strings.Split(mc, ",") {
if err := os.Symlink(mc, ss); err != nil {
// if cgroup already exists, then okay(it could have been created before)
if os.IsExist(err) {
continue
}
os.Chdir(cwd)
// symlink(2) is very dumb, it will just shove the path into
// the link and doesn't do any checks or relative path
// conversion. Also, don't error out if the cgroup already exists.
if err := os.Symlink(mc, filepath.Join(rootfs, m.Destination, ss)); err != nil && !os.IsExist(err) {
return err
}
}
}
if err := os.Chdir(cwd); err != nil {
return err
}
if m.Flags&syscall.MS_RDONLY != 0 {
// remount cgroup root as readonly
mcgrouproot := &configs.Mount{
Source: m.Destination,
Device: "bind",
Destination: m.Destination,
Flags: defaultMountFlags | syscall.MS_RDONLY,
Flags: defaultMountFlags | syscall.MS_RDONLY | syscall.MS_BIND,
}
if err := remount(mcgrouproot, rootfs); err != nil {
return err
}
}
default:
return fmt.Errorf("unknown mount device %q to %q", m.Device, m.Destination)
if err := os.MkdirAll(dest, 0755); err != nil {
return err
}
return mountPropagate(m, rootfs, mountLabel)
}
return nil
}
@@ -294,7 +306,7 @@ func getCgroupMounts(m *configs.Mount) ([]*configs.Mount, error) {
// checkMountDestination checks to ensure that the mount destination is not over the top of /proc.
// dest is required to be an abs path and have any symlinks resolved before calling this function.
func checkMountDestination(rootfs, dest string) error {
if filepath.Clean(rootfs) == filepath.Clean(dest) {
if libcontainerUtils.CleanPath(rootfs) == libcontainerUtils.CleanPath(dest) {
return fmt.Errorf("mounting into / is prohibited")
}
invalidDestinations := []string{
@@ -307,7 +319,8 @@ func checkMountDestination(rootfs, dest string) error {
"/proc/cpuinfo",
"/proc/diskstats",
"/proc/meminfo",
"/proc/stats",
"/proc/stat",
"/proc/net/dev",
}
for _, valid := range validDestinations {
path, err := filepath.Rel(filepath.Join(rootfs, valid), dest)
@@ -340,7 +353,7 @@ func setupDevSymlinks(rootfs string) error {
// kcore support can be toggled with CONFIG_PROC_KCORE; only create a symlink
// in /dev if it exists in /proc.
if _, err := os.Stat("/proc/kcore"); err == nil {
links = append(links, [2]string{"/proc/kcore", "/dev/kcore"})
links = append(links, [2]string{"/proc/kcore", "/dev/core"})
}
for _, link := range links {
var (
@@ -489,10 +502,10 @@ func getParentMount(rootfs string) (string, string, error) {
}
// Make parent mount private if it was shared
func rootfsParentMountPrivate(config *configs.Config) error {
func rootfsParentMountPrivate(rootfs string) error {
sharedMount := false
parentMount, optionalOpts, err := getParentMount(config.Rootfs)
parentMount, optionalOpts, err := getParentMount(rootfs)
if err != nil {
return err
}
@@ -524,9 +537,10 @@ func prepareRoot(config *configs.Config) error {
if err := syscall.Mount("", "/", "", uintptr(flag), ""); err != nil {
return err
}
if err := rootfsParentMountPrivate(config); err != nil {
return err
if config.NoPivotRoot {
if err := rootfsParentMountPrivate(config.Rootfs); err != nil {
return err
}
}
return syscall.Mount(config.Rootfs, config.Rootfs, "bind", syscall.MS_BIND|syscall.MS_REC, "")
@@ -550,7 +564,7 @@ func setupPtmx(config *configs.Config, console *linuxConsole) error {
return nil
}
func pivotRoot(rootfs, pivotBaseDir string) error {
func pivotRoot(rootfs, pivotBaseDir string) (err error) {
if pivotBaseDir == "" {
pivotBaseDir = "/"
}
@@ -562,8 +576,21 @@ func pivotRoot(rootfs, pivotBaseDir string) error {
if err != nil {
return fmt.Errorf("can't create pivot_root dir %s, error %v", pivotDir, err)
}
defer func() {
errVal := os.Remove(pivotDir)
if err == nil {
err = errVal
}
}()
if err := syscall.PivotRoot(rootfs, pivotDir); err != nil {
return fmt.Errorf("pivot_root %s", err)
// Make the parent mount private
if err := rootfsParentMountPrivate(rootfs); err != nil {
return err
}
// Try again
if err := syscall.PivotRoot(rootfs, pivotDir); err != nil {
return fmt.Errorf("pivot_root %s", err)
}
}
if err := syscall.Chdir("/"); err != nil {
return fmt.Errorf("chdir / %s", err)
@@ -580,7 +607,7 @@ func pivotRoot(rootfs, pivotBaseDir string) error {
if err := syscall.Unmount(pivotDir, syscall.MNT_DETACH); err != nil {
return fmt.Errorf("unmount pivot_root dir %s", err)
}
return os.Remove(pivotDir)
return nil
}
func msMoveRoot(rootfs string) error {
@@ -669,14 +696,18 @@ func remount(m *configs.Mount, rootfs string) error {
// of propagation flags.
func mountPropagate(m *configs.Mount, rootfs string, mountLabel string) error {
var (
dest = m.Destination
data = label.FormatMountLabel(m.Data, mountLabel)
dest = m.Destination
data = label.FormatMountLabel(m.Data, mountLabel)
flags = m.Flags
)
if libcontainerUtils.CleanPath(dest) == "/dev" {
flags &= ^syscall.MS_RDONLY
}
if !strings.HasPrefix(dest, rootfs) {
dest = filepath.Join(rootfs, dest)
}
if err := syscall.Mount(m.Source, dest, m.Device, uintptr(m.Flags), data); err != nil {
if err := syscall.Mount(m.Source, dest, m.Device, uintptr(flags), data); err != nil {
return err
}

View File

@@ -36,6 +36,11 @@ var archs = map[string]string{
"SCMP_ARCH_MIPSEL": "mipsel",
"SCMP_ARCH_MIPSEL64": "mipsel64",
"SCMP_ARCH_MIPSEL64N32": "mipsel64n32",
"SCMP_ARCH_PPC": "ppc",
"SCMP_ARCH_PPC64": "ppc64",
"SCMP_ARCH_PPC64LE": "ppc64le",
"SCMP_ARCH_S390": "s390",
"SCMP_ARCH_S390X": "s390x",
}
// ConvertStringToOperator converts a string into a Seccomp comparison operator.

View File

@@ -5,7 +5,6 @@ package seccomp
import (
"bufio"
"fmt"
"log"
"os"
"strings"
"syscall"
@@ -167,7 +166,6 @@ func matchCall(filter *libseccomp.ScmpFilter, call *configs.Syscall) error {
// Ignore it, don't error out
callNum, err := libseccomp.GetSyscallFromName(call.Name)
if err != nil {
log.Printf("Error resolving syscall name %s: %s - ignoring syscall.", call.Name, err)
return nil
}

View File

@@ -10,7 +10,7 @@ import (
var ErrSeccompNotEnabled = errors.New("seccomp: config provided but seccomp not supported")
// Seccomp not supported, do nothing
// InitSeccomp does nothing because seccomp is not supported.
func InitSeccomp(config *configs.Seccomp) error {
if config != nil {
return ErrSeccompNotEnabled

View File

@@ -13,9 +13,9 @@ import (
"regexp"
"strconv"
"strings"
"sync"
"syscall"
"github.com/docker/docker/pkg/mount"
"github.com/opencontainers/runc/libcontainer/system"
)
@@ -35,6 +35,7 @@ const (
var (
assignRegex = regexp.MustCompile(`^([^=]+)=(.*)$`)
mcsList = make(map[string]bool)
mcsLock sync.Mutex
selinuxfs = "unknown"
selinuxEnabled = false // Stores whether selinux is currently enabled
selinuxEnabledChecked = false // Stores whether selinux enablement has been checked or established yet
@@ -58,16 +59,31 @@ func getSelinuxMountPoint() string {
}
selinuxfs = ""
mounts, err := mount.GetMounts()
f, err := os.Open("/proc/self/mountinfo")
if err != nil {
return selinuxfs
}
for _, mount := range mounts {
if mount.Fstype == "selinuxfs" {
selinuxfs = mount.Mountpoint
break
defer f.Close()
scanner := bufio.NewScanner(f)
for scanner.Scan() {
txt := scanner.Text()
// Safe as mountinfo encodes mountpoints with spaces as \040.
sepIdx := strings.Index(txt, " - ")
if sepIdx == -1 {
continue
}
if !strings.Contains(txt[sepIdx:], "selinuxfs") {
continue
}
fields := strings.Split(txt, " ")
if len(fields) < 5 {
continue
}
selinuxfs = fields[4]
break
}
if selinuxfs != "" {
var buf syscall.Statfs_t
syscall.Statfs(selinuxfs, &buf)
@@ -158,12 +174,14 @@ func Setfilecon(path string, scon string) error {
// Getfilecon returns the SELinux label for this path or returns an error.
func Getfilecon(path string) (string, error) {
con, err := system.Lgetxattr(path, xattrNameSelinux)
if err != nil {
return "", err
}
// Trim the NUL byte at the end of the byte buffer, if present.
if con[len(con)-1] == '\x00' {
if len(con) > 0 && con[len(con)-1] == '\x00' {
con = con[:len(con)-1]
}
return string(con), err
return string(con), nil
}
func Setfscreatecon(scon string) error {
@@ -265,6 +283,8 @@ func SelinuxGetEnforceMode() int {
}
func mcsAdd(mcs string) error {
mcsLock.Lock()
defer mcsLock.Unlock()
if mcsList[mcs] {
return fmt.Errorf("MCS Label already exists")
}
@@ -273,7 +293,9 @@ func mcsAdd(mcs string) error {
}
func mcsDelete(mcs string) {
mcsLock.Lock()
mcsList[mcs] = false
mcsLock.Unlock()
}
func IntToMcs(id int, catRange uint32) string {
@@ -289,7 +311,7 @@ func IntToMcs(id int, catRange uint32) string {
for ORD > TIER {
ORD = ORD - TIER
TIER -= 1
TIER--
}
TIER = SETSIZE - TIER
ORD = ORD + TIER
@@ -430,7 +452,7 @@ func badPrefix(fpath string) error {
return nil
}
// Change the fpath file object to the SELinux label scon.
// Chcon changes the fpath file object to the SELinux label scon.
// If the fpath is a directory and recurse is true Chcon will walk the
// directory tree setting the label
func Chcon(fpath string, scon string, recurse bool) error {
@@ -464,14 +486,14 @@ func DupSecOpt(src string) []string {
con["level"] == "" {
return nil
}
return []string{"label:user:" + con["user"],
"label:role:" + con["role"],
"label:type:" + con["type"],
"label:level:" + con["level"]}
return []string{"label=user:" + con["user"],
"label=role:" + con["role"],
"label=type:" + con["type"],
"label=level:" + con["level"]}
}
// DisableSecOpt returns a security opt that can be used to disabling SELinux
// labeling support for future container processes
func DisableSecOpt() []string {
return []string{"label:disable"}
return []string{"label=disable"}
}

View File

@@ -3,9 +3,11 @@
package libcontainer
import (
"fmt"
"os"
"github.com/opencontainers/runc/libcontainer/apparmor"
"github.com/opencontainers/runc/libcontainer/keys"
"github.com/opencontainers/runc/libcontainer/label"
"github.com/opencontainers/runc/libcontainer/seccomp"
"github.com/opencontainers/runc/libcontainer/system"
@@ -17,12 +19,21 @@ type linuxSetnsInit struct {
config *initConfig
}
func (l *linuxSetnsInit) getSessionRingName() string {
return fmt.Sprintf("_ses.%s", l.config.ContainerId)
}
func (l *linuxSetnsInit) Init() error {
if err := setupRlimits(l.config.Config); err != nil {
return err
if !l.config.Config.NoNewKeyring {
// do not inherit the parent's session keyring
if _, err := keyctl.JoinSessionKeyring(l.getSessionRingName()); err != nil {
return err
}
}
if err := setOomScoreAdj(l.config.Config.OomScoreAdj); err != nil {
return err
if l.config.NoNewPrivileges {
if err := system.Prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil {
return err
}
}
if l.config.Config.Seccomp != nil {
if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
@@ -32,13 +43,11 @@ func (l *linuxSetnsInit) Init() error {
if err := finalizeNamespace(l.config); err != nil {
return err
}
if err := apparmor.ApplyProfile(l.config.Config.AppArmorProfile); err != nil {
if err := apparmor.ApplyProfile(l.config.AppArmorProfile); err != nil {
return err
}
if l.config.Config.ProcessLabel != "" {
if err := label.SetProcessLabel(l.config.Config.ProcessLabel); err != nil {
return err
}
if err := label.SetProcessLabel(l.config.ProcessLabel); err != nil {
return err
}
return system.Execv(l.config.Args[0], l.config.Args[0:], os.Environ())
}

View File

@@ -2,14 +2,14 @@ package stacktrace
import "runtime"
// Caputure captures a stacktrace for the current calling go program
// Capture captures a stacktrace for the current calling go program
//
// skip is the number of frames to skip
func Capture(userSkip int) Stacktrace {
var (
skip = userSkip + 1 // add one for our own function
frames []Frame
prevPc uintptr = 0
prevPc uintptr
)
for i := skip; ; i++ {
pc, file, line, ok := runtime.Caller(i)

View File

@@ -3,28 +3,62 @@
package libcontainer
import (
"fmt"
"io"
"os"
"os/exec"
"syscall"
"github.com/opencontainers/runc/libcontainer/apparmor"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/keys"
"github.com/opencontainers/runc/libcontainer/label"
"github.com/opencontainers/runc/libcontainer/seccomp"
"github.com/opencontainers/runc/libcontainer/system"
)
type linuxStandardInit struct {
pipe io.ReadWriter
parentPid int
config *initConfig
pipe io.ReadWriteCloser
parentPid int
stateDirFD int
config *initConfig
}
func (l *linuxStandardInit) Init() error {
// join any namespaces via a path to the namespace fd if provided
if err := joinExistingNamespaces(l.config.Config.Namespaces); err != nil {
return err
func (l *linuxStandardInit) getSessionRingParams() (string, uint32, uint32) {
var newperms uint32
if l.config.Config.Namespaces.Contains(configs.NEWUSER) {
// with user ns we need 'other' search permissions
newperms = 0x8
} else {
// without user ns we need 'UID' search permissions
newperms = 0x80000
}
// create a unique per session container name that we can
// join in setns; however, other containers can also join it
return fmt.Sprintf("_ses.%s", l.config.ContainerId), 0xffffffff, newperms
}
// PR_SET_NO_NEW_PRIVS isn't exposed in Golang so we define it ourselves copying the value
// the kernel
const PR_SET_NO_NEW_PRIVS = 0x26
func (l *linuxStandardInit) Init() error {
if !l.config.Config.NoNewKeyring {
ringname, keepperms, newperms := l.getSessionRingParams()
// do not inherit the parent's session keyring
sessKeyId, err := keyctl.JoinSessionKeyring(ringname)
if err != nil {
return err
}
// make session keyring searcheable
if err := keyctl.ModKeyringPerm(sessKeyId, keepperms, newperms); err != nil {
return err
}
}
var console *linuxConsole
if l.config.Console != "" {
console = newConsoleFromPath(l.config.Console)
@@ -32,9 +66,6 @@ func (l *linuxStandardInit) Init() error {
return err
}
}
if _, err := syscall.Setsid(); err != nil {
return err
}
if console != nil {
if err := system.Setctty(); err != nil {
return err
@@ -46,16 +77,11 @@ func (l *linuxStandardInit) Init() error {
if err := setupRoute(l.config.Config); err != nil {
return err
}
if err := setupRlimits(l.config.Config); err != nil {
return err
}
if err := setOomScoreAdj(l.config.Config.OomScoreAdj); err != nil {
return err
}
label.Init()
// InitializeMountNamespace() can be executed only for a new mount namespace
if l.config.Config.Namespaces.Contains(configs.NEWNS) {
if err := setupRootfs(l.config.Config, console); err != nil {
if err := setupRootfs(l.config.Config, console, l.pipe); err != nil {
return err
}
}
@@ -64,10 +90,10 @@ func (l *linuxStandardInit) Init() error {
return err
}
}
if err := apparmor.ApplyProfile(l.config.Config.AppArmorProfile); err != nil {
if err := apparmor.ApplyProfile(l.config.AppArmorProfile); err != nil {
return err
}
if err := label.SetProcessLabel(l.config.Config.ProcessLabel); err != nil {
if err := label.SetProcessLabel(l.config.ProcessLabel); err != nil {
return err
}
@@ -90,13 +116,21 @@ func (l *linuxStandardInit) Init() error {
if err != nil {
return err
}
if l.config.NoNewPrivileges {
if err := system.Prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil {
return err
}
}
// Tell our parent that we're ready to Execv. This must be done before the
// Seccomp rules have been applied, because we need to be able to read and
// write to a socket.
if err := syncParentReady(l.pipe); err != nil {
return err
}
if l.config.Config.Seccomp != nil {
// Without NoNewPrivileges seccomp is a privileged operation, so we need to
// do this before dropping capabilities; otherwise do it as late as possible
// just before execve so as few syscalls take place after it as possible.
if l.config.Config.Seccomp != nil && !l.config.NoNewPrivileges {
if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
return err
}
@@ -115,5 +149,30 @@ func (l *linuxStandardInit) Init() error {
if syscall.Getppid() != l.parentPid {
return syscall.Kill(syscall.Getpid(), syscall.SIGKILL)
}
return system.Execv(l.config.Args[0], l.config.Args[0:], os.Environ())
// check for the arg before waiting to make sure it exists and it is returned
// as a create time error.
name, err := exec.LookPath(l.config.Args[0])
if err != nil {
return err
}
// close the pipe to signal that we have completed our init.
l.pipe.Close()
// wait for the fifo to be opened on the other side before
// exec'ing the users process.
fd, err := syscall.Openat(l.stateDirFD, execFifoFilename, os.O_WRONLY|syscall.O_CLOEXEC, 0)
if err != nil {
return newSystemErrorWithCause(err, "openat exec fifo")
}
if _, err := syscall.Write(fd, []byte("0")); err != nil {
return newSystemErrorWithCause(err, "write 0 exec fifo")
}
if l.config.Config.Seccomp != nil && l.config.NoNewPrivileges {
if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
return newSystemErrorWithCause(err, "init seccomp")
}
}
if err := syscall.Exec(name, l.config.Args[0:], os.Environ()); err != nil {
return newSystemErrorWithCause(err, "exec user process")
}
return nil
}

View File

@@ -6,9 +6,11 @@ import (
"fmt"
"os"
"path/filepath"
"syscall"
"github.com/Sirupsen/logrus"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/utils"
)
func newStateTransitionError(from, to containerState) error {
@@ -56,9 +58,10 @@ func destroy(c *linuxContainer) error {
func runPoststopHooks(c *linuxContainer) error {
if c.config.Hooks != nil {
s := configs.HookState{
Version: c.config.Version,
ID: c.id,
Root: c.config.Rootfs,
Version: c.config.Version,
ID: c.id,
Root: c.config.Rootfs,
BundlePath: utils.SearchLabels(c.config.Labels, "bundle"),
}
for _, hook := range c.config.Hooks.Poststop {
if err := hook.Run(s); err != nil {
@@ -75,7 +78,7 @@ type stoppedState struct {
}
func (b *stoppedState) status() Status {
return Destroyed
return Stopped
}
func (b *stoppedState) transition(s containerState) error {
@@ -108,11 +111,11 @@ func (r *runningState) status() Status {
func (r *runningState) transition(s containerState) error {
switch s.(type) {
case *stoppedState:
running, err := r.c.isRunning()
t, err := r.c.runType()
if err != nil {
return err
}
if running {
if t == Running {
return newGenericError(fmt.Errorf("container still running"), ContainerNotStopped)
}
r.c.state = s
@@ -127,16 +130,40 @@ func (r *runningState) transition(s containerState) error {
}
func (r *runningState) destroy() error {
running, err := r.c.isRunning()
t, err := r.c.runType()
if err != nil {
return err
}
if running {
if t == Running {
return newGenericError(fmt.Errorf("container is not destroyed"), ContainerNotStopped)
}
return destroy(r.c)
}
type createdState struct {
c *linuxContainer
}
func (i *createdState) status() Status {
return Created
}
func (i *createdState) transition(s containerState) error {
switch s.(type) {
case *runningState, *pausedState, *stoppedState:
i.c.state = s
return nil
case *createdState:
return nil
}
return newStateTransitionError(i, s)
}
func (i *createdState) destroy() error {
i.c.initProcess.signal(syscall.SIGKILL)
return destroy(i.c)
}
// pausedState represents a container that is currently pause. It cannot be destroyed in a
// paused state and must transition back to running first.
type pausedState struct {
@@ -159,11 +186,11 @@ func (p *pausedState) transition(s containerState) error {
}
func (p *pausedState) destroy() error {
isRunning, err := p.c.isRunning()
t, err := p.c.runType()
if err != nil {
return err
}
if !isRunning {
if t != Running && t != Created {
if err := p.c.cgroupManager.Freeze(configs.Thawed); err != nil {
return err
}
@@ -173,7 +200,7 @@ func (p *pausedState) destroy() error {
}
// restoredState is the same as the running state but also has accociated checkpoint
// information that maybe need destroyed when the container is stopped and destory is called.
// information that maybe need destroyed when the container is stopped and destroy is called.
type restoredState struct {
imageDir string
c *linuxContainer
@@ -202,22 +229,25 @@ func (r *restoredState) destroy() error {
return destroy(r.c)
}
// createdState is used whenever a container is restored, loaded, or setting additional
// loadedState is used whenever a container is restored, loaded, or setting additional
// processes inside and it should not be destroyed when it is exiting.
type createdState struct {
type loadedState struct {
c *linuxContainer
s Status
}
func (n *createdState) status() Status {
func (n *loadedState) status() Status {
return n.s
}
func (n *createdState) transition(s containerState) error {
func (n *loadedState) transition(s containerState) error {
n.c.state = s
return nil
}
func (n *createdState) destroy() error {
return nil
func (n *loadedState) destroy() error {
if err := n.c.refreshState(); err != nil {
return err
}
return n.c.state.destroy()
}

View File

@@ -0,0 +1,7 @@
package libcontainer
// Solaris - TODO
type Stats struct {
Interfaces []*NetworkInterface
}

View File

@@ -11,6 +11,19 @@ import (
"unsafe"
)
// If arg2 is nonzero, set the "child subreaper" attribute of the
// calling process; if arg2 is zero, unset the attribute. When a
// process is marked as a child subreaper, all of the children
// that it creates, and their descendants, will be marked as
// having a subreaper. In effect, a subreaper fulfills the role
// of init(1) for its descendant processes. Upon termination of
// a process that is orphaned (i.e., its immediate parent has
// already terminated) and marked as having a subreaper, the
// nearest still living ancestor subreaper will receive a SIGCHLD
// signal and be able to wait(2) on the process to discover its
// termination status.
const PR_SET_CHILD_SUBREAPER = 36
type ParentDeathSignal int
func (p ParentDeathSignal) Restore() error {
@@ -40,6 +53,14 @@ func Execv(cmd string, args []string, env []string) error {
return syscall.Exec(name, args, env)
}
func Prlimit(pid, resource int, limit syscall.Rlimit) error {
_, _, err := syscall.RawSyscall6(syscall.SYS_PRLIMIT64, uintptr(pid), uintptr(resource), uintptr(unsafe.Pointer(&limit)), uintptr(unsafe.Pointer(&limit)), 0, 0)
if err != 0 {
return err
}
return nil
}
func SetParentDeathSignal(sig uintptr) error {
if _, _, err := syscall.RawSyscall(syscall.SYS_PRCTL, syscall.PR_SET_PDEATHSIG, sig, 0); err != 0 {
return err
@@ -79,17 +100,12 @@ func Setctty() error {
return nil
}
/*
* Detect whether we are currently running in a user namespace.
* Copied from github.com/lxc/lxd/shared/util.go
*/
// RunningInUserNS detects whether we are currently running in a user namespace.
// Copied from github.com/lxc/lxd/shared/util.go
func RunningInUserNS() bool {
file, err := os.Open("/proc/self/uid_map")
if err != nil {
/*
* This kernel-provided file only exists if user namespaces are
* supported
*/
// This kernel-provided file only exists if user namespaces are supported
return false
}
defer file.Close()
@@ -112,3 +128,16 @@ func RunningInUserNS() bool {
}
return true
}
// SetSubreaper sets the value i as the subreaper setting for the calling process
func SetSubreaper(i int) error {
return Prctl(PR_SET_CHILD_SUBREAPER, uintptr(i), 0, 0, 0)
}
func Prctl(option int, arg2, arg3, arg4, arg5 uintptr) (err error) {
_, _, e1 := syscall.Syscall6(syscall.SYS_PRCTL, uintptr(option), arg2, arg3, arg4, arg5, 0)
if e1 != 0 {
err = e1
}
return
}

View File

@@ -0,0 +1,9 @@
// +build !linux
package system
// RunningInUserNS is a stub for non-Linux systems
// Always returns false
func RunningInUserNS() bool {
return false
}

View File

@@ -2,13 +2,15 @@ package user
import (
"errors"
"fmt"
"syscall"
)
var (
// The current operating system does not provide the required data for user lookups.
ErrUnsupported = errors.New("user lookup: operating system does not provide passwd-formatted data")
// No matching entries found in file.
ErrNoPasswdEntries = errors.New("no matching entries in passwd file")
ErrNoGroupEntries = errors.New("no matching entries in group file")
)
func lookupUser(filter func(u User) bool) (User, error) {
@@ -27,7 +29,7 @@ func lookupUser(filter func(u User) bool) (User, error) {
// No user entries found.
if len(users) == 0 {
return User{}, fmt.Errorf("no matching entries in passwd file")
return User{}, ErrNoPasswdEntries
}
// Assume the first entry is the "correct" one.
@@ -75,7 +77,7 @@ func lookupGroup(filter func(g Group) bool) (Group, error) {
// No user entries found.
if len(groups) == 0 {
return Group{}, fmt.Errorf("no matching entries in group file")
return Group{}, ErrNoGroupEntries
}
// Assume the first entry is the "correct" one.

View File

@@ -15,7 +15,7 @@ const (
)
var (
ErrRange = fmt.Errorf("Uids and gids must be in range %d-%d", minId, maxId)
ErrRange = fmt.Errorf("uids and gids must be in range %d-%d", minId, maxId)
)
type User struct {
@@ -42,29 +42,30 @@ func parseLine(line string, v ...interface{}) {
parts := strings.Split(line, ":")
for i, p := range parts {
// Ignore cases where we don't have enough fields to populate the arguments.
// Some configuration files like to misbehave.
if len(v) <= i {
// if we have more "parts" than we have places to put them, bail for great "tolerance" of naughty configuration files
break
}
// Use the type of the argument to figure out how to parse it, scanf() style.
// This is legit.
switch e := v[i].(type) {
case *string:
// "root", "adm", "/bin/bash"
*e = p
case *int:
// "0", "4", "1000"
// ignore string to int conversion errors, for great "tolerance" of naughty configuration files
// "numbers", with conversion errors ignored because of some misbehaving configuration files.
*e, _ = strconv.Atoi(p)
case *[]string:
// "", "root", "root,adm,daemon"
// Comma-separated lists.
if p != "" {
*e = strings.Split(p, ",")
} else {
*e = []string{}
}
default:
// panic, because this is a programming/logic error, not a runtime one
panic("parseLine expects only pointers! argument " + strconv.Itoa(i) + " is not a pointer!")
// Someone goof'd when writing code using this function. Scream so they can hear us.
panic(fmt.Sprintf("parseLine only accepts {*string, *int, *[]string} as arguments! %#v is not a pointer!", e))
}
}
}
@@ -106,8 +107,8 @@ func ParsePasswdFilter(r io.Reader, filter func(User) bool) ([]User, error) {
return nil, err
}
text := strings.TrimSpace(s.Text())
if text == "" {
line := strings.TrimSpace(s.Text())
if line == "" {
continue
}
@@ -117,10 +118,7 @@ func ParsePasswdFilter(r io.Reader, filter func(User) bool) ([]User, error) {
// root:x:0:0:root:/root:/bin/bash
// adm:x:3:4:adm:/var/adm:/bin/false
p := User{}
parseLine(
text,
&p.Name, &p.Pass, &p.Uid, &p.Gid, &p.Gecos, &p.Home, &p.Shell,
)
parseLine(line, &p.Name, &p.Pass, &p.Uid, &p.Gid, &p.Gecos, &p.Home, &p.Shell)
if filter == nil || filter(p) {
out = append(out, p)
@@ -135,6 +133,7 @@ func ParseGroupFile(path string) ([]Group, error) {
if err != nil {
return nil, err
}
defer group.Close()
return ParseGroup(group)
}
@@ -178,10 +177,7 @@ func ParseGroupFilter(r io.Reader, filter func(Group) bool) ([]Group, error) {
// root:x:0:root
// adm:x:4:root,adm,daemon
p := Group{}
parseLine(
text,
&p.Name, &p.Pass, &p.Gid, &p.List,
)
parseLine(text, &p.Name, &p.Pass, &p.Gid, &p.List)
if filter == nil || filter(p) {
out = append(out, p)
@@ -192,9 +188,10 @@ func ParseGroupFilter(r io.Reader, filter func(Group) bool) ([]Group, error) {
}
type ExecUser struct {
Uid, Gid int
Sgids []int
Home string
Uid int
Gid int
Sgids []int
Home string
}
// GetExecUserPath is a wrapper for GetExecUser. It reads data from each of the
@@ -235,12 +232,12 @@ func GetExecUserPath(userSpec string, defaults *ExecUser, passwdPath, groupPath
// * "uid:gid
// * "user:gid"
// * "uid:group"
//
// It should be noted that if you specify a numeric user or group id, they will
// not be evaluated as usernames (only the metadata will be filled). So attempting
// to parse a user with user.Name = "1337" will produce the user with a UID of
// 1337.
func GetExecUser(userSpec string, defaults *ExecUser, passwd, group io.Reader) (*ExecUser, error) {
var (
userArg, groupArg string
name string
)
if defaults == nil {
defaults = new(ExecUser)
}
@@ -258,87 +255,113 @@ func GetExecUser(userSpec string, defaults *ExecUser, passwd, group io.Reader) (
user.Sgids = []int{}
}
// allow for userArg to have either "user" syntax, or optionally "user:group" syntax
// Allow for userArg to have either "user" syntax, or optionally "user:group" syntax
var userArg, groupArg string
parseLine(userSpec, &userArg, &groupArg)
// Convert userArg and groupArg to be numeric, so we don't have to execute
// Atoi *twice* for each iteration over lines.
uidArg, uidErr := strconv.Atoi(userArg)
gidArg, gidErr := strconv.Atoi(groupArg)
// Find the matching user.
users, err := ParsePasswdFilter(passwd, func(u User) bool {
if userArg == "" {
// Default to current state of the user.
return u.Uid == user.Uid
}
return u.Name == userArg || strconv.Itoa(u.Uid) == userArg
if uidErr == nil {
// If the userArg is numeric, always treat it as a UID.
return uidArg == u.Uid
}
return u.Name == userArg
})
// If we can't find the user, we have to bail.
if err != nil && passwd != nil {
if userArg == "" {
userArg = strconv.Itoa(user.Uid)
}
return nil, fmt.Errorf("Unable to find user %v: %v", userArg, err)
return nil, fmt.Errorf("unable to find user %s: %v", userArg, err)
}
haveUser := users != nil && len(users) > 0
if haveUser {
// if we found any user entries that matched our filter, let's take the first one as "correct"
name = users[0].Name
var matchedUserName string
if len(users) > 0 {
// First match wins, even if there's more than one matching entry.
matchedUserName = users[0].Name
user.Uid = users[0].Uid
user.Gid = users[0].Gid
user.Home = users[0].Home
} else if userArg != "" {
// we asked for a user but didn't find them... let's check to see if we wanted a numeric user
user.Uid, err = strconv.Atoi(userArg)
if err != nil {
// not numeric - we have to bail
return nil, fmt.Errorf("Unable to find user %v", userArg)
// If we can't find a user with the given username, the only other valid
// option is if it's a numeric username with no associated entry in passwd.
if uidErr != nil {
// Not numeric.
return nil, fmt.Errorf("unable to find user %s: %v", userArg, ErrNoPasswdEntries)
}
user.Uid = uidArg
// Must be inside valid uid range.
if user.Uid < minId || user.Uid > maxId {
return nil, ErrRange
}
// if userArg couldn't be found in /etc/passwd but is numeric, just roll with it - this is legit
// Okay, so it's numeric. We can just roll with this.
}
if groupArg != "" || name != "" {
// On to the groups. If we matched a username, we need to do this because of
// the supplementary group IDs.
if groupArg != "" || matchedUserName != "" {
groups, err := ParseGroupFilter(group, func(g Group) bool {
// Explicit group format takes precedence.
if groupArg != "" {
return g.Name == groupArg || strconv.Itoa(g.Gid) == groupArg
}
// Check if user is a member.
for _, u := range g.List {
if u == name {
return true
// If the group argument isn't explicit, we'll just search for it.
if groupArg == "" {
// Check if user is a member of this group.
for _, u := range g.List {
if u == matchedUserName {
return true
}
}
return false
}
return false
if gidErr == nil {
// If the groupArg is numeric, always treat it as a GID.
return gidArg == g.Gid
}
return g.Name == groupArg
})
if err != nil && group != nil {
return nil, fmt.Errorf("Unable to find groups for user %v: %v", users[0].Name, err)
return nil, fmt.Errorf("unable to find groups for spec %v: %v", matchedUserName, err)
}
haveGroup := groups != nil && len(groups) > 0
// Only start modifying user.Gid if it is in explicit form.
if groupArg != "" {
if haveGroup {
// if we found any group entries that matched our filter, let's take the first one as "correct"
if len(groups) > 0 {
// First match wins, even if there's more than one matching entry.
user.Gid = groups[0].Gid
} else {
// we asked for a group but didn't find id... let's check to see if we wanted a numeric group
user.Gid, err = strconv.Atoi(groupArg)
if err != nil {
// not numeric - we have to bail
return nil, fmt.Errorf("Unable to find group %v", groupArg)
}
} else if groupArg != "" {
// If we can't find a group with the given name, the only other valid
// option is if it's a numeric group name with no associated entry in group.
// Ensure gid is inside gid range.
if gidErr != nil {
// Not numeric.
return nil, fmt.Errorf("unable to find group %s: %v", groupArg, ErrNoGroupEntries)
}
user.Gid = gidArg
// Must be inside valid gid range.
if user.Gid < minId || user.Gid > maxId {
return nil, ErrRange
}
// if groupArg couldn't be found in /etc/group but is numeric, just roll with it - this is legit
// Okay, so it's numeric. We can just roll with this.
}
} else if haveGroup {
// If implicit group format, fill supplementary gids.
} else if len(groups) > 0 {
// Supplementary group ids only make sense if in the implicit form.
user.Sgids = make([]int, len(groups))
for i, group := range groups {
user.Sgids[i] = group.Gid

View File

@@ -5,7 +5,9 @@ import (
"encoding/hex"
"encoding/json"
"io"
"os"
"path/filepath"
"strings"
"syscall"
)
@@ -54,3 +56,66 @@ func WriteJSON(w io.Writer, v interface{}) error {
_, err = w.Write(data)
return err
}
// CleanPath makes a path safe for use with filepath.Join. This is done by not
// only cleaning the path, but also (if the path is relative) adding a leading
// '/' and cleaning it (then removing the leading '/'). This ensures that a
// path resulting from prepending another path will always resolve to lexically
// be a subdirectory of the prefixed path. This is all done lexically, so paths
// that include symlinks won't be safe as a result of using CleanPath.
func CleanPath(path string) string {
// Deal with empty strings nicely.
if path == "" {
return ""
}
// Ensure that all paths are cleaned (especially problematic ones like
// "/../../../../../" which can cause lots of issues).
path = filepath.Clean(path)
// If the path isn't absolute, we need to do more processing to fix paths
// such as "../../../../<etc>/some/path". We also shouldn't convert absolute
// paths to relative ones.
if !filepath.IsAbs(path) {
path = filepath.Clean(string(os.PathSeparator) + path)
// This can't fail, as (by definition) all paths are relative to root.
path, _ = filepath.Rel(string(os.PathSeparator), path)
}
// Clean the path again for good measure.
return filepath.Clean(path)
}
// SearchLabels searches a list of key-value pairs for the provided key and
// returns the corresponding value. The pairs must be separated with '='.
func SearchLabels(labels []string, query string) string {
for _, l := range labels {
parts := strings.SplitN(l, "=", 2)
if len(parts) < 2 {
continue
}
if parts[0] == query {
return parts[1]
}
}
return ""
}
// Annotations returns the bundle path and user defined annotations from the
// libcontianer state. We need to remove the bundle because that is a label
// added by libcontainer.
func Annotations(labels []string) (bundle string, userAnnotations map[string]string) {
userAnnotations = make(map[string]string)
for _, l := range labels {
parts := strings.SplitN(l, "=", 2)
if len(parts) < 2 {
continue
}
if parts[0] == "bundle" {
bundle = parts[1]
} else {
userAnnotations[parts[0]] = parts[1]
}
}
return
}