Ensure kubelet pid is not moved to system container

Jimmi Dyson 2016-01-06 23:36:48 +00:00
parent 6e6974a38f
commit 1c289943f5
62 changed files with 1765 additions and 774 deletions

Godeps/Godeps.json (generated, 76 changed lines)
View File

@ -579,93 +579,93 @@
}, },
{ {
"ImportPath": "github.com/google/cadvisor/api", "ImportPath": "github.com/google/cadvisor/api",
"Comment": "v0.20.4", "Comment": "v0.20.5",
"Rev": "59488ce2c4197f501283739c6a4dd3169999f317" "Rev": "9aa348ff5e191fcf3eccd59e5a434022aca77b87"
}, },
{ {
"ImportPath": "github.com/google/cadvisor/cache/memory", "ImportPath": "github.com/google/cadvisor/cache/memory",
"Comment": "v0.20.4", "Comment": "v0.20.5",
"Rev": "59488ce2c4197f501283739c6a4dd3169999f317" "Rev": "9aa348ff5e191fcf3eccd59e5a434022aca77b87"
}, },
{ {
"ImportPath": "github.com/google/cadvisor/collector", "ImportPath": "github.com/google/cadvisor/collector",
"Comment": "v0.20.4", "Comment": "v0.20.5",
"Rev": "59488ce2c4197f501283739c6a4dd3169999f317" "Rev": "9aa348ff5e191fcf3eccd59e5a434022aca77b87"
}, },
{ {
"ImportPath": "github.com/google/cadvisor/container", "ImportPath": "github.com/google/cadvisor/container",
"Comment": "v0.20.4", "Comment": "v0.20.5",
"Rev": "59488ce2c4197f501283739c6a4dd3169999f317" "Rev": "9aa348ff5e191fcf3eccd59e5a434022aca77b87"
}, },
{ {
"ImportPath": "github.com/google/cadvisor/events", "ImportPath": "github.com/google/cadvisor/events",
"Comment": "v0.20.4", "Comment": "v0.20.5",
"Rev": "59488ce2c4197f501283739c6a4dd3169999f317" "Rev": "9aa348ff5e191fcf3eccd59e5a434022aca77b87"
}, },
{ {
"ImportPath": "github.com/google/cadvisor/fs", "ImportPath": "github.com/google/cadvisor/fs",
"Comment": "v0.20.4", "Comment": "v0.20.5",
"Rev": "59488ce2c4197f501283739c6a4dd3169999f317" "Rev": "9aa348ff5e191fcf3eccd59e5a434022aca77b87"
}, },
{ {
"ImportPath": "github.com/google/cadvisor/healthz", "ImportPath": "github.com/google/cadvisor/healthz",
"Comment": "v0.20.4", "Comment": "v0.20.5",
"Rev": "59488ce2c4197f501283739c6a4dd3169999f317" "Rev": "9aa348ff5e191fcf3eccd59e5a434022aca77b87"
}, },
{ {
"ImportPath": "github.com/google/cadvisor/http", "ImportPath": "github.com/google/cadvisor/http",
"Comment": "v0.20.4", "Comment": "v0.20.5",
"Rev": "59488ce2c4197f501283739c6a4dd3169999f317" "Rev": "9aa348ff5e191fcf3eccd59e5a434022aca77b87"
}, },
{ {
"ImportPath": "github.com/google/cadvisor/info/v1", "ImportPath": "github.com/google/cadvisor/info/v1",
"Comment": "v0.20.4", "Comment": "v0.20.5",
"Rev": "59488ce2c4197f501283739c6a4dd3169999f317" "Rev": "9aa348ff5e191fcf3eccd59e5a434022aca77b87"
}, },
{ {
"ImportPath": "github.com/google/cadvisor/info/v2", "ImportPath": "github.com/google/cadvisor/info/v2",
"Comment": "v0.20.4", "Comment": "v0.20.5",
"Rev": "59488ce2c4197f501283739c6a4dd3169999f317" "Rev": "9aa348ff5e191fcf3eccd59e5a434022aca77b87"
}, },
{ {
"ImportPath": "github.com/google/cadvisor/manager", "ImportPath": "github.com/google/cadvisor/manager",
"Comment": "v0.20.4", "Comment": "v0.20.5",
"Rev": "59488ce2c4197f501283739c6a4dd3169999f317" "Rev": "9aa348ff5e191fcf3eccd59e5a434022aca77b87"
}, },
{ {
"ImportPath": "github.com/google/cadvisor/metrics", "ImportPath": "github.com/google/cadvisor/metrics",
"Comment": "v0.20.4", "Comment": "v0.20.5",
"Rev": "59488ce2c4197f501283739c6a4dd3169999f317" "Rev": "9aa348ff5e191fcf3eccd59e5a434022aca77b87"
}, },
{ {
"ImportPath": "github.com/google/cadvisor/pages", "ImportPath": "github.com/google/cadvisor/pages",
"Comment": "v0.20.4", "Comment": "v0.20.5",
"Rev": "59488ce2c4197f501283739c6a4dd3169999f317" "Rev": "9aa348ff5e191fcf3eccd59e5a434022aca77b87"
}, },
{ {
"ImportPath": "github.com/google/cadvisor/storage", "ImportPath": "github.com/google/cadvisor/storage",
"Comment": "v0.20.4", "Comment": "v0.20.5",
"Rev": "59488ce2c4197f501283739c6a4dd3169999f317" "Rev": "9aa348ff5e191fcf3eccd59e5a434022aca77b87"
}, },
{ {
"ImportPath": "github.com/google/cadvisor/summary", "ImportPath": "github.com/google/cadvisor/summary",
"Comment": "v0.20.4", "Comment": "v0.20.5",
"Rev": "59488ce2c4197f501283739c6a4dd3169999f317" "Rev": "9aa348ff5e191fcf3eccd59e5a434022aca77b87"
}, },
{ {
"ImportPath": "github.com/google/cadvisor/utils", "ImportPath": "github.com/google/cadvisor/utils",
"Comment": "v0.20.4", "Comment": "v0.20.5",
"Rev": "59488ce2c4197f501283739c6a4dd3169999f317" "Rev": "9aa348ff5e191fcf3eccd59e5a434022aca77b87"
}, },
{ {
"ImportPath": "github.com/google/cadvisor/validate", "ImportPath": "github.com/google/cadvisor/validate",
"Comment": "v0.20.4", "Comment": "v0.20.5",
"Rev": "59488ce2c4197f501283739c6a4dd3169999f317" "Rev": "9aa348ff5e191fcf3eccd59e5a434022aca77b87"
}, },
{ {
"ImportPath": "github.com/google/cadvisor/version", "ImportPath": "github.com/google/cadvisor/version",
"Comment": "v0.20.4", "Comment": "v0.20.5",
"Rev": "59488ce2c4197f501283739c6a4dd3169999f317" "Rev": "9aa348ff5e191fcf3eccd59e5a434022aca77b87"
}, },
{ {
"ImportPath": "github.com/google/gofuzz", "ImportPath": "github.com/google/gofuzz",
@ -793,8 +793,8 @@
}, },
{ {
"ImportPath": "github.com/opencontainers/runc/libcontainer", "ImportPath": "github.com/opencontainers/runc/libcontainer",
"Comment": "v0.0.5", "Comment": "v0.0.7",
"Rev": "97bc9a7faf3dd660d9be90a2880b2e37f3cdbf38" "Rev": "7ca2aa4873aea7cb4265b1726acb24b90d8726c6"
}, },
{ {
"ImportPath": "github.com/pborman/uuid", "ImportPath": "github.com/pborman/uuid",

View File

@ -139,6 +139,7 @@ func newDockerContainerHandler(
rootFs: rootFs, rootFs: rootFs,
rootfsStorageDir: rootfsStorageDir, rootfsStorageDir: rootfsStorageDir,
fsHandler: newFsHandler(time.Minute, rootfsStorageDir, otherStorageDir, fsInfo), fsHandler: newFsHandler(time.Minute, rootfsStorageDir, otherStorageDir, fsInfo),
envs: make(map[string]string),
} }
// We assume that if Inspect fails then the container is not known to docker. // We assume that if Inspect fails then the container is not known to docker.
@ -206,36 +207,31 @@ func libcontainerConfigToContainerSpec(config *libcontainerconfigs.Config, mi *i
spec.HasMemory = true spec.HasMemory = true
spec.Memory.Limit = math.MaxUint64 spec.Memory.Limit = math.MaxUint64
spec.Memory.SwapLimit = math.MaxUint64 spec.Memory.SwapLimit = math.MaxUint64
if config.Cgroups.Memory > 0 {
spec.Memory.Limit = uint64(config.Cgroups.Memory)
}
if config.Cgroups.MemorySwap > 0 {
spec.Memory.SwapLimit = uint64(config.Cgroups.MemorySwap)
}
// Get CPU info if config.Cgroups.Resources != nil {
spec.HasCpu = true if config.Cgroups.Resources.Memory > 0 {
spec.Cpu.Limit = 1024 spec.Memory.Limit = uint64(config.Cgroups.Resources.Memory)
if config.Cgroups.CpuShares != 0 { }
spec.Cpu.Limit = uint64(config.Cgroups.CpuShares) if config.Cgroups.Resources.MemorySwap > 0 {
spec.Memory.SwapLimit = uint64(config.Cgroups.Resources.MemorySwap)
}
// Get CPU info
spec.HasCpu = true
spec.Cpu.Limit = 1024
if config.Cgroups.Resources.CpuShares != 0 {
spec.Cpu.Limit = uint64(config.Cgroups.Resources.CpuShares)
}
spec.Cpu.Mask = utils.FixCpuMask(config.Cgroups.Resources.CpusetCpus, mi.NumCores)
} }
spec.Cpu.Mask = utils.FixCpuMask(config.Cgroups.CpusetCpus, mi.NumCores)
spec.HasDiskIo = true spec.HasDiskIo = true
return spec return spec
} }
var (
hasNetworkModes = map[string]bool{
"host": true,
"bridge": true,
"default": true,
}
)
func hasNet(networkMode string) bool { func hasNet(networkMode string) bool {
return hasNetworkModes[networkMode] return !strings.HasPrefix(networkMode, "container:")
} }
func (self *dockerContainerHandler) GetSpec() (info.ContainerSpec, error) { func (self *dockerContainerHandler) GetSpec() (info.ContainerSpec, error) {

View File

@ -292,31 +292,32 @@ func convertOldConfigToNew(config v1Config) *configs.Config {
result.Routes = config.Config.Routes result.Routes = config.Config.Routes
var newCgroup = &configs.Cgroup{ var newCgroup = &configs.Cgroup{
Name: old.Name, Name: old.Name,
Parent: old.Parent, Parent: old.Parent,
AllowAllDevices: old.AllowAllDevices, Resources: &configs.Resources{
AllowedDevices: old.AllowedDevices, AllowAllDevices: old.Resources.AllowAllDevices,
DeniedDevices: old.DeniedDevices, AllowedDevices: old.Resources.AllowedDevices,
Memory: old.Memory, DeniedDevices: old.Resources.DeniedDevices,
MemoryReservation: old.MemoryReservation, Memory: old.Resources.Memory,
MemorySwap: old.MemorySwap, MemoryReservation: old.Resources.MemoryReservation,
KernelMemory: old.KernelMemory, MemorySwap: old.Resources.MemorySwap,
CpuShares: old.CpuShares, KernelMemory: old.Resources.KernelMemory,
CpuQuota: old.CpuQuota, CpuShares: old.Resources.CpuShares,
CpuPeriod: old.CpuPeriod, CpuQuota: old.Resources.CpuQuota,
CpuRtRuntime: old.CpuRtRuntime, CpuPeriod: old.Resources.CpuPeriod,
CpuRtPeriod: old.CpuRtPeriod, CpuRtRuntime: old.Resources.CpuRtRuntime,
CpusetCpus: old.CpusetCpus, CpuRtPeriod: old.Resources.CpuRtPeriod,
CpusetMems: old.CpusetMems, CpusetCpus: old.Resources.CpusetCpus,
BlkioWeight: old.BlkioWeight, CpusetMems: old.Resources.CpusetMems,
BlkioLeafWeight: old.BlkioLeafWeight, BlkioWeight: old.Resources.BlkioWeight,
Freezer: old.Freezer, BlkioLeafWeight: old.Resources.BlkioLeafWeight,
HugetlbLimit: old.HugetlbLimit, Freezer: old.Resources.Freezer,
Slice: old.Slice, HugetlbLimit: old.Resources.HugetlbLimit,
OomKillDisable: old.OomKillDisable, OomKillDisable: old.Resources.OomKillDisable,
MemorySwappiness: old.MemorySwappiness, MemorySwappiness: old.Resources.MemorySwappiness,
NetPrioIfpriomap: old.NetPrioIfpriomap, NetPrioIfpriomap: old.Resources.NetPrioIfpriomap,
NetClsClassid: old.NetClsClassid, NetClsClassid: old.Resources.NetClsClassid,
},
} }
result.Cgroups = newCgroup result.Cgroups = newCgroup

View File

@ -54,6 +54,8 @@ type RealFsInfo struct {
// Map from label to block device path. // Map from label to block device path.
// Labels are intent-specific tags that are auto-detected. // Labels are intent-specific tags that are auto-detected.
labels map[string]string labels map[string]string
dmsetup dmsetupClient
} }
type Context struct { type Context struct {
@ -67,9 +69,11 @@ func NewFsInfo(context Context) (FsInfo, error) {
if err != nil { if err != nil {
return nil, err return nil, err
} }
partitions := make(map[string]partition, 0) fsInfo := &RealFsInfo{
fsInfo := &RealFsInfo{} partitions: make(map[string]partition, 0),
fsInfo.labels = make(map[string]string, 0) labels: make(map[string]string, 0),
dmsetup: &defaultDmsetupClient{},
}
supportedFsType := map[string]bool{ supportedFsType := map[string]bool{
// all ext systems are checked through prefix. // all ext systems are checked through prefix.
"btrfs": true, "btrfs": true,
@ -82,49 +86,87 @@ func NewFsInfo(context Context) (FsInfo, error) {
continue continue
} }
// Avoid bind mounts. // Avoid bind mounts.
if _, ok := partitions[mount.Source]; ok { if _, ok := fsInfo.partitions[mount.Source]; ok {
continue continue
} }
if mount.Fstype == "zfs" { if mount.Fstype == "zfs" {
Fstype = mount.Fstype Fstype = mount.Fstype
} }
partitions[mount.Source] = partition{ fsInfo.partitions[mount.Source] = partition{
fsType: Fstype, fsType: Fstype,
mountpoint: mount.Mountpoint, mountpoint: mount.Mountpoint,
major: uint(mount.Major), major: uint(mount.Major),
minor: uint(mount.Minor), minor: uint(mount.Minor),
} }
} }
if storageDriver, ok := context.DockerInfo["Driver"]; ok && storageDriver == "devicemapper" {
dev, major, minor, blockSize, err := dockerDMDevice(context.DockerInfo["DriverStatus"]) // need to call this before the log line below printing out the partitions, as this function may
if err != nil { // add a "partition" for devicemapper to fsInfo.partitions
glog.Warningf("Could not get Docker devicemapper device: %v", err) fsInfo.addDockerImagesLabel(context)
} else {
partitions[dev] = partition{ glog.Infof("Filesystem partitions: %+v", fsInfo.partitions)
fsType: "devicemapper", fsInfo.addSystemRootLabel()
major: major,
minor: minor,
blockSize: blockSize,
}
fsInfo.labels[LabelDockerImages] = dev
}
}
glog.Infof("Filesystem partitions: %+v", partitions)
fsInfo.partitions = partitions
fsInfo.addLabels(context)
return fsInfo, nil return fsInfo, nil
} }
func (self *RealFsInfo) addLabels(context Context) { // getDockerDeviceMapperInfo returns information about the devicemapper device and "partition" if
dockerPaths := getDockerImagePaths(context) // docker is using devicemapper for its storage driver. If a loopback device is being used, don't
// return any information or error, as we want to report based on the actual partition where the
// loopback file resides, inside of the loopback file itself.
func (self *RealFsInfo) getDockerDeviceMapperInfo(dockerInfo map[string]string) (string, *partition, error) {
if storageDriver, ok := dockerInfo["Driver"]; ok && storageDriver != "devicemapper" {
return "", nil, nil
}
var driverStatus [][]string
if err := json.Unmarshal([]byte(dockerInfo["DriverStatus"]), &driverStatus); err != nil {
return "", nil, err
}
dataLoopFile := dockerStatusValue(driverStatus, "Data loop file")
if len(dataLoopFile) > 0 {
return "", nil, nil
}
dev, major, minor, blockSize, err := dockerDMDevice(driverStatus, self.dmsetup)
if err != nil {
return "", nil, err
}
return dev, &partition{
fsType: "devicemapper",
major: major,
minor: minor,
blockSize: blockSize,
}, nil
}
// addSystemRootLabel attempts to determine which device contains the mount for /.
func (self *RealFsInfo) addSystemRootLabel() {
for src, p := range self.partitions { for src, p := range self.partitions {
if p.mountpoint == "/" { if p.mountpoint == "/" {
if _, ok := self.labels[LabelSystemRoot]; !ok { if _, ok := self.labels[LabelSystemRoot]; !ok {
self.labels[LabelSystemRoot] = src self.labels[LabelSystemRoot] = src
} }
} }
self.updateDockerImagesPath(src, p.mountpoint, dockerPaths) }
// TODO(rjnagal): Add label for docker devicemapper pool. }
// addDockerImagesLabel attempts to determine which device contains the mount for docker images.
func (self *RealFsInfo) addDockerImagesLabel(context Context) {
dockerDev, dockerPartition, err := self.getDockerDeviceMapperInfo(context.DockerInfo)
if err != nil {
glog.Warningf("Could not get Docker devicemapper device: %v", err)
}
if len(dockerDev) > 0 && dockerPartition != nil {
self.partitions[dockerDev] = *dockerPartition
self.labels[LabelDockerImages] = dockerDev
} else {
dockerPaths := getDockerImagePaths(context)
for src, p := range self.partitions {
self.updateDockerImagesPath(src, p.mountpoint, dockerPaths)
}
} }
} }
@ -345,20 +387,30 @@ func dockerStatusValue(status [][]string, target string) string {
return "" return ""
} }
// dmsetupClient knows how to interact with dmsetup to retrieve information about devicemapper.
type dmsetupClient interface {
table(poolName string) ([]byte, error)
//TODO add status(poolName string) ([]byte, error) and use it in getDMStats so we can unit test
}
// defaultDmsetupClient implements the standard behavior for interacting with dmsetup.
type defaultDmsetupClient struct{}
var _ dmsetupClient = &defaultDmsetupClient{}
func (*defaultDmsetupClient) table(poolName string) ([]byte, error) {
return exec.Command("dmsetup", "table", poolName).Output()
}
// Devicemapper thin provisioning is detailed at // Devicemapper thin provisioning is detailed at
// https://www.kernel.org/doc/Documentation/device-mapper/thin-provisioning.txt // https://www.kernel.org/doc/Documentation/device-mapper/thin-provisioning.txt
func dockerDMDevice(driverStatus string) (string, uint, uint, uint, error) { func dockerDMDevice(driverStatus [][]string, dmsetup dmsetupClient) (string, uint, uint, uint, error) {
var config [][]string poolName := dockerStatusValue(driverStatus, "Pool Name")
err := json.Unmarshal([]byte(driverStatus), &config)
if err != nil {
return "", 0, 0, 0, err
}
poolName := dockerStatusValue(config, "Pool Name")
if len(poolName) == 0 { if len(poolName) == 0 {
return "", 0, 0, 0, fmt.Errorf("Could not get dm pool name") return "", 0, 0, 0, fmt.Errorf("Could not get dm pool name")
} }
out, err := exec.Command("dmsetup", "table", poolName).Output() out, err := dmsetup.table(poolName)
if err != nil { if err != nil {
return "", 0, 0, 0, err return "", 0, 0, 0, err
} }
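
The dmsetupClient interface introduced above exists so dockerDMDevice no longer shells out to dmsetup directly, which makes the device lookup testable. A minimal sketch of such a test, assuming it sits in the same package as the code above; the fake type and test name are illustrative:

```go
package fs // assumed: same package as the fs code shown above

import "testing"

// fakeDmsetupClient satisfies the dmsetupClient interface shown above,
// returning canned output instead of invoking the dmsetup binary.
type fakeDmsetupClient struct {
	out []byte
	err error
}

func (c *fakeDmsetupClient) table(poolName string) ([]byte, error) {
	return c.out, c.err
}

func TestDockerDMDeviceMissingPoolName(t *testing.T) {
	// Per dockerDMDevice above, an empty "Pool Name" entry should fail
	// before dmsetup is ever consulted.
	_, _, _, _, err := dockerDMDevice([][]string{}, &fakeDmsetupClient{})
	if err == nil {
		t.Fatal("expected an error when the driver status has no pool name")
	}
}
```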

View File

@ -136,7 +136,7 @@ type MachineInfo struct {
CpuFrequency uint64 `json:"cpu_frequency_khz"` CpuFrequency uint64 `json:"cpu_frequency_khz"`
// The amount of memory (in bytes) in this machine // The amount of memory (in bytes) in this machine
MemoryCapacity int64 `json:"memory_capacity"` MemoryCapacity uint64 `json:"memory_capacity"`
// The machine id // The machine id
MachineID string `json:"machine_id"` MachineID string `json:"machine_id"`

View File

@ -41,7 +41,7 @@ type Attributes struct {
CpuFrequency uint64 `json:"cpu_frequency_khz"` CpuFrequency uint64 `json:"cpu_frequency_khz"`
// The amount of memory (in bytes) in this machine // The amount of memory (in bytes) in this machine
MemoryCapacity int64 `json:"memory_capacity"` MemoryCapacity uint64 `json:"memory_capacity"`
// The machine id // The machine id
MachineID string `json:"machine_id"` MachineID string `json:"machine_id"`

View File

@ -82,8 +82,8 @@ func GetClockSpeed(procInfo []byte) (uint64, error) {
} }
// GetMachineMemoryCapacity returns the machine's total memory from /proc/meminfo. // GetMachineMemoryCapacity returns the machine's total memory from /proc/meminfo.
// Returns the total memory capacity as an int64 (number of bytes). // Returns the total memory capacity as an uint64 (number of bytes).
func GetMachineMemoryCapacity() (int64, error) { func GetMachineMemoryCapacity() (uint64, error) {
out, err := ioutil.ReadFile("/proc/meminfo") out, err := ioutil.ReadFile("/proc/meminfo")
if err != nil { if err != nil {
return 0, err return 0, err
@ -97,8 +97,8 @@ func GetMachineMemoryCapacity() (int64, error) {
} }
// GetMachineSwapCapacity returns the machine's total swap from /proc/meminfo. // GetMachineSwapCapacity returns the machine's total swap from /proc/meminfo.
// Returns the total swap capacity as an int64 (number of bytes). // Returns the total swap capacity as an uint64 (number of bytes).
func GetMachineSwapCapacity() (int64, error) { func GetMachineSwapCapacity() (uint64, error) {
out, err := ioutil.ReadFile("/proc/meminfo") out, err := ioutil.ReadFile("/proc/meminfo")
if err != nil { if err != nil {
return 0, err return 0, err
@ -113,14 +113,14 @@ func GetMachineSwapCapacity() (int64, error) {
// parseCapacity matches a Regexp in a []byte, returning the resulting value in bytes. // parseCapacity matches a Regexp in a []byte, returning the resulting value in bytes.
// Assumes that the value matched by the Regexp is in KB. // Assumes that the value matched by the Regexp is in KB.
func parseCapacity(b []byte, r *regexp.Regexp) (int64, error) { func parseCapacity(b []byte, r *regexp.Regexp) (uint64, error) {
matches := r.FindSubmatch(b) matches := r.FindSubmatch(b)
if len(matches) != 2 { if len(matches) != 2 {
return -1, fmt.Errorf("failed to match regexp in output: %q", string(b)) return 0, fmt.Errorf("failed to match regexp in output: %q", string(b))
} }
m, err := strconv.ParseInt(string(matches[1]), 10, 64) m, err := strconv.ParseUint(string(matches[1]), 10, 64)
if err != nil { if err != nil {
return -1, err return 0, err
} }
// Convert to bytes. // Convert to bytes.
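
The capacity helpers now return uint64 throughout. Since the hunk cuts off before the final conversion, here is a self-contained sketch of the parseCapacity contract described above: match a kB value in /proc/meminfo-style text and return bytes. The regexp and sample line are illustrative, and the trailing multiplication by 1024 is an assumption based on the "Convert to bytes" comment:

```go
package main

import (
	"fmt"
	"regexp"
	"strconv"
)

// Sketch of parseCapacity: find the kB value captured by the regexp and
// return it as bytes in a uint64.
func parseCapacity(b []byte, r *regexp.Regexp) (uint64, error) {
	matches := r.FindSubmatch(b)
	if len(matches) != 2 {
		return 0, fmt.Errorf("failed to match regexp in output: %q", string(b))
	}
	m, err := strconv.ParseUint(string(matches[1]), 10, 64)
	if err != nil {
		return 0, err
	}
	return m * 1024, nil // /proc/meminfo reports values in kB
}

func main() {
	memTotal := regexp.MustCompile(`MemTotal:\s*([0-9]+) kB`)
	capacity, err := parseCapacity([]byte("MemTotal:       16342180 kB"), memTotal)
	fmt.Println(capacity, err) // 16734392320 <nil>
}
```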

View File

@ -1 +1 @@
0.20.4 0.20.5

View File

@ -10,80 +10,165 @@ host system and which is (optionally) isolated from other containers in the syst
#### Using libcontainer #### Using libcontainer
To create a container you first have to initialize an instance of a factory
that will handle the creation and initialization for a container.

Because containers are spawned in a two step process you will need to provide
arguments to a binary that will be executed as the init process for the container.
To use the current binary that is spawning the containers and acting as the parent
you can use `os.Args[0]` and we have a command called `init` setup.

Because containers are spawned in a two step process you will need a binary that
will be executed as the init process for the container. In libcontainer, we use
the current binary (/proc/self/exe) to be executed as the init process, and use
arg "init", we call the first step process "bootstrap", so you always need a "init"
function as the entry of "bootstrap".
```go ```go
root, err := libcontainer.New("/var/lib/container", libcontainer.InitArgs(os.Args[0], "init")) func init() {
if len(os.Args) > 1 && os.Args[1] == "init" {
runtime.GOMAXPROCS(1)
runtime.LockOSThread()
factory, _ := libcontainer.New("")
if err := factory.StartInitialization(); err != nil {
logrus.Fatal(err)
}
panic("--this line should have never been executed, congratulations--")
}
}
```
Then to create a container you first have to initialize an instance of a factory
that will handle the creation and initialization for a container.
```go
factory, err := libcontainer.New("/var/lib/container", libcontainer.Cgroupfs, libcontainer.InitArgs(os.Args[0], "init"))
if err != nil { if err != nil {
log.Fatal(err) logrus.Fatal(err)
return
} }
``` ```
Once you have an instance of the factory created we can create a configuration Once you have an instance of the factory created we can create a configuration
struct describing how the container is to be created. A sample would look similar to this: struct describing how the container is to be created. A sample would look similar to this:
```go ```go
defaultMountFlags := syscall.MS_NOEXEC | syscall.MS_NOSUID | syscall.MS_NODEV
config := &configs.Config{ config := &configs.Config{
Rootfs: rootfs, Rootfs: "/your/path/to/rootfs",
Capabilities: []string{ Capabilities: []string{
"CAP_CHOWN", "CAP_CHOWN",
"CAP_DAC_OVERRIDE", "CAP_DAC_OVERRIDE",
"CAP_FSETID", "CAP_FSETID",
"CAP_FOWNER", "CAP_FOWNER",
"CAP_MKNOD", "CAP_MKNOD",
"CAP_NET_RAW", "CAP_NET_RAW",
"CAP_SETGID", "CAP_SETGID",
"CAP_SETUID", "CAP_SETUID",
"CAP_SETFCAP", "CAP_SETFCAP",
"CAP_SETPCAP", "CAP_SETPCAP",
"CAP_NET_BIND_SERVICE", "CAP_NET_BIND_SERVICE",
"CAP_SYS_CHROOT", "CAP_SYS_CHROOT",
"CAP_KILL", "CAP_KILL",
"CAP_AUDIT_WRITE", "CAP_AUDIT_WRITE",
}, },
Namespaces: configs.Namespaces([]configs.Namespace{ Namespaces: configs.Namespaces([]configs.Namespace{
{Type: configs.NEWNS}, {Type: configs.NEWNS},
{Type: configs.NEWUTS}, {Type: configs.NEWUTS},
{Type: configs.NEWIPC}, {Type: configs.NEWIPC},
{Type: configs.NEWPID}, {Type: configs.NEWPID},
{Type: configs.NEWNET}, {Type: configs.NEWUSER},
}), {Type: configs.NEWNET},
Cgroups: &configs.Cgroup{ }),
Name: "test-container", Cgroups: &configs.Cgroup{
Parent: "system", Name: "test-container",
AllowAllDevices: false, Parent: "system",
AllowedDevices: configs.DefaultAllowedDevices, Resources: &configs.Resources{
}, MemorySwappiness: -1,
AllowAllDevices: false,
Devices: configs.DefaultAutoCreatedDevices, AllowedDevices: configs.DefaultAllowedDevices,
Hostname: "testing", },
Networks: []*configs.Network{ },
{ MaskPaths: []string{
Type: "loopback", "/proc/kcore",
Address: "127.0.0.1/0", },
Gateway: "localhost", ReadonlyPaths: []string{
}, "/proc/sys", "/proc/sysrq-trigger", "/proc/irq", "/proc/bus",
}, },
Rlimits: []configs.Rlimit{ Devices: configs.DefaultAutoCreatedDevices,
{ Hostname: "testing",
Type: syscall.RLIMIT_NOFILE, Mounts: []*configs.Mount{
Hard: uint64(1024), {
Soft: uint64(1024), Source: "proc",
}, Destination: "/proc",
}, Device: "proc",
Flags: defaultMountFlags,
},
{
Source: "tmpfs",
Destination: "/dev",
Device: "tmpfs",
Flags: syscall.MS_NOSUID | syscall.MS_STRICTATIME,
Data: "mode=755",
},
{
Source: "devpts",
Destination: "/dev/pts",
Device: "devpts",
Flags: syscall.MS_NOSUID | syscall.MS_NOEXEC,
Data: "newinstance,ptmxmode=0666,mode=0620,gid=5",
},
{
Device: "tmpfs",
Source: "shm",
Destination: "/dev/shm",
Data: "mode=1777,size=65536k",
Flags: defaultMountFlags,
},
{
Source: "mqueue",
Destination: "/dev/mqueue",
Device: "mqueue",
Flags: defaultMountFlags,
},
{
Source: "sysfs",
Destination: "/sys",
Device: "sysfs",
Flags: defaultMountFlags | syscall.MS_RDONLY,
},
},
UidMappings: []configs.IDMap{
{
ContainerID: 0,
HostID: 1000,
Size: 65536,
},
},
GidMappings: []configs.IDMap{
{
ContainerID: 0,
HostID: 1000,
Size: 65536,
},
},
Networks: []*configs.Network{
{
Type: "loopback",
Address: "127.0.0.1/0",
Gateway: "localhost",
},
},
Rlimits: []configs.Rlimit{
{
Type: syscall.RLIMIT_NOFILE,
Hard: uint64(1025),
Soft: uint64(1025),
},
},
} }
``` ```
Once you have the configuration populated you can create a container: Once you have the configuration populated you can create a container:
```go ```go
container, err := root.Create("container-id", config) container, err := factory.Create("container-id", config)
if err != nil {
logrus.Fatal(err)
return
}
``` ```
To spawn bash as the initial process inside the container and have the To spawn bash as the initial process inside the container and have the
@ -91,23 +176,25 @@ processes pid returned in order to wait, signal, or kill the process:
```go ```go
process := &libcontainer.Process{ process := &libcontainer.Process{
Args: []string{"/bin/bash"}, Args: []string{"/bin/bash"},
Env: []string{"PATH=/bin"}, Env: []string{"PATH=/bin"},
User: "daemon", User: "daemon",
Stdin: os.Stdin, Stdin: os.Stdin,
Stdout: os.Stdout, Stdout: os.Stdout,
Stderr: os.Stderr, Stderr: os.Stderr,
} }
err := container.Start(process) err := container.Start(process)
if err != nil { if err != nil {
log.Fatal(err) logrus.Fatal(err)
container.Destroy()
return
} }
// wait for the process to finish. // wait for the process to finish.
status, err := process.Wait() _, err := process.Wait()
if err != nil { if err != nil {
log.Fatal(err) logrus.Fatal(err)
} }
// destroy the container. // destroy the container.
@ -124,7 +211,6 @@ processes, err := container.Processes()
// it's processes. // it's processes.
stats, err := container.Stats() stats, err := container.Stats()
// pause all processes inside the container. // pause all processes inside the container.
container.Pause() container.Pause()

View File

@ -60,7 +60,7 @@ are required to be mounted within the rootfs that the runtime will setup.
After a container's filesystems are mounted within the newly created After a container's filesystems are mounted within the newly created
mount namespace `/dev` will need to be populated with a set of device nodes. mount namespace `/dev` will need to be populated with a set of device nodes.
It is expected that a rootfs does not need to have any device nodes specified It is expected that a rootfs does not need to have any device nodes specified
for `/dev` witin the rootfs as the container will setup the correct devices for `/dev` within the rootfs as the container will setup the correct devices
that are required for executing a container's process. that are required for executing a container's process.
| Path | Mode | Access | | Path | Mode | Access |

View File

@ -2,10 +2,19 @@
package apparmor package apparmor
import (
"errors"
)
var ErrApparmorNotEnabled = errors.New("apparmor: config provided but apparmor not supported")
func IsEnabled() bool { func IsEnabled() bool {
return false return false
} }
func ApplyProfile(name string) error { func ApplyProfile(name string) error {
if name != "" {
return ErrApparmorNotEnabled
}
return nil return nil
} }

View File

@ -15,6 +15,9 @@ type Manager interface {
// Returns the PIDs inside the cgroup set // Returns the PIDs inside the cgroup set
GetPids() ([]int, error) GetPids() ([]int, error)
// Returns the PIDs inside the cgroup set & all sub-cgroups
GetAllPids() ([]int, error)
// Returns statistics for the cgroup set // Returns statistics for the cgroup set
GetStats() (*Stats, error) GetStats() (*Stats, error)
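
GetAllPids extends the manager interface to cover sub-cgroups as well as the top-level cgroup. A hedged sketch of what collecting pids recursively can look like, walking the hierarchy and reading each cgroup.procs file; this is illustrative only, not the library's actual implementation behind GetAllPids:

```go
package main

import (
	"fmt"
	"io/ioutil"
	"os"
	"path/filepath"
	"strconv"
	"strings"
)

// getAllPids walks a cgroup directory and all of its sub-cgroups,
// accumulating every pid listed in the cgroup.procs files it finds.
func getAllPids(root string) ([]int, error) {
	var pids []int
	err := filepath.Walk(root, func(path string, info os.FileInfo, walkErr error) error {
		if walkErr != nil {
			return walkErr
		}
		if info.IsDir() || filepath.Base(path) != "cgroup.procs" {
			return nil
		}
		data, err := ioutil.ReadFile(path)
		if err != nil {
			return err
		}
		for _, field := range strings.Fields(string(data)) {
			pid, err := strconv.Atoi(field)
			if err != nil {
				return err
			}
			pids = append(pids, pid)
		}
		return nil
	})
	return pids, err
}

func main() {
	// Path is illustrative; any cgroup directory works.
	pids, err := getAllPids("/sys/fs/cgroup/memory/system.slice")
	fmt.Println(len(pids), err)
}
```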

View File

@ -23,6 +23,7 @@ var (
&MemoryGroup{}, &MemoryGroup{},
&CpuGroup{}, &CpuGroup{},
&CpuacctGroup{}, &CpuacctGroup{},
&PidsGroup{},
&BlkioGroup{}, &BlkioGroup{},
&HugetlbGroup{}, &HugetlbGroup{},
&NetClsGroup{}, &NetClsGroup{},
@ -112,6 +113,22 @@ func (m *Manager) Apply(pid int) (err error) {
return err return err
} }
if c.Paths != nil {
paths := make(map[string]string)
for name, path := range c.Paths {
_, err := d.path(name)
if err != nil {
if cgroups.IsNotFound(err) {
continue
}
return err
}
paths[name] = path
}
m.Paths = paths
return cgroups.EnterPid(m.Paths, pid)
}
paths := make(map[string]string) paths := make(map[string]string)
defer func() { defer func() {
if err != nil { if err != nil {
@ -135,17 +152,13 @@ func (m *Manager) Apply(pid int) (err error) {
paths[sys.Name()] = p paths[sys.Name()] = p
} }
m.Paths = paths m.Paths = paths
if paths["cpu"] != "" {
if err := CheckCpushares(paths["cpu"], c.CpuShares); err != nil {
return err
}
}
return nil return nil
} }
func (m *Manager) Destroy() error { func (m *Manager) Destroy() error {
if m.Cgroups.Paths != nil {
return nil
}
m.mu.Lock() m.mu.Lock()
defer m.mu.Unlock() defer m.mu.Unlock()
if err := cgroups.RemovePaths(m.Paths); err != nil { if err := cgroups.RemovePaths(m.Paths); err != nil {
@ -179,15 +192,28 @@ func (m *Manager) GetStats() (*cgroups.Stats, error) {
} }
func (m *Manager) Set(container *configs.Config) error { func (m *Manager) Set(container *configs.Config) error {
for name, path := range m.Paths { for _, sys := range subsystems {
sys, err := subsystems.Get(name) // Generate fake cgroup data.
if err == errSubsystemDoesNotExist || !cgroups.PathExists(path) { d, err := getCgroupData(container.Cgroups, -1)
continue if err != nil {
return err
} }
// Get the path, but don't error out if the cgroup wasn't found.
path, err := d.path(sys.Name())
if err != nil && !cgroups.IsNotFound(err) {
return err
}
if err := sys.Set(path, container.Cgroups); err != nil { if err := sys.Set(path, container.Cgroups); err != nil {
return err return err
} }
} }
if m.Paths["cpu"] != "" {
if err := CheckCpushares(m.Paths["cpu"], container.Cgroups.Resources.CpuShares); err != nil {
return err
}
}
return nil return nil
} }
@ -202,40 +228,78 @@ func (m *Manager) Freeze(state configs.FreezerState) error {
if err != nil { if err != nil {
return err return err
} }
prevState := m.Cgroups.Freezer prevState := m.Cgroups.Resources.Freezer
m.Cgroups.Freezer = state m.Cgroups.Resources.Freezer = state
freezer, err := subsystems.Get("freezer") freezer, err := subsystems.Get("freezer")
if err != nil { if err != nil {
return err return err
} }
err = freezer.Set(dir, m.Cgroups) err = freezer.Set(dir, m.Cgroups)
if err != nil { if err != nil {
m.Cgroups.Freezer = prevState m.Cgroups.Resources.Freezer = prevState
return err return err
} }
return nil return nil
} }
func (m *Manager) GetPids() ([]int, error) { func (m *Manager) GetPids() ([]int, error) {
d, err := getCgroupData(m.Cgroups, 0) dir, err := getCgroupPath(m.Cgroups)
if err != nil { if err != nil {
return nil, err return nil, err
} }
dir, err := d.path("devices")
if err != nil {
return nil, err
}
return cgroups.GetPids(dir) return cgroups.GetPids(dir)
} }
func (m *Manager) GetAllPids() ([]int, error) {
dir, err := getCgroupPath(m.Cgroups)
if err != nil {
return nil, err
}
return cgroups.GetAllPids(dir)
}
func getCgroupPath(c *configs.Cgroup) (string, error) {
d, err := getCgroupData(c, 0)
if err != nil {
return "", err
}
return d.path("devices")
}
// pathClean makes a path safe for use with filepath.Join. This is done by not
// only cleaning the path, but also (if the path is relative) adding a leading
// '/' and cleaning it (then removing the leading '/'). This ensures that a
// path resulting from prepending another path will always resolve to lexically
// be a subdirectory of the prefixed path. This is all done lexically, so paths
// that include symlinks won't be safe as a result of using pathClean.
func pathClean(path string) string {
// Ensure that all paths are cleaned (especially problematic ones like
// "/../../../../../" which can cause lots of issues).
path = filepath.Clean(path)
// If the path isn't absolute, we need to do more processing to fix paths
// such as "../../../../<etc>/some/path". We also shouldn't convert absolute
// paths to relative ones.
if !filepath.IsAbs(path) {
path = filepath.Clean(string(os.PathSeparator) + path)
// This can't fail, as (by definition) all paths are relative to root.
path, _ = filepath.Rel(string(os.PathSeparator), path)
}
// Clean the path again for good measure.
return filepath.Clean(path)
}
func getCgroupData(c *configs.Cgroup, pid int) (*cgroupData, error) { func getCgroupData(c *configs.Cgroup, pid int) (*cgroupData, error) {
root, err := getCgroupRoot() root, err := getCgroupRoot()
if err != nil { if err != nil {
return nil, err return nil, err
} }
// Clean the parent slice path.
c.Parent = pathClean(c.Parent)
return &cgroupData{ return &cgroupData{
root: root, root: root,
parent: c.Parent, parent: c.Parent,
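
The new Paths branch in Apply is what lets a caller attach a pid to cgroups that already exist instead of creating fresh ones, and the matching change to Destroy leaves those cgroups in place. A hedged usage sketch under those assumptions; the paths below are illustrative:

```go
package main

import (
	"log"
	"os"

	"github.com/opencontainers/runc/libcontainer/cgroups/fs"
	"github.com/opencontainers/runc/libcontainer/configs"
)

func main() {
	// Illustrative pre-existing cgroup paths, keyed by subsystem name.
	cg := &configs.Cgroup{
		Paths: map[string]string{
			"cpu":    "/sys/fs/cgroup/cpu/kubelet",
			"memory": "/sys/fs/cgroup/memory/kubelet",
		},
	}
	m := &fs.Manager{Cgroups: cg}

	// Per the Apply change above, this only enters the pid into the
	// existing paths (cgroups.EnterPid); nothing new is created.
	if err := m.Apply(os.Getpid()); err != nil {
		log.Fatal(err)
	}

	// Per the Destroy change above, this is a no-op when Paths is set,
	// so the pre-existing cgroups are left untouched.
	_ = m.Destroy()
}
```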

View File

@ -22,31 +22,26 @@ func (s *BlkioGroup) Name() string {
} }
func (s *BlkioGroup) Apply(d *cgroupData) error { func (s *BlkioGroup) Apply(d *cgroupData) error {
dir, err := d.join("blkio") _, err := d.join("blkio")
if err != nil && !cgroups.IsNotFound(err) { if err != nil && !cgroups.IsNotFound(err) {
return err return err
} }
if err := s.Set(dir, d.config); err != nil {
return err
}
return nil return nil
} }
func (s *BlkioGroup) Set(path string, cgroup *configs.Cgroup) error { func (s *BlkioGroup) Set(path string, cgroup *configs.Cgroup) error {
if cgroup.BlkioWeight != 0 { if cgroup.Resources.BlkioWeight != 0 {
if err := writeFile(path, "blkio.weight", strconv.FormatUint(uint64(cgroup.BlkioWeight), 10)); err != nil { if err := writeFile(path, "blkio.weight", strconv.FormatUint(uint64(cgroup.Resources.BlkioWeight), 10)); err != nil {
return err return err
} }
} }
if cgroup.BlkioLeafWeight != 0 { if cgroup.Resources.BlkioLeafWeight != 0 {
if err := writeFile(path, "blkio.leaf_weight", strconv.FormatUint(uint64(cgroup.BlkioLeafWeight), 10)); err != nil { if err := writeFile(path, "blkio.leaf_weight", strconv.FormatUint(uint64(cgroup.Resources.BlkioLeafWeight), 10)); err != nil {
return err return err
} }
} }
for _, wd := range cgroup.BlkioWeightDevice { for _, wd := range cgroup.Resources.BlkioWeightDevice {
if err := writeFile(path, "blkio.weight_device", wd.WeightString()); err != nil { if err := writeFile(path, "blkio.weight_device", wd.WeightString()); err != nil {
return err return err
} }
@ -54,22 +49,22 @@ func (s *BlkioGroup) Set(path string, cgroup *configs.Cgroup) error {
return err return err
} }
} }
for _, td := range cgroup.BlkioThrottleReadBpsDevice { for _, td := range cgroup.Resources.BlkioThrottleReadBpsDevice {
if err := writeFile(path, "blkio.throttle.read_bps_device", td.String()); err != nil { if err := writeFile(path, "blkio.throttle.read_bps_device", td.String()); err != nil {
return err return err
} }
} }
for _, td := range cgroup.BlkioThrottleWriteBpsDevice { for _, td := range cgroup.Resources.BlkioThrottleWriteBpsDevice {
if err := writeFile(path, "blkio.throttle.write_bps_device", td.String()); err != nil { if err := writeFile(path, "blkio.throttle.write_bps_device", td.String()); err != nil {
return err return err
} }
} }
for _, td := range cgroup.BlkioThrottleReadIOPSDevice { for _, td := range cgroup.Resources.BlkioThrottleReadIOPSDevice {
if err := writeFile(path, "blkio.throttle.read_iops_device", td.String()); err != nil { if err := writeFile(path, "blkio.throttle.read_iops_device", td.String()); err != nil {
return err return err
} }
} }
for _, td := range cgroup.BlkioThrottleWriteIOPSDevice { for _, td := range cgroup.Resources.BlkioThrottleWriteIOPSDevice {
if err := writeFile(path, "blkio.throttle.write_iops_device", td.String()); err != nil { if err := writeFile(path, "blkio.throttle.write_iops_device", td.String()); err != nil {
return err return err
} }

View File

@ -22,41 +22,36 @@ func (s *CpuGroup) Name() string {
func (s *CpuGroup) Apply(d *cgroupData) error { func (s *CpuGroup) Apply(d *cgroupData) error {
// We always want to join the cpu group, to allow fair cpu scheduling // We always want to join the cpu group, to allow fair cpu scheduling
// on a container basis // on a container basis
dir, err := d.join("cpu") _, err := d.join("cpu")
if err != nil && !cgroups.IsNotFound(err) { if err != nil && !cgroups.IsNotFound(err) {
return err return err
} }
if err := s.Set(dir, d.config); err != nil {
return err
}
return nil return nil
} }
func (s *CpuGroup) Set(path string, cgroup *configs.Cgroup) error { func (s *CpuGroup) Set(path string, cgroup *configs.Cgroup) error {
if cgroup.CpuShares != 0 { if cgroup.Resources.CpuShares != 0 {
if err := writeFile(path, "cpu.shares", strconv.FormatInt(cgroup.CpuShares, 10)); err != nil { if err := writeFile(path, "cpu.shares", strconv.FormatInt(cgroup.Resources.CpuShares, 10)); err != nil {
return err return err
} }
} }
if cgroup.CpuPeriod != 0 { if cgroup.Resources.CpuPeriod != 0 {
if err := writeFile(path, "cpu.cfs_period_us", strconv.FormatInt(cgroup.CpuPeriod, 10)); err != nil { if err := writeFile(path, "cpu.cfs_period_us", strconv.FormatInt(cgroup.Resources.CpuPeriod, 10)); err != nil {
return err return err
} }
} }
if cgroup.CpuQuota != 0 { if cgroup.Resources.CpuQuota != 0 {
if err := writeFile(path, "cpu.cfs_quota_us", strconv.FormatInt(cgroup.CpuQuota, 10)); err != nil { if err := writeFile(path, "cpu.cfs_quota_us", strconv.FormatInt(cgroup.Resources.CpuQuota, 10)); err != nil {
return err return err
} }
} }
if cgroup.CpuRtPeriod != 0 { if cgroup.Resources.CpuRtPeriod != 0 {
if err := writeFile(path, "cpu.rt_period_us", strconv.FormatInt(cgroup.CpuRtPeriod, 10)); err != nil { if err := writeFile(path, "cpu.rt_period_us", strconv.FormatInt(cgroup.Resources.CpuRtPeriod, 10)); err != nil {
return err return err
} }
} }
if cgroup.CpuRtRuntime != 0 { if cgroup.Resources.CpuRtRuntime != 0 {
if err := writeFile(path, "cpu.rt_runtime_us", strconv.FormatInt(cgroup.CpuRtRuntime, 10)); err != nil { if err := writeFile(path, "cpu.rt_runtime_us", strconv.FormatInt(cgroup.Resources.CpuRtRuntime, 10)); err != nil {
return err return err
} }
} }

View File

@ -4,6 +4,7 @@ package fs
import ( import (
"bytes" "bytes"
"fmt"
"io/ioutil" "io/ioutil"
"os" "os"
"path/filepath" "path/filepath"
@ -29,13 +30,13 @@ func (s *CpusetGroup) Apply(d *cgroupData) error {
} }
func (s *CpusetGroup) Set(path string, cgroup *configs.Cgroup) error { func (s *CpusetGroup) Set(path string, cgroup *configs.Cgroup) error {
if cgroup.CpusetCpus != "" { if cgroup.Resources.CpusetCpus != "" {
if err := writeFile(path, "cpuset.cpus", cgroup.CpusetCpus); err != nil { if err := writeFile(path, "cpuset.cpus", cgroup.Resources.CpusetCpus); err != nil {
return err return err
} }
} }
if cgroup.CpusetMems != "" { if cgroup.Resources.CpusetMems != "" {
if err := writeFile(path, "cpuset.mems", cgroup.CpusetMems); err != nil { if err := writeFile(path, "cpuset.mems", cgroup.Resources.CpusetMems); err != nil {
return err return err
} }
} }
@ -63,11 +64,6 @@ func (s *CpusetGroup) ApplyDir(dir string, cgroup *configs.Cgroup, pid int) erro
if err := s.ensureParent(dir, root); err != nil { if err := s.ensureParent(dir, root); err != nil {
return err return err
} }
// the default values inherit from parent cgroup are already set in
// s.ensureParent, cover these if we have our own
if err := s.Set(dir, cgroup); err != nil {
return err
}
// because we are not using d.join we need to place the pid into the procs file // because we are not using d.join we need to place the pid into the procs file
// unlike the other subsystems // unlike the other subsystems
if err := writeFile(dir, "cgroup.procs", strconv.Itoa(pid)); err != nil { if err := writeFile(dir, "cgroup.procs", strconv.Itoa(pid)); err != nil {
@ -95,6 +91,10 @@ func (s *CpusetGroup) ensureParent(current, root string) error {
if filepath.Clean(parent) == root { if filepath.Clean(parent) == root {
return nil return nil
} }
// Avoid infinite recursion.
if parent == current {
return fmt.Errorf("cpuset: cgroup parent path outside cgroup root")
}
if err := s.ensureParent(parent, root); err != nil { if err := s.ensureParent(parent, root); err != nil {
return err return err
} }

View File

@ -15,27 +15,22 @@ func (s *DevicesGroup) Name() string {
} }
func (s *DevicesGroup) Apply(d *cgroupData) error { func (s *DevicesGroup) Apply(d *cgroupData) error {
dir, err := d.join("devices") _, err := d.join("devices")
if err != nil { if err != nil {
// We will return error even it's `not found` error, devices // We will return error even it's `not found` error, devices
// cgroup is hard requirement for container's security. // cgroup is hard requirement for container's security.
return err return err
} }
if err := s.Set(dir, d.config); err != nil {
return err
}
return nil return nil
} }
func (s *DevicesGroup) Set(path string, cgroup *configs.Cgroup) error { func (s *DevicesGroup) Set(path string, cgroup *configs.Cgroup) error {
if !cgroup.AllowAllDevices { if !cgroup.Resources.AllowAllDevices {
if err := writeFile(path, "devices.deny", "a"); err != nil { if err := writeFile(path, "devices.deny", "a"); err != nil {
return err return err
} }
for _, dev := range cgroup.AllowedDevices { for _, dev := range cgroup.Resources.AllowedDevices {
if err := writeFile(path, "devices.allow", dev.CgroupString()); err != nil { if err := writeFile(path, "devices.allow", dev.CgroupString()); err != nil {
return err return err
} }
@ -47,7 +42,7 @@ func (s *DevicesGroup) Set(path string, cgroup *configs.Cgroup) error {
return err return err
} }
for _, dev := range cgroup.DeniedDevices { for _, dev := range cgroup.Resources.DeniedDevices {
if err := writeFile(path, "devices.deny", dev.CgroupString()); err != nil { if err := writeFile(path, "devices.deny", dev.CgroupString()); err != nil {
return err return err
} }

View File

@ -19,22 +19,17 @@ func (s *FreezerGroup) Name() string {
} }
func (s *FreezerGroup) Apply(d *cgroupData) error { func (s *FreezerGroup) Apply(d *cgroupData) error {
dir, err := d.join("freezer") _, err := d.join("freezer")
if err != nil && !cgroups.IsNotFound(err) { if err != nil && !cgroups.IsNotFound(err) {
return err return err
} }
if err := s.Set(dir, d.config); err != nil {
return err
}
return nil return nil
} }
func (s *FreezerGroup) Set(path string, cgroup *configs.Cgroup) error { func (s *FreezerGroup) Set(path string, cgroup *configs.Cgroup) error {
switch cgroup.Freezer { switch cgroup.Resources.Freezer {
case configs.Frozen, configs.Thawed: case configs.Frozen, configs.Thawed:
if err := writeFile(path, "freezer.state", string(cgroup.Freezer)); err != nil { if err := writeFile(path, "freezer.state", string(cgroup.Resources.Freezer)); err != nil {
return err return err
} }
@ -43,7 +38,7 @@ func (s *FreezerGroup) Set(path string, cgroup *configs.Cgroup) error {
if err != nil { if err != nil {
return err return err
} }
if strings.TrimSpace(state) == string(cgroup.Freezer) { if strings.TrimSpace(state) == string(cgroup.Resources.Freezer) {
break break
} }
time.Sleep(1 * time.Millisecond) time.Sleep(1 * time.Millisecond)
@ -51,7 +46,7 @@ func (s *FreezerGroup) Set(path string, cgroup *configs.Cgroup) error {
case configs.Undefined: case configs.Undefined:
return nil return nil
default: default:
return fmt.Errorf("Invalid argument '%s' to freezer.state", string(cgroup.Freezer)) return fmt.Errorf("Invalid argument '%s' to freezer.state", string(cgroup.Resources.Freezer))
} }
return nil return nil

View File

@ -19,20 +19,15 @@ func (s *HugetlbGroup) Name() string {
} }
func (s *HugetlbGroup) Apply(d *cgroupData) error { func (s *HugetlbGroup) Apply(d *cgroupData) error {
dir, err := d.join("hugetlb") _, err := d.join("hugetlb")
if err != nil && !cgroups.IsNotFound(err) { if err != nil && !cgroups.IsNotFound(err) {
return err return err
} }
if err := s.Set(dir, d.config); err != nil {
return err
}
return nil return nil
} }
func (s *HugetlbGroup) Set(path string, cgroup *configs.Cgroup) error { func (s *HugetlbGroup) Set(path string, cgroup *configs.Cgroup) error {
for _, hugetlb := range cgroup.HugetlbLimit { for _, hugetlb := range cgroup.Resources.HugetlbLimit {
if err := writeFile(path, strings.Join([]string{"hugetlb", hugetlb.Pagesize, "limit_in_bytes"}, "."), strconv.FormatUint(hugetlb.Limit, 10)); err != nil { if err := writeFile(path, strings.Join([]string{"hugetlb", hugetlb.Pagesize, "limit_in_bytes"}, "."), strconv.FormatUint(hugetlb.Limit, 10)); err != nil {
return err return err
} }

View File

@ -32,8 +32,9 @@ func (s *MemoryGroup) Apply(d *cgroupData) (err error) {
return err return err
} }
} }
// We have to set kernel memory here, as we can't change it once
if err := s.Set(path, d.config); err != nil { // processes have been attached.
if err := s.SetKernelMemory(path, d.config); err != nil {
return err return err
} }
} }
@ -50,45 +51,49 @@ func (s *MemoryGroup) Apply(d *cgroupData) (err error) {
if err != nil && !cgroups.IsNotFound(err) { if err != nil && !cgroups.IsNotFound(err) {
return err return err
} }
return nil
}
func (s *MemoryGroup) SetKernelMemory(path string, cgroup *configs.Cgroup) error {
// This has to be done separately because it has special constraints (it
// can't be done after there are processes attached to the cgroup).
if cgroup.Resources.KernelMemory > 0 {
if err := writeFile(path, "memory.kmem.limit_in_bytes", strconv.FormatInt(cgroup.Resources.KernelMemory, 10)); err != nil {
return err
}
}
return nil return nil
} }
func (s *MemoryGroup) Set(path string, cgroup *configs.Cgroup) error { func (s *MemoryGroup) Set(path string, cgroup *configs.Cgroup) error {
if cgroup.Memory != 0 { if cgroup.Resources.Memory != 0 {
if err := writeFile(path, "memory.limit_in_bytes", strconv.FormatInt(cgroup.Memory, 10)); err != nil { if err := writeFile(path, "memory.limit_in_bytes", strconv.FormatInt(cgroup.Resources.Memory, 10)); err != nil {
return err return err
} }
} }
if cgroup.MemoryReservation != 0 { if cgroup.Resources.MemoryReservation != 0 {
if err := writeFile(path, "memory.soft_limit_in_bytes", strconv.FormatInt(cgroup.MemoryReservation, 10)); err != nil { if err := writeFile(path, "memory.soft_limit_in_bytes", strconv.FormatInt(cgroup.Resources.MemoryReservation, 10)); err != nil {
return err return err
} }
} }
if cgroup.MemorySwap > 0 { if cgroup.Resources.MemorySwap > 0 {
if err := writeFile(path, "memory.memsw.limit_in_bytes", strconv.FormatInt(cgroup.MemorySwap, 10)); err != nil { if err := writeFile(path, "memory.memsw.limit_in_bytes", strconv.FormatInt(cgroup.Resources.MemorySwap, 10)); err != nil {
return err return err
} }
} }
if cgroup.KernelMemory > 0 { if cgroup.Resources.OomKillDisable {
if err := writeFile(path, "memory.kmem.limit_in_bytes", strconv.FormatInt(cgroup.KernelMemory, 10)); err != nil {
return err
}
}
if cgroup.OomKillDisable {
if err := writeFile(path, "memory.oom_control", "1"); err != nil { if err := writeFile(path, "memory.oom_control", "1"); err != nil {
return err return err
} }
} }
if cgroup.MemorySwappiness >= 0 && cgroup.MemorySwappiness <= 100 { if cgroup.Resources.MemorySwappiness >= 0 && cgroup.Resources.MemorySwappiness <= 100 {
if err := writeFile(path, "memory.swappiness", strconv.FormatInt(cgroup.MemorySwappiness, 10)); err != nil { if err := writeFile(path, "memory.swappiness", strconv.FormatInt(cgroup.Resources.MemorySwappiness, 10)); err != nil {
return err return err
} }
} else if cgroup.MemorySwappiness == -1 { } else if cgroup.Resources.MemorySwappiness == -1 {
return nil return nil
} else { } else {
return fmt.Errorf("invalid value:%d. valid memory swappiness range is 0-100", cgroup.MemorySwappiness) return fmt.Errorf("invalid value:%d. valid memory swappiness range is 0-100", cgroup.Resources.MemorySwappiness)
} }
return nil return nil
@ -139,12 +144,12 @@ func (s *MemoryGroup) GetStats(path string, stats *cgroups.Stats) error {
} }
func memoryAssigned(cgroup *configs.Cgroup) bool { func memoryAssigned(cgroup *configs.Cgroup) bool {
return cgroup.Memory != 0 || return cgroup.Resources.Memory != 0 ||
cgroup.MemoryReservation != 0 || cgroup.Resources.MemoryReservation != 0 ||
cgroup.MemorySwap > 0 || cgroup.Resources.MemorySwap > 0 ||
cgroup.KernelMemory > 0 || cgroup.Resources.KernelMemory > 0 ||
cgroup.OomKillDisable || cgroup.Resources.OomKillDisable ||
cgroup.MemorySwappiness != -1 cgroup.Resources.MemorySwappiness != -1
} }
func getMemoryData(path, name string) (cgroups.MemoryData, error) { func getMemoryData(path, name string) (cgroups.MemoryData, error) {

View File

@ -15,21 +15,16 @@ func (s *NetClsGroup) Name() string {
} }
func (s *NetClsGroup) Apply(d *cgroupData) error { func (s *NetClsGroup) Apply(d *cgroupData) error {
dir, err := d.join("net_cls") _, err := d.join("net_cls")
if err != nil && !cgroups.IsNotFound(err) { if err != nil && !cgroups.IsNotFound(err) {
return err return err
} }
if err := s.Set(dir, d.config); err != nil {
return err
}
return nil return nil
} }
func (s *NetClsGroup) Set(path string, cgroup *configs.Cgroup) error { func (s *NetClsGroup) Set(path string, cgroup *configs.Cgroup) error {
if cgroup.NetClsClassid != "" { if cgroup.Resources.NetClsClassid != "" {
if err := writeFile(path, "net_cls.classid", cgroup.NetClsClassid); err != nil { if err := writeFile(path, "net_cls.classid", cgroup.Resources.NetClsClassid); err != nil {
return err return err
} }
} }

View File

@ -15,20 +15,15 @@ func (s *NetPrioGroup) Name() string {
} }
func (s *NetPrioGroup) Apply(d *cgroupData) error { func (s *NetPrioGroup) Apply(d *cgroupData) error {
dir, err := d.join("net_prio") _, err := d.join("net_prio")
if err != nil && !cgroups.IsNotFound(err) { if err != nil && !cgroups.IsNotFound(err) {
return err return err
} }
if err := s.Set(dir, d.config); err != nil {
return err
}
return nil return nil
} }
func (s *NetPrioGroup) Set(path string, cgroup *configs.Cgroup) error { func (s *NetPrioGroup) Set(path string, cgroup *configs.Cgroup) error {
for _, prioMap := range cgroup.NetPrioIfpriomap { for _, prioMap := range cgroup.Resources.NetPrioIfpriomap {
if err := writeFile(path, "net_prio.ifpriomap", prioMap.CgroupString()); err != nil { if err := writeFile(path, "net_prio.ifpriomap", prioMap.CgroupString()); err != nil {
return err return err
} }

View File

@ -0,0 +1,57 @@
// +build linux
package fs
import (
"fmt"
"strconv"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
)
type PidsGroup struct {
}
func (s *PidsGroup) Name() string {
return "pids"
}
func (s *PidsGroup) Apply(d *cgroupData) error {
_, err := d.join("pids")
if err != nil && !cgroups.IsNotFound(err) {
return err
}
return nil
}
func (s *PidsGroup) Set(path string, cgroup *configs.Cgroup) error {
if cgroup.Resources.PidsLimit != 0 {
// "max" is the fallback value.
limit := "max"
if cgroup.Resources.PidsLimit > 0 {
limit = strconv.FormatInt(cgroup.Resources.PidsLimit, 10)
}
if err := writeFile(path, "pids.max", limit); err != nil {
return err
}
}
return nil
}
func (s *PidsGroup) Remove(d *cgroupData) error {
return removePath(d.path("pids"))
}
func (s *PidsGroup) GetStats(path string, stats *cgroups.Stats) error {
value, err := getCgroupParamUint(path, "pids.current")
if err != nil {
return fmt.Errorf("failed to parse pids.current - %s", err)
}
stats.PidsStats.Current = value
return nil
}
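
The new pids controller is driven by a single Resources.PidsLimit value. A hedged sketch of setting it; per the Set method above, a positive value is written to pids.max verbatim and a negative value falls back to the literal "max":

```go
package main

import (
	"fmt"

	"github.com/opencontainers/runc/libcontainer/configs"
)

func main() {
	cg := &configs.Cgroup{
		Name:   "test-container",
		Parent: "system",
		Resources: &configs.Resources{
			// Written as "1024" to pids.max; a negative value would
			// instead write the literal "max" (no limit).
			PidsLimit: 1024,
		},
	}
	fmt.Println(cg.Name, cg.Resources.PidsLimit)
}
```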

View File

@ -49,6 +49,11 @@ type MemoryStats struct {
Stats map[string]uint64 `json:"stats,omitempty"` Stats map[string]uint64 `json:"stats,omitempty"`
} }
type PidsStats struct {
// number of pids in the cgroup
Current uint64 `json:"current,omitempty"`
}
type BlkioStatEntry struct { type BlkioStatEntry struct {
Major uint64 `json:"major,omitempty"` Major uint64 `json:"major,omitempty"`
Minor uint64 `json:"minor,omitempty"` Minor uint64 `json:"minor,omitempty"`
@ -80,6 +85,7 @@ type HugetlbStats struct {
type Stats struct { type Stats struct {
CpuStats CpuStats `json:"cpu_stats,omitempty"` CpuStats CpuStats `json:"cpu_stats,omitempty"`
MemoryStats MemoryStats `json:"memory_stats,omitempty"` MemoryStats MemoryStats `json:"memory_stats,omitempty"`
PidsStats PidsStats `json:"pids_stats,omitempty"`
BlkioStats BlkioStats `json:"blkio_stats,omitempty"` BlkioStats BlkioStats `json:"blkio_stats,omitempty"`
// the map is in the format "size of hugepage: stats of the hugepage" // the map is in the format "size of hugepage: stats of the hugepage"
HugetlbStats map[string]HugetlbStats `json:"hugetlb_stats,omitempty"` HugetlbStats map[string]HugetlbStats `json:"hugetlb_stats,omitempty"`

View File

@ -26,6 +26,10 @@ func (m *Manager) GetPids() ([]int, error) {
return nil, fmt.Errorf("Systemd not supported") return nil, fmt.Errorf("Systemd not supported")
} }
func (m *Manager) GetAllPids() ([]int, error) {
return nil, fmt.Errorf("Systemd not supported")
}
func (m *Manager) Destroy() error { func (m *Manager) Destroy() error {
return fmt.Errorf("Systemd not supported") return fmt.Errorf("Systemd not supported")
} }

View File

@ -55,6 +55,7 @@ var subsystems = subsystemSet{
&fs.MemoryGroup{}, &fs.MemoryGroup{},
&fs.CpuGroup{}, &fs.CpuGroup{},
&fs.CpuacctGroup{}, &fs.CpuacctGroup{},
&fs.PidsGroup{},
&fs.BlkioGroup{}, &fs.BlkioGroup{},
&fs.HugetlbGroup{}, &fs.HugetlbGroup{},
&fs.PerfEventGroup{}, &fs.PerfEventGroup{},
@ -167,8 +168,25 @@ func (m *Manager) Apply(pid int) error {
properties []systemdDbus.Property properties []systemdDbus.Property
) )
if c.Slice != "" { if c.Paths != nil {
slice = c.Slice paths := make(map[string]string)
for name, path := range c.Paths {
_, err := getSubsystemPath(m.Cgroups, name)
if err != nil {
// Don't fail if a cgroup hierarchy was not found, just skip this subsystem
if cgroups.IsNotFound(err) {
continue
}
return err
}
paths[name] = path
}
m.Paths = paths
return cgroups.EnterPid(m.Paths, pid)
}
if c.Parent != "" {
slice = c.Parent
} }
properties = append(properties, properties = append(properties,
@ -189,26 +207,26 @@ func (m *Manager) Apply(pid int) error {
newProp("DefaultDependencies", false)) newProp("DefaultDependencies", false))
} }
if c.Memory != 0 { if c.Resources.Memory != 0 {
properties = append(properties, properties = append(properties,
newProp("MemoryLimit", uint64(c.Memory))) newProp("MemoryLimit", uint64(c.Resources.Memory)))
} }
if c.CpuShares != 0 { if c.Resources.CpuShares != 0 {
properties = append(properties, properties = append(properties,
newProp("CPUShares", uint64(c.CpuShares))) newProp("CPUShares", uint64(c.Resources.CpuShares)))
} }
if c.BlkioWeight != 0 { if c.Resources.BlkioWeight != 0 {
properties = append(properties, properties = append(properties,
newProp("BlockIOWeight", uint64(c.BlkioWeight))) newProp("BlockIOWeight", uint64(c.Resources.BlkioWeight)))
} }
// We need to set kernel memory before processes join cgroup because // We need to set kernel memory before processes join cgroup because
// kmem.limit_in_bytes can only be set when the cgroup is empty. // kmem.limit_in_bytes can only be set when the cgroup is empty.
// And swap memory limit needs to be set after memory limit, only // And swap memory limit needs to be set after memory limit, only
// memory limit is handled by systemd, so it's kind of ugly here. // memory limit is handled by systemd, so it's kind of ugly here.
if c.KernelMemory > 0 { if c.Resources.KernelMemory > 0 {
if err := setKernelMemory(c); err != nil { if err := setKernelMemory(c); err != nil {
return err return err
} }
@ -233,7 +251,7 @@ func (m *Manager) Apply(pid int) error {
return err return err
} }
// we need to manually join the freezer, net_cls, net_prio and cpuset cgroup in systemd // we need to manually join the freezer, net_cls, net_prio, pids and cpuset cgroup in systemd
// because it does not currently support it via the dbus api. // because it does not currently support it via the dbus api.
if err := joinFreezer(c, pid); err != nil { if err := joinFreezer(c, pid); err != nil {
return err return err
@ -246,6 +264,10 @@ func (m *Manager) Apply(pid int) error {
return err return err
} }
if err := joinPids(c, pid); err != nil {
return err
}
if err := joinCpuset(c, pid); err != nil { if err := joinCpuset(c, pid); err != nil {
return err return err
} }
@ -277,17 +299,13 @@ func (m *Manager) Apply(pid int) error {
paths[s.Name()] = subsystemPath paths[s.Name()] = subsystemPath
} }
m.Paths = paths m.Paths = paths
if paths["cpu"] != "" {
if err := fs.CheckCpushares(paths["cpu"], c.CpuShares); err != nil {
return err
}
}
return nil return nil
} }
func (m *Manager) Destroy() error { func (m *Manager) Destroy() error {
if m.Cgroups.Paths != nil {
return nil
}
m.mu.Lock() m.mu.Lock()
defer m.mu.Unlock() defer m.mu.Unlock()
theConn.StopUnit(getUnitName(m.Cgroups), "replace", nil) theConn.StopUnit(getUnitName(m.Cgroups), "replace", nil)
@ -330,68 +348,65 @@ func join(c *configs.Cgroup, subsystem string, pid int) (string, error) {
} }
func joinCpu(c *configs.Cgroup, pid int) error { func joinCpu(c *configs.Cgroup, pid int) error {
path, err := getSubsystemPath(c, "cpu") _, err := join(c, "cpu", pid)
if err != nil && !cgroups.IsNotFound(err) { if err != nil && !cgroups.IsNotFound(err) {
return err return err
} }
if c.CpuQuota != 0 {
if err = writeFile(path, "cpu.cfs_quota_us", strconv.FormatInt(c.CpuQuota, 10)); err != nil {
return err
}
}
if c.CpuPeriod != 0 {
if err = writeFile(path, "cpu.cfs_period_us", strconv.FormatInt(c.CpuPeriod, 10)); err != nil {
return err
}
}
if c.CpuRtPeriod != 0 {
if err = writeFile(path, "cpu.rt_period_us", strconv.FormatInt(c.CpuRtPeriod, 10)); err != nil {
return err
}
}
if c.CpuRtRuntime != 0 {
if err = writeFile(path, "cpu.rt_runtime_us", strconv.FormatInt(c.CpuRtRuntime, 10)); err != nil {
return err
}
}
return nil return nil
} }
func joinFreezer(c *configs.Cgroup, pid int) error { func joinFreezer(c *configs.Cgroup, pid int) error {
path, err := join(c, "freezer", pid) _, err := join(c, "freezer", pid)
if err != nil && !cgroups.IsNotFound(err) { if err != nil && !cgroups.IsNotFound(err) {
return err return err
} }
freezer, err := subsystems.Get("freezer") return nil
if err != nil {
return err
}
return freezer.Set(path, c)
} }
func joinNetPrio(c *configs.Cgroup, pid int) error { func joinNetPrio(c *configs.Cgroup, pid int) error {
path, err := join(c, "net_prio", pid) _, err := join(c, "net_prio", pid)
if err != nil && !cgroups.IsNotFound(err) { if err != nil && !cgroups.IsNotFound(err) {
return err return err
} }
netPrio, err := subsystems.Get("net_prio") return nil
if err != nil {
return err
}
return netPrio.Set(path, c)
} }
func joinNetCls(c *configs.Cgroup, pid int) error { func joinNetCls(c *configs.Cgroup, pid int) error {
path, err := join(c, "net_cls", pid) _, err := join(c, "net_cls", pid)
if err != nil && !cgroups.IsNotFound(err) { if err != nil && !cgroups.IsNotFound(err) {
return err return err
} }
netcls, err := subsystems.Get("net_cls") return nil
if err != nil { }
func joinPids(c *configs.Cgroup, pid int) error {
_, err := join(c, "pids", pid)
if err != nil && !cgroups.IsNotFound(err) {
return err return err
} }
return netcls.Set(path, c) return nil
}
// systemd represents slice hierarchy using `-`, so we need to follow suit when
// generating the path of a slice. Essentially, test-a-b.slice becomes
// test.slice/test-a.slice/test-a-b.slice.
func expandSlice(slice string) (string, error) {
suffix := ".slice"
sliceName := strings.TrimSuffix(slice, suffix)
var path, prefix string
for _, component := range strings.Split(sliceName, "-") {
// test--a.slice isn't permitted, nor is -test.slice.
if component == "" {
return "", fmt.Errorf("invalid slice name: %s", slice)
}
// Append the component to the path and to the prefix.
path += prefix + component + suffix + "/"
prefix += component + "-"
}
return path, nil
} }
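As a quick illustration of the slice expansion described above (a self-contained sketch, not part of this commit), the same split-and-accumulate logic turns a dashed slice name into its nested path:
package main
import (
	"fmt"
	"strings"
)
// Minimal sketch of the expansion performed by expandSlice above:
// test-a-b.slice becomes test.slice/test-a.slice/test-a-b.slice/.
func main() {
	slice := "test-a-b.slice"
	suffix := ".slice"
	var path, prefix string
	for _, component := range strings.Split(strings.TrimSuffix(slice, suffix), "-") {
		path += prefix + component + suffix + "/"
		prefix += component + "-"
	}
	fmt.Println(path) // test.slice/test-a.slice/test-a-b.slice/
}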
func getSubsystemPath(c *configs.Cgroup, subsystem string) (string, error) { func getSubsystemPath(c *configs.Cgroup, subsystem string) (string, error) {
@ -406,8 +421,13 @@ func getSubsystemPath(c *configs.Cgroup, subsystem string) (string, error) {
} }
slice := "system.slice" slice := "system.slice"
if c.Slice != "" { if c.Parent != "" {
slice = c.Slice slice = c.Parent
}
slice, err = expandSlice(slice)
if err != nil {
return "", err
} }
return filepath.Join(mountpoint, initPath, slice, getUnitName(c)), nil return filepath.Join(mountpoint, initPath, slice, getUnitName(c)), nil
@ -418,15 +438,15 @@ func (m *Manager) Freeze(state configs.FreezerState) error {
if err != nil { if err != nil {
return err return err
} }
prevState := m.Cgroups.Freezer prevState := m.Cgroups.Resources.Freezer
m.Cgroups.Freezer = state m.Cgroups.Resources.Freezer = state
freezer, err := subsystems.Get("freezer") freezer, err := subsystems.Get("freezer")
if err != nil { if err != nil {
return err return err
} }
err = freezer.Set(path, m.Cgroups) err = freezer.Set(path, m.Cgroups)
if err != nil { if err != nil {
m.Cgroups.Freezer = prevState m.Cgroups.Resources.Freezer = prevState
return err return err
} }
return nil return nil
@ -440,6 +460,14 @@ func (m *Manager) GetPids() ([]int, error) {
return cgroups.GetPids(path) return cgroups.GetPids(path)
} }
func (m *Manager) GetAllPids() ([]int, error) {
path, err := getSubsystemPath(m.Cgroups, "devices")
if err != nil {
return nil, err
}
return cgroups.GetAllPids(path)
}
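A hedged usage sketch of the GetPids/GetAllPids split (m stands for a cgroups Manager such as the one above; the variable names are illustrative): GetPids is now limited to the cgroup itself, while GetAllPids also walks sub-cgroups, which is what Processes() switches to later in this diff.
// Illustrative only; assumes m is a cgroups Manager as defined in this package.
direct, err := m.GetPids() // pids attached directly to the container's cgroup
if err != nil {
	return err
}
all, err := m.GetAllPids() // pids in the cgroup and in all of its sub-cgroups
if err != nil {
	return err
}
_, _ = direct, all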
func (m *Manager) GetStats() (*cgroups.Stats, error) { func (m *Manager) GetStats() (*cgroups.Stats, error) {
m.mu.Lock() m.mu.Lock()
defer m.mu.Unlock() defer m.mu.Unlock()
@ -458,21 +486,28 @@ func (m *Manager) GetStats() (*cgroups.Stats, error) {
} }
func (m *Manager) Set(container *configs.Config) error { func (m *Manager) Set(container *configs.Config) error {
for name, path := range m.Paths { for _, sys := range subsystems {
sys, err := subsystems.Get(name) // Get the subsystem path, but don't error out for not found cgroups.
if err == errSubsystemDoesNotExist || !cgroups.PathExists(path) { path, err := getSubsystemPath(container.Cgroups, sys.Name())
continue if err != nil && !cgroups.IsNotFound(err) {
return err
} }
if err := sys.Set(path, container.Cgroups); err != nil { if err := sys.Set(path, container.Cgroups); err != nil {
return err return err
} }
} }
if m.Paths["cpu"] != "" {
if err := fs.CheckCpushares(m.Paths["cpu"], container.Cgroups.Resources.CpuShares); err != nil {
return err
}
}
return nil return nil
} }
func getUnitName(c *configs.Cgroup) string { func getUnitName(c *configs.Cgroup) string {
return fmt.Sprintf("%s-%s.scope", c.Parent, c.Name) return fmt.Sprintf("%s-%s.scope", c.ScopePrefix, c.Name)
} }
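For illustration only (the docker-style names below are hypothetical, not taken from this commit): with Parent now naming the slice and ScopePrefix naming the scope prefix, a container would end up under a path like system.slice/docker-abc123.scope.
// Hypothetical values, for illustration only.
c := &configs.Cgroup{Parent: "system.slice", ScopePrefix: "docker", Name: "abc123"}
fmt.Println(getUnitName(c)) // docker-abc123.scope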
// Atm we can't use the systemd device support because of two missing things: // Atm we can't use the systemd device support because of two missing things:
@ -487,17 +522,13 @@ func getUnitName(c *configs.Cgroup) string {
// because systemd will re-write the device settings if it needs to re-apply the cgroup context. // because systemd will re-write the device settings if it needs to re-apply the cgroup context.
// This happens at least for v208 when any sibling unit is started. // This happens at least for v208 when any sibling unit is started.
func joinDevices(c *configs.Cgroup, pid int) error { func joinDevices(c *configs.Cgroup, pid int) error {
path, err := join(c, "devices", pid) _, err := join(c, "devices", pid)
// Even if it's `not found` error, we'll return err because devices cgroup // Even if it's `not found` error, we'll return err because devices cgroup
// is hard requirement for container security. // is hard requirement for container security.
if err != nil { if err != nil {
return err return err
} }
devices, err := subsystems.Get("devices") return nil
if err != nil {
return err
}
return devices.Set(path, c)
} }
func setKernelMemory(c *configs.Cgroup) error { func setKernelMemory(c *configs.Cgroup) error {
@ -510,52 +541,16 @@ func setKernelMemory(c *configs.Cgroup) error {
return err return err
} }
if c.KernelMemory > 0 { // This doesn't get called by manager.Set, so we need to do it here.
err = writeFile(path, "memory.kmem.limit_in_bytes", strconv.FormatInt(c.KernelMemory, 10)) s := &fs.MemoryGroup{}
if err != nil { return s.SetKernelMemory(path, c)
return err
}
}
return nil
} }
func joinMemory(c *configs.Cgroup, pid int) error { func joinMemory(c *configs.Cgroup, pid int) error {
path, err := getSubsystemPath(c, "memory") _, err := join(c, "memory", pid)
if err != nil && !cgroups.IsNotFound(err) { if err != nil && !cgroups.IsNotFound(err) {
return err return err
} }
// -1 disables memoryswap
if c.MemorySwap > 0 {
err = writeFile(path, "memory.memsw.limit_in_bytes", strconv.FormatInt(c.MemorySwap, 10))
if err != nil {
return err
}
}
if c.MemoryReservation > 0 {
err = writeFile(path, "memory.soft_limit_in_bytes", strconv.FormatInt(c.MemoryReservation, 10))
if err != nil {
return err
}
}
if c.OomKillDisable {
if err := writeFile(path, "memory.oom_control", "1"); err != nil {
return err
}
}
if c.MemorySwappiness >= 0 && c.MemorySwappiness <= 100 {
err = writeFile(path, "memory.swappiness", strconv.FormatInt(c.MemorySwappiness, 10))
if err != nil {
return err
}
} else if c.MemorySwappiness == -1 {
return nil
} else {
return fmt.Errorf("invalid value:%d. valid memory swappiness range is 0-100", c.MemorySwappiness)
}
return nil return nil
} }
@ -577,68 +572,25 @@ func joinCpuset(c *configs.Cgroup, pid int) error {
// expects device path instead of major minor numbers, which is also confusing // expects device path instead of major minor numbers, which is also confusing
// for users. So we use fs work around for now. // for users. So we use fs work around for now.
func joinBlkio(c *configs.Cgroup, pid int) error { func joinBlkio(c *configs.Cgroup, pid int) error {
path, err := getSubsystemPath(c, "blkio") _, err := join(c, "blkio", pid)
if err != nil { if err != nil {
return err return err
} }
// systemd doesn't directly support this in the dbus properties
if c.BlkioLeafWeight != 0 {
if err := writeFile(path, "blkio.leaf_weight", strconv.FormatUint(uint64(c.BlkioLeafWeight), 10)); err != nil {
return err
}
}
for _, wd := range c.BlkioWeightDevice {
if err := writeFile(path, "blkio.weight_device", wd.WeightString()); err != nil {
return err
}
if err := writeFile(path, "blkio.leaf_weight_device", wd.LeafWeightString()); err != nil {
return err
}
}
for _, td := range c.BlkioThrottleReadBpsDevice {
if err := writeFile(path, "blkio.throttle.read_bps_device", td.String()); err != nil {
return err
}
}
for _, td := range c.BlkioThrottleWriteBpsDevice {
if err := writeFile(path, "blkio.throttle.write_bps_device", td.String()); err != nil {
return err
}
}
for _, td := range c.BlkioThrottleReadIOPSDevice {
if err := writeFile(path, "blkio.throttle.read_iops_device", td.String()); err != nil {
return err
}
}
for _, td := range c.BlkioThrottleWriteIOPSDevice {
if err := writeFile(path, "blkio.throttle.write_iops_device", td.String()); err != nil {
return err
}
}
return nil return nil
} }
func joinHugetlb(c *configs.Cgroup, pid int) error { func joinHugetlb(c *configs.Cgroup, pid int) error {
path, err := join(c, "hugetlb", pid) _, err := join(c, "hugetlb", pid)
if err != nil && !cgroups.IsNotFound(err) { if err != nil && !cgroups.IsNotFound(err) {
return err return err
} }
hugetlb, err := subsystems.Get("hugetlb") return nil
if err != nil {
return err
}
return hugetlb.Set(path, c)
} }
func joinPerfEvent(c *configs.Cgroup, pid int) error { func joinPerfEvent(c *configs.Cgroup, pid int) error {
path, err := join(c, "perf_event", pid) _, err := join(c, "perf_event", pid)
if err != nil && !cgroups.IsNotFound(err) { if err != nil && !cgroups.IsNotFound(err) {
return err return err
} }
perfEvent, err := subsystems.Get("perf_event") return nil
if err != nil {
return err
}
return perfEvent.Set(path, c)
} }


@ -13,7 +13,7 @@ import (
"time" "time"
"github.com/docker/docker/pkg/mount" "github.com/docker/docker/pkg/mount"
"github.com/docker/docker/pkg/units" "github.com/docker/go-units"
) )
const cgroupNamePrefix = "name=" const cgroupNamePrefix = "name="
@ -84,10 +84,19 @@ func FindCgroupMountpointDir() (string, error) {
// Safe as mountinfo encodes mountpoints with spaces as \040. // Safe as mountinfo encodes mountpoints with spaces as \040.
index := strings.Index(text, " - ") index := strings.Index(text, " - ")
postSeparatorFields := strings.Fields(text[index+3:]) postSeparatorFields := strings.Fields(text[index+3:])
if len(postSeparatorFields) < 3 { numPostFields := len(postSeparatorFields)
return "", fmt.Errorf("Error found less than 3 fields post '-' in %q", text)
// This is an error as we can't detect if the mount is for "cgroup"
if numPostFields == 0 {
return "", fmt.Errorf("Found no fields post '-' in %q", text)
} }
if postSeparatorFields[0] == "cgroup" { if postSeparatorFields[0] == "cgroup" {
// Check that the mount is properly formated.
if numPostFields < 3 {
return "", fmt.Errorf("Error found less than 3 fields post '-' in %q", text)
}
return filepath.Dir(fields[4]), nil return filepath.Dir(fields[4]), nil
} }
} }
@ -323,9 +332,14 @@ func GetHugePageSize() ([]string, error) {
return pageSizes, nil return pageSizes, nil
} }
// GetPids returns all pids, that were added to cgroup at path and to all its // GetPids returns all pids, that were added to cgroup at path.
// subcgroups.
func GetPids(path string) ([]int, error) { func GetPids(path string) ([]int, error) {
return readProcsFile(path)
}
// GetAllPids returns all pids, that were added to cgroup at path and to all its
// subcgroups.
func GetAllPids(path string) ([]int, error) {
var pids []int var pids []int
// collect pids from all sub-cgroups // collect pids from all sub-cgroups
err := filepath.Walk(path, func(p string, info os.FileInfo, iErr error) error { err := filepath.Walk(path, func(p string, info os.FileInfo, iErr error) error {


@ -16,6 +16,17 @@ type Cgroup struct {
// name of parent cgroup or slice // name of parent cgroup or slice
Parent string `json:"parent"` Parent string `json:"parent"`
// ScopePrefix describes the prefix for the scope name
ScopePrefix string `json:"scope_prefix"`
// Paths represent the cgroups paths to join
Paths map[string]string
// Resources contains various cgroups settings to apply
*Resources
}
type Resources struct {
// If this is true allow access to any kind of device within the container. If false, allow access only to devices explicitly listed in the allowed_devices list. // If this is true allow access to any kind of device within the container. If false, allow access only to devices explicitly listed in the allowed_devices list.
AllowAllDevices bool `json:"allow_all_devices"` AllowAllDevices bool `json:"allow_all_devices"`
@ -29,7 +40,7 @@ type Cgroup struct {
// Memory reservation or soft_limit (in bytes) // Memory reservation or soft_limit (in bytes)
MemoryReservation int64 `json:"memory_reservation"` MemoryReservation int64 `json:"memory_reservation"`
// Total memory usage (memory + swap); set `-1' to disable swap // Total memory usage (memory + swap); set `-1` to enable unlimited swap
MemorySwap int64 `json:"memory_swap"` MemorySwap int64 `json:"memory_swap"`
// Kernel memory limit (in bytes) // Kernel memory limit (in bytes)
@ -56,6 +67,9 @@ type Cgroup struct {
// MEM to use // MEM to use
CpusetMems string `json:"cpuset_mems"` CpusetMems string `json:"cpuset_mems"`
// Process limit; set <= `0' to disable limit.
PidsLimit int64 `json:"pids_limit"`
// Specifies per cgroup weight, range is from 10 to 1000. // Specifies per cgroup weight, range is from 10 to 1000.
BlkioWeight uint16 `json:"blkio_weight"` BlkioWeight uint16 `json:"blkio_weight"`
@ -83,9 +97,6 @@ type Cgroup struct {
// Hugetlb limit (in bytes) // Hugetlb limit (in bytes)
HugetlbLimit []*HugepageLimit `json:"hugetlb_limit"` HugetlbLimit []*HugepageLimit `json:"hugetlb_limit"`
// Parent slice to use for systemd TODO: remove in favor or parent
Slice string `json:"slice"`
// Whether to disable OOM Killer // Whether to disable OOM Killer
OomKillDisable bool `json:"oom_kill_disable"` OomKillDisable bool `json:"oom_kill_disable"`


@ -0,0 +1,6 @@
// +build !windows,!linux,!freebsd
package configs
type Cgroup struct {
}


@ -82,20 +82,6 @@ var (
Minor: 1, Minor: 1,
Permissions: "rwm", Permissions: "rwm",
}, },
{
Path: "/dev/tty0",
Type: 'c',
Major: 4,
Minor: 0,
Permissions: "rwm",
},
{
Path: "/dev/tty1",
Type: 'c',
Major: 4,
Minor: 1,
Permissions: "rwm",
},
// /dev/pts/ - pts namespaces are "coming soon" // /dev/pts/ - pts namespaces are "coming soon"
{ {
Path: "", Path: "",


@ -6,8 +6,8 @@ import (
"errors" "errors"
) )
// newConsole returns an initalized console that can be used within a container by copying bytes // NewConsole returns an initalized console that can be used within a container by copying bytes
// from the master side to the slave that is attached as the tty for the container's init process. // from the master side to the slave that is attached as the tty for the container's init process.
func newConsole(uid, gid int) (Console, error) { func NewConsole(uid, gid int) (Console, error) {
return nil, errors.New("libcontainer console is not supported on FreeBSD") return nil, errors.New("libcontainer console is not supported on FreeBSD")
} }


@ -10,9 +10,9 @@ import (
"github.com/opencontainers/runc/libcontainer/label" "github.com/opencontainers/runc/libcontainer/label"
) )
// newConsole returns an initalized console that can be used within a container by copying bytes // NewConsole returns an initalized console that can be used within a container by copying bytes
// from the master side to the slave that is attached as the tty for the container's init process. // from the master side to the slave that is attached as the tty for the container's init process.
func newConsole(uid, gid int) (Console, error) { func NewConsole(uid, gid int) (Console, error) {
master, err := os.OpenFile("/dev/ptmx", syscall.O_RDWR|syscall.O_NOCTTY|syscall.O_CLOEXEC, 0) master, err := os.OpenFile("/dev/ptmx", syscall.O_RDWR|syscall.O_NOCTTY|syscall.O_CLOEXEC, 0)
if err != nil { if err != nil {
return nil, err return nil, err


@ -1,7 +1,7 @@
package libcontainer package libcontainer
// newConsole returns an initalized console that can be used within a container // NewConsole returns an initalized console that can be used within a container
func newConsole(uid, gid int) (Console, error) { func NewConsole(uid, gid int) (Console, error) {
return &windowsConsole{}, nil return &windowsConsole{}, nil
} }


@ -14,8 +14,11 @@ import (
type Status int type Status int
const ( const (
// The container exists but has not been run yet
Created Status = iota
// The container exists and is running. // The container exists and is running.
Running Status = iota + 1 Running
// The container exists, it is in the process of being paused. // The container exists, it is in the process of being paused.
Pausing Pausing
@ -30,6 +33,25 @@ const (
Destroyed Destroyed
) )
func (s Status) String() string {
switch s {
case Created:
return "created"
case Running:
return "running"
case Pausing:
return "pausing"
case Paused:
return "paused"
case Checkpointed:
return "checkpointed"
case Destroyed:
return "destroyed"
default:
return "unknown"
}
}
// BaseState represents the platform agnostic pieces relating to a // BaseState represents the platform agnostic pieces relating to a
// running container's state // running container's state
type BaseState struct { type BaseState struct {


@ -3,8 +3,10 @@
package libcontainer package libcontainer
import ( import (
"bytes"
"encoding/json" "encoding/json"
"fmt" "fmt"
"io"
"io/ioutil" "io/ioutil"
"os" "os"
"os/exec" "os/exec"
@ -19,6 +21,8 @@ import (
"github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/criurpc" "github.com/opencontainers/runc/libcontainer/criurpc"
"github.com/opencontainers/runc/libcontainer/utils"
"github.com/vishvananda/netlink/nl"
) )
const stdioFdCount = 3 const stdioFdCount = 3
@ -34,6 +38,7 @@ type linuxContainer struct {
criuPath string criuPath string
m sync.Mutex m sync.Mutex
criuVersion int criuVersion int
state containerState
} }
// State represents a running container's state // State represents a running container's state
@ -100,6 +105,12 @@ type Container interface {
// errors: // errors:
// Systemerror - System error. // Systemerror - System error.
NotifyOOM() (<-chan struct{}, error) NotifyOOM() (<-chan struct{}, error)
// NotifyMemoryPressure returns a read-only channel signaling when the container reaches a given pressure level
//
// errors:
// Systemerror - System error.
NotifyMemoryPressure(level PressureLevel) (<-chan struct{}, error)
} }
// ID returns the container's unique ID // ID returns the container's unique ID
@ -125,7 +136,7 @@ func (c *linuxContainer) State() (*State, error) {
} }
func (c *linuxContainer) Processes() ([]int, error) { func (c *linuxContainer) Processes() ([]int, error) {
pids, err := c.cgroupManager.GetPids() pids, err := c.cgroupManager.GetAllPids()
if err != nil { if err != nil {
return nil, newSystemError(err) return nil, newSystemError(err)
} }
@ -179,22 +190,27 @@ func (c *linuxContainer) Start(process *Process) error {
} }
return newSystemError(err) return newSystemError(err)
} }
if doInit { c.state = &runningState{
c.updateState(parent) c: c,
} }
if c.config.Hooks != nil { if doInit {
s := configs.HookState{ if err := c.updateState(parent); err != nil {
Version: c.config.Version, return err
ID: c.id,
Pid: parent.pid(),
Root: c.config.Rootfs,
} }
for _, hook := range c.config.Hooks.Poststart { if c.config.Hooks != nil {
if err := hook.Run(s); err != nil { s := configs.HookState{
if err := parent.terminate(); err != nil { Version: c.config.Version,
logrus.Warn(err) ID: c.id,
Pid: parent.pid(),
Root: c.config.Rootfs,
}
for _, hook := range c.config.Hooks.Poststart {
if err := hook.Run(s); err != nil {
if err := parent.terminate(); err != nil {
logrus.Warn(err)
}
return newSystemError(err)
} }
return newSystemError(err)
} }
} }
} }
@ -218,7 +234,7 @@ func (c *linuxContainer) newParentProcess(p *Process, doInit bool) (parentProces
return nil, newSystemError(err) return nil, newSystemError(err)
} }
if !doInit { if !doInit {
return c.newSetnsProcess(p, cmd, parentPipe, childPipe), nil return c.newSetnsProcess(p, cmd, parentPipe, childPipe)
} }
return c.newInitProcess(p, cmd, parentPipe, childPipe) return c.newInitProcess(p, cmd, parentPipe, childPipe)
} }
@ -247,7 +263,7 @@ func (c *linuxContainer) commandTemplate(p *Process, childPipe *os.File) (*exec.
} }
func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe *os.File) (*initProcess, error) { func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe *os.File) (*initProcess, error) {
t := "_LIBCONTAINER_INITTYPE=standard" t := "_LIBCONTAINER_INITTYPE=" + string(initStandard)
cloneFlags := c.config.Namespaces.CloneFlags() cloneFlags := c.config.Namespaces.CloneFlags()
if cloneFlags&syscall.CLONE_NEWUSER != 0 { if cloneFlags&syscall.CLONE_NEWUSER != 0 {
if err := c.addUidGidMappings(cmd.SysProcAttr); err != nil { if err := c.addUidGidMappings(cmd.SysProcAttr); err != nil {
@ -273,23 +289,24 @@ func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, c
}, nil }, nil
} }
func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe *os.File) *setnsProcess { func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe *os.File) (*setnsProcess, error) {
cmd.Env = append(cmd.Env, cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initSetns))
fmt.Sprintf("_LIBCONTAINER_INITPID=%d", c.initProcess.pid()), // for setns process, we dont have to set cloneflags as the process namespaces
"_LIBCONTAINER_INITTYPE=setns", // will only be set via setns syscall
) data, err := c.bootstrapData(0, c.initProcess.pid(), p.consolePath)
if p.consolePath != "" { if err != nil {
cmd.Env = append(cmd.Env, "_LIBCONTAINER_CONSOLE_PATH="+p.consolePath) return nil, err
} }
// TODO: set on container for process management // TODO: set on container for process management
return &setnsProcess{ return &setnsProcess{
cmd: cmd, cmd: cmd,
cgroupPaths: c.cgroupManager.GetPaths(), cgroupPaths: c.cgroupManager.GetPaths(),
childPipe: childPipe, childPipe: childPipe,
parentPipe: parentPipe, parentPipe: parentPipe,
config: c.newInitConfig(p), config: c.newInitConfig(p),
process: p, process: p,
} bootstrapData: data,
}, nil
} }
func (c *linuxContainer) newInitConfig(process *Process) *initConfig { func (c *linuxContainer) newInitConfig(process *Process) *initConfig {
@ -316,54 +333,53 @@ func newPipe() (parent *os.File, child *os.File, err error) {
func (c *linuxContainer) Destroy() error { func (c *linuxContainer) Destroy() error {
c.m.Lock() c.m.Lock()
defer c.m.Unlock() defer c.m.Unlock()
status, err := c.currentStatus() return c.state.destroy()
if err != nil {
return err
}
if status != Destroyed {
return newGenericError(fmt.Errorf("container is not destroyed"), ContainerNotStopped)
}
if !c.config.Namespaces.Contains(configs.NEWPID) {
if err := killCgroupProcesses(c.cgroupManager); err != nil {
logrus.Warn(err)
}
}
err = c.cgroupManager.Destroy()
if rerr := os.RemoveAll(c.root); err == nil {
err = rerr
}
c.initProcess = nil
if c.config.Hooks != nil {
s := configs.HookState{
Version: c.config.Version,
ID: c.id,
Root: c.config.Rootfs,
}
for _, hook := range c.config.Hooks.Poststop {
if err := hook.Run(s); err != nil {
return err
}
}
}
return err
} }
func (c *linuxContainer) Pause() error { func (c *linuxContainer) Pause() error {
c.m.Lock() c.m.Lock()
defer c.m.Unlock() defer c.m.Unlock()
return c.cgroupManager.Freeze(configs.Frozen) status, err := c.currentStatus()
if err != nil {
return err
}
if status != Running {
return newGenericError(fmt.Errorf("container not running"), ContainerNotRunning)
}
if err := c.cgroupManager.Freeze(configs.Frozen); err != nil {
return err
}
return c.state.transition(&pausedState{
c: c,
})
} }
func (c *linuxContainer) Resume() error { func (c *linuxContainer) Resume() error {
c.m.Lock() c.m.Lock()
defer c.m.Unlock() defer c.m.Unlock()
return c.cgroupManager.Freeze(configs.Thawed) status, err := c.currentStatus()
if err != nil {
return err
}
if status != Paused {
return newGenericError(fmt.Errorf("container not paused"), ContainerNotPaused)
}
if err := c.cgroupManager.Freeze(configs.Thawed); err != nil {
return err
}
return c.state.transition(&runningState{
c: c,
})
} }
func (c *linuxContainer) NotifyOOM() (<-chan struct{}, error) { func (c *linuxContainer) NotifyOOM() (<-chan struct{}, error) {
return notifyOnOOM(c.cgroupManager.GetPaths()) return notifyOnOOM(c.cgroupManager.GetPaths())
} }
func (c *linuxContainer) NotifyMemoryPressure(level PressureLevel) (<-chan struct{}, error) {
return notifyMemoryPressure(c.cgroupManager.GetPaths(), level)
}
// XXX debug support, remove when debugging done. // XXX debug support, remove when debugging done.
func addArgsFromEnv(evar string, args *[]string) { func addArgsFromEnv(evar string, args *[]string) {
if e := os.Getenv(evar); e != "" { if e := os.Getenv(evar); e != "" {
@ -455,7 +471,7 @@ func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error {
} }
if criuOpts.ImagesDirectory == "" { if criuOpts.ImagesDirectory == "" {
criuOpts.ImagesDirectory = filepath.Join(c.root, "criu.image") return fmt.Errorf("invalid directory to save checkpoint")
} }
// Since a container can be C/R'ed multiple times, // Since a container can be C/R'ed multiple times,
@ -574,11 +590,9 @@ func (c *linuxContainer) addCriuRestoreMount(req *criurpc.CriuReq, m *configs.Mo
func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error { func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
c.m.Lock() c.m.Lock()
defer c.m.Unlock() defer c.m.Unlock()
if err := c.checkCriuVersion("1.5.2"); err != nil { if err := c.checkCriuVersion("1.5.2"); err != nil {
return err return err
} }
if criuOpts.WorkDirectory == "" { if criuOpts.WorkDirectory == "" {
criuOpts.WorkDirectory = filepath.Join(c.root, "criu.work") criuOpts.WorkDirectory = filepath.Join(c.root, "criu.work")
} }
@ -587,22 +601,19 @@ func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
if err := os.Mkdir(criuOpts.WorkDirectory, 0655); err != nil && !os.IsExist(err) { if err := os.Mkdir(criuOpts.WorkDirectory, 0655); err != nil && !os.IsExist(err) {
return err return err
} }
workDir, err := os.Open(criuOpts.WorkDirectory) workDir, err := os.Open(criuOpts.WorkDirectory)
if err != nil { if err != nil {
return err return err
} }
defer workDir.Close() defer workDir.Close()
if criuOpts.ImagesDirectory == "" { if criuOpts.ImagesDirectory == "" {
criuOpts.ImagesDirectory = filepath.Join(c.root, "criu.image") return fmt.Errorf("invalid directory to restore checkpoint")
} }
imageDir, err := os.Open(criuOpts.ImagesDirectory) imageDir, err := os.Open(criuOpts.ImagesDirectory)
if err != nil { if err != nil {
return err return err
} }
defer imageDir.Close() defer imageDir.Close()
// CRIU has a few requirements for a root directory: // CRIU has a few requirements for a root directory:
// * it must be a mount point // * it must be a mount point
// * its parent must not be overmounted // * its parent must not be overmounted
@ -613,18 +624,15 @@ func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
return err return err
} }
defer os.Remove(root) defer os.Remove(root)
root, err = filepath.EvalSymlinks(root) root, err = filepath.EvalSymlinks(root)
if err != nil { if err != nil {
return err return err
} }
err = syscall.Mount(c.config.Rootfs, root, "", syscall.MS_BIND|syscall.MS_REC, "") err = syscall.Mount(c.config.Rootfs, root, "", syscall.MS_BIND|syscall.MS_REC, "")
if err != nil { if err != nil {
return err return err
} }
defer syscall.Unmount(root, syscall.MNT_DETACH) defer syscall.Unmount(root, syscall.MNT_DETACH)
t := criurpc.CriuReqType_RESTORE t := criurpc.CriuReqType_RESTORE
req := &criurpc.CriuReq{ req := &criurpc.CriuReq{
Type: &t, Type: &t,
@ -692,15 +700,13 @@ func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
fds []string fds []string
fdJSON []byte fdJSON []byte
) )
if fdJSON, err = ioutil.ReadFile(filepath.Join(criuOpts.ImagesDirectory, descriptorsFilename)); err != nil { if fdJSON, err = ioutil.ReadFile(filepath.Join(criuOpts.ImagesDirectory, descriptorsFilename)); err != nil {
return err return err
} }
if err = json.Unmarshal(fdJSON, &fds); err != nil { if err := json.Unmarshal(fdJSON, &fds); err != nil {
return err return err
} }
for i := range fds { for i := range fds {
if s := fds[i]; strings.Contains(s, "pipe:") { if s := fds[i]; strings.Contains(s, "pipe:") {
inheritFd := new(criurpc.InheritFd) inheritFd := new(criurpc.InheritFd)
@ -709,12 +715,7 @@ func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
req.Opts.InheritFd = append(req.Opts.InheritFd, inheritFd) req.Opts.InheritFd = append(req.Opts.InheritFd, inheritFd)
} }
} }
return c.criuSwrk(process, req, criuOpts, true)
err = c.criuSwrk(process, req, criuOpts, true)
if err != nil {
return err
}
return nil
} }
func (c *linuxContainer) criuApplyCgroups(pid int, req *criurpc.CriuReq) error { func (c *linuxContainer) criuApplyCgroups(pid int, req *criurpc.CriuReq) error {
@ -909,46 +910,43 @@ func (c *linuxContainer) criuNotifications(resp *criurpc.CriuResp, process *Proc
if notify == nil { if notify == nil {
return fmt.Errorf("invalid response: %s", resp.String()) return fmt.Errorf("invalid response: %s", resp.String())
} }
switch { switch {
case notify.GetScript() == "post-dump": case notify.GetScript() == "post-dump":
if !opts.LeaveRunning { f, err := os.Create(filepath.Join(c.root, "checkpoint"))
f, err := os.Create(filepath.Join(c.root, "checkpoint")) if err != nil {
if err != nil { return err
return err
}
f.Close()
} }
break f.Close()
case notify.GetScript() == "network-unlock": case notify.GetScript() == "network-unlock":
if err := unlockNetwork(c.config); err != nil { if err := unlockNetwork(c.config); err != nil {
return err return err
} }
break
case notify.GetScript() == "network-lock": case notify.GetScript() == "network-lock":
if err := lockNetwork(c.config); err != nil { if err := lockNetwork(c.config); err != nil {
return err return err
} }
break
case notify.GetScript() == "post-restore": case notify.GetScript() == "post-restore":
pid := notify.GetPid() pid := notify.GetPid()
r, err := newRestoredProcess(int(pid), fds) r, err := newRestoredProcess(int(pid), fds)
if err != nil { if err != nil {
return err return err
} }
process.ops = r
// TODO: crosbymichael restore previous process information by saving the init process information in if err := c.state.transition(&restoredState{
// the container's state file or separate process state files. imageDir: opts.ImagesDirectory,
c: c,
}); err != nil {
return err
}
if err := c.updateState(r); err != nil { if err := c.updateState(r); err != nil {
return err return err
} }
process.ops = r if err := os.Remove(filepath.Join(c.root, "checkpoint")); err != nil {
break if !os.IsNotExist(err) {
logrus.Error(err)
}
}
} }
return nil return nil
} }
@ -958,66 +956,130 @@ func (c *linuxContainer) updateState(process parentProcess) error {
if err != nil { if err != nil {
return err return err
} }
return c.saveState(state)
}
func (c *linuxContainer) saveState(s *State) error {
f, err := os.Create(filepath.Join(c.root, stateFilename)) f, err := os.Create(filepath.Join(c.root, stateFilename))
if err != nil { if err != nil {
return err return err
} }
defer f.Close() defer f.Close()
os.Remove(filepath.Join(c.root, "checkpoint")) return utils.WriteJSON(f, s)
return json.NewEncoder(f).Encode(state) }
func (c *linuxContainer) deleteState() error {
return os.Remove(filepath.Join(c.root, stateFilename))
} }
func (c *linuxContainer) currentStatus() (Status, error) { func (c *linuxContainer) currentStatus() (Status, error) {
if _, err := os.Stat(filepath.Join(c.root, "checkpoint")); err == nil { if err := c.refreshState(); err != nil {
return Checkpointed, nil return -1, err
} }
return c.state.status(), nil
}
// refreshState needs to be called to verify that the current state of the
// container is what is actually true. Because consumers of libcontainer can use it
// out of process, we need to verify the container's status based on runtime
// information rather than relying on our in-process info.
func (c *linuxContainer) refreshState() error {
paused, err := c.isPaused()
if err != nil {
return err
}
if paused {
return c.state.transition(&pausedState{c: c})
}
running, err := c.isRunning()
if err != nil {
return err
}
if running {
return c.state.transition(&runningState{c: c})
}
return c.state.transition(&stoppedState{c: c})
}
func (c *linuxContainer) isRunning() (bool, error) {
if c.initProcess == nil { if c.initProcess == nil {
return Destroyed, nil return false, nil
} }
// return Running if the init process is alive // return Running if the init process is alive
if err := syscall.Kill(c.initProcess.pid(), 0); err != nil { if err := syscall.Kill(c.initProcess.pid(), 0); err != nil {
if err == syscall.ESRCH { if err == syscall.ESRCH {
return Destroyed, nil return false, nil
} }
return 0, newSystemError(err) return false, newSystemError(err)
} }
if c.config.Cgroups != nil && c.config.Cgroups.Freezer == configs.Frozen { return true, nil
return Paused, nil }
func (c *linuxContainer) isPaused() (bool, error) {
data, err := ioutil.ReadFile(filepath.Join(c.cgroupManager.GetPaths()["freezer"], "freezer.state"))
if err != nil {
if os.IsNotExist(err) {
return false, nil
}
return false, newSystemError(err)
} }
return Running, nil return bytes.Equal(bytes.TrimSpace(data), []byte("FROZEN")), nil
} }
func (c *linuxContainer) currentState() (*State, error) { func (c *linuxContainer) currentState() (*State, error) {
status, err := c.currentStatus() var (
if err != nil { startTime string
return nil, err externalDescriptors []string
} pid = -1
if status == Destroyed { )
return nil, newGenericError(fmt.Errorf("container destroyed"), ContainerNotExists) if c.initProcess != nil {
} pid = c.initProcess.pid()
startTime, err := c.initProcess.startTime() startTime, _ = c.initProcess.startTime()
if err != nil { externalDescriptors = c.initProcess.externalDescriptors()
return nil, newSystemError(err)
} }
state := &State{ state := &State{
BaseState: BaseState{ BaseState: BaseState{
ID: c.ID(), ID: c.ID(),
Config: *c.config, Config: *c.config,
InitProcessPid: c.initProcess.pid(), InitProcessPid: pid,
InitProcessStartTime: startTime, InitProcessStartTime: startTime,
}, },
CgroupPaths: c.cgroupManager.GetPaths(), CgroupPaths: c.cgroupManager.GetPaths(),
NamespacePaths: make(map[configs.NamespaceType]string), NamespacePaths: make(map[configs.NamespaceType]string),
ExternalDescriptors: c.initProcess.externalDescriptors(), ExternalDescriptors: externalDescriptors,
} }
for _, ns := range c.config.Namespaces { if pid > 0 {
state.NamespacePaths[ns.Type] = ns.GetPath(c.initProcess.pid()) for _, ns := range c.config.Namespaces {
} state.NamespacePaths[ns.Type] = ns.GetPath(pid)
for _, nsType := range configs.NamespaceTypes() { }
if _, ok := state.NamespacePaths[nsType]; !ok { for _, nsType := range configs.NamespaceTypes() {
ns := configs.Namespace{Type: nsType} if _, ok := state.NamespacePaths[nsType]; !ok {
state.NamespacePaths[ns.Type] = ns.GetPath(c.initProcess.pid()) ns := configs.Namespace{Type: nsType}
state.NamespacePaths[ns.Type] = ns.GetPath(pid)
}
} }
} }
return state, nil return state, nil
} }
// bootstrapData encodes the necessary data in netlink binary format as an io.Reader.
// Consumer can write the data to a bootstrap program such as one that uses
// nsenter package to bootstrap the container's init process correctly, i.e. with
// correct namespaces, uid/gid mapping etc.
func (c *linuxContainer) bootstrapData(cloneFlags uintptr, pid int, consolePath string) (io.Reader, error) {
// create the netlink message
r := nl.NewNetlinkRequest(int(InitMsg), 0)
// write pid
r.AddData(&Int32msg{
Type: PidAttr,
Value: uint32(pid),
})
// write console path
if consolePath != "" {
r.AddData(&Bytemsg{
Type: ConsolePathAttr,
Value: []byte(consolePath),
})
}
return bytes.NewReader(r.Serialize()), nil
}
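To sketch how this reader is consumed (this mirrors the setnsProcess.start code further down in the diff, it is not additional behaviour): the parent streams the serialized netlink message into the child's init pipe, and the C bootstrap in nsexec() parses the header plus the PID_ATTR and CONSOLE_PATH_ATTR attributes before joining the namespaces.
// Producer-side sketch; p.parentPipe and newSystemError are as used below.
data, err := c.bootstrapData(0, c.initProcess.pid(), p.consolePath)
if err != nil {
	return newSystemError(err)
}
if _, err := io.Copy(p.parentPipe, data); err != nil {
	return newSystemError(err)
}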


@ -16,12 +16,14 @@ const (
ContainerPaused ContainerPaused
ContainerNotStopped ContainerNotStopped
ContainerNotRunning ContainerNotRunning
ContainerNotPaused
// Process errors // Process errors
ProcessNotExecuted ProcessNotExecuted
// Common errors // Common errors
ConfigInvalid ConfigInvalid
ConsoleExists
SystemError SystemError
) )
@ -43,6 +45,10 @@ func (c ErrorCode) String() string {
return "Container is not stopped" return "Container is not stopped"
case ContainerNotRunning: case ContainerNotRunning:
return "Container is not running" return "Container is not running"
case ConsoleExists:
return "Console exists for process"
case ContainerNotPaused:
return "Container is not paused"
default: default:
return "Unknown error" return "Unknown error"
} }


@ -5,7 +5,6 @@ package libcontainer
import ( import (
"encoding/json" "encoding/json"
"fmt" "fmt"
"io/ioutil"
"os" "os"
"os/exec" "os/exec"
"path/filepath" "path/filepath"
@ -19,6 +18,7 @@ import (
"github.com/opencontainers/runc/libcontainer/cgroups/systemd" "github.com/opencontainers/runc/libcontainer/cgroups/systemd"
"github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/configs/validate" "github.com/opencontainers/runc/libcontainer/configs/validate"
"github.com/opencontainers/runc/libcontainer/utils"
) )
const ( const (
@ -166,7 +166,7 @@ func (l *LinuxFactory) Create(id string, config *configs.Config) (Container, err
if err := os.MkdirAll(containerRoot, 0700); err != nil { if err := os.MkdirAll(containerRoot, 0700); err != nil {
return nil, newGenericError(err, SystemError) return nil, newGenericError(err, SystemError)
} }
return &linuxContainer{ c := &linuxContainer{
id: id, id: id,
root: containerRoot, root: containerRoot,
config: config, config: config,
@ -174,7 +174,9 @@ func (l *LinuxFactory) Create(id string, config *configs.Config) (Container, err
initArgs: l.InitArgs, initArgs: l.InitArgs,
criuPath: l.CriuPath, criuPath: l.CriuPath,
cgroupManager: l.NewCgroupsManager(config.Cgroups, nil), cgroupManager: l.NewCgroupsManager(config.Cgroups, nil),
}, nil }
c.state = &stoppedState{c: c}
return c, nil
} }
func (l *LinuxFactory) Load(id string) (Container, error) { func (l *LinuxFactory) Load(id string) (Container, error) {
@ -191,7 +193,7 @@ func (l *LinuxFactory) Load(id string) (Container, error) {
processStartTime: state.InitProcessStartTime, processStartTime: state.InitProcessStartTime,
fds: state.ExternalDescriptors, fds: state.ExternalDescriptors,
} }
return &linuxContainer{ c := &linuxContainer{
initProcess: r, initProcess: r,
id: id, id: id,
config: &state.Config, config: &state.Config,
@ -200,7 +202,12 @@ func (l *LinuxFactory) Load(id string) (Container, error) {
criuPath: l.CriuPath, criuPath: l.CriuPath,
cgroupManager: l.NewCgroupsManager(state.Config.Cgroups, state.CgroupPaths), cgroupManager: l.NewCgroupsManager(state.Config.Cgroups, state.CgroupPaths),
root: containerRoot, root: containerRoot,
}, nil }
c.state = &createdState{c: c, s: Created}
if err := c.refreshState(); err != nil {
return nil, err
}
return c, nil
} }
func (l *LinuxFactory) Type() string { func (l *LinuxFactory) Type() string {
@ -222,21 +229,29 @@ func (l *LinuxFactory) StartInitialization() (err error) {
// clear the current process's environment to clean any libcontainer // clear the current process's environment to clean any libcontainer
// specific env vars. // specific env vars.
os.Clearenv() os.Clearenv()
var i initer
defer func() { defer func() {
// if we have an error during the initialization of the container's init then send it back to the // if we have an error during the initialization of the container's init then send it back to the
// parent process in the form of an initError. // parent process in the form of an initError.
if err != nil { if err != nil {
// ensure that any data sent from the parent is consumed so it doesn't if _, ok := i.(*linuxStandardInit); ok {
// receive ECONNRESET when the child writes to the pipe. // Synchronisation only necessary for standard init.
ioutil.ReadAll(pipe) if err := utils.WriteJSON(pipe, syncT{procError}); err != nil {
if err := json.NewEncoder(pipe).Encode(newSystemError(err)); err != nil { panic(err)
}
}
if err := utils.WriteJSON(pipe, newSystemError(err)); err != nil {
panic(err)
}
} else {
if err := utils.WriteJSON(pipe, syncT{procStart}); err != nil {
panic(err) panic(err)
} }
} }
// ensure that this pipe is always closed // ensure that this pipe is always closed
pipe.Close() pipe.Close()
}() }()
i, err := newContainerInit(it, pipe) i, err = newContainerInit(it, pipe)
if err != nil { if err != nil {
return err return err
} }


@ -9,6 +9,19 @@ import (
"github.com/opencontainers/runc/libcontainer/stacktrace" "github.com/opencontainers/runc/libcontainer/stacktrace"
) )
type syncType uint8
const (
procReady syncType = iota
procError
procStart
procRun
)
type syncT struct {
Type syncType `json:"type"`
}
var errorTemplate = template.Must(template.New("error").Parse(`Timestamp: {{.Timestamp}} var errorTemplate = template.Must(template.New("error").Parse(`Timestamp: {{.Timestamp}}
Code: {{.ECode}} Code: {{.ECode}}
{{if .Message }} {{if .Message }}


@ -1,16 +0,0 @@
#!/usr/bin/env bash
set -e
# This script runs all validations
validate() {
export MAKEDIR=/go/src/github.com/docker/docker/hack/make
sed -i 's!docker/docker!opencontainers/runc/libcontainer!' /go/src/github.com/docker/docker/hack/make/.validate
bash /go/src/github.com/docker/docker/hack/make/validate-dco
bash /go/src/github.com/docker/docker/hack/make/validate-gofmt
go get golang.org/x/tools/cmd/vet
bash /go/src/github.com/docker/docker/hack/make/validate-vet
}
# run validations
validate


@ -5,6 +5,7 @@ package libcontainer
import ( import (
"encoding/json" "encoding/json"
"fmt" "fmt"
"io"
"io/ioutil" "io/ioutil"
"net" "net"
"os" "os"
@ -73,6 +74,7 @@ func newContainerInit(t initType, pipe *os.File) (initer, error) {
}, nil }, nil
case initStandard: case initStandard:
return &linuxStandardInit{ return &linuxStandardInit{
pipe: pipe,
parentPid: syscall.Getppid(), parentPid: syscall.Getppid(),
config: config, config: config,
}, nil }, nil
@ -140,6 +142,27 @@ func finalizeNamespace(config *initConfig) error {
return nil return nil
} }
// syncParentReady sends to the given pipe a JSON payload which indicates that
// the init is ready to Exec the child process. It then waits for the parent to
// indicate that it is cleared to Exec.
func syncParentReady(pipe io.ReadWriter) error {
// Tell parent.
if err := utils.WriteJSON(pipe, syncT{procReady}); err != nil {
return err
}
// Wait for parent to give the all-clear.
var procSync syncT
if err := json.NewDecoder(pipe).Decode(&procSync); err != nil {
if err == io.EOF {
return fmt.Errorf("parent closed synchronisation channel")
}
if procSync.Type != procRun {
return fmt.Errorf("invalid synchronisation flag from parent")
}
}
return nil
}
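For orientation, the parent's half of this handshake (shown later in process_linux.go) looks roughly like the sketch below: it waits for procReady, applies the cgroup resource limits, then releases the child with procRun. This is a summary sketch, not additional code in the commit.
// Parent-side sketch; mirrors the loop added to initProcess.start below.
var sync syncT
if err := json.NewDecoder(p.parentPipe).Decode(&sync); err != nil {
	return newSystemError(err)
}
if sync.Type == procReady {
	if err := p.manager.Set(p.config.Config); err != nil {
		return newSystemError(err)
	}
	if err := utils.WriteJSON(p.parentPipe, syncT{procRun}); err != nil {
		return newSystemError(err)
	}
}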
// joinExistingNamespaces gets all the namespace paths specified for the container and // joinExistingNamespaces gets all the namespace paths specified for the container and
// does a setns on the namespace fd so that the current process joins the namespace. // does a setns on the namespace fd so that the current process joins the namespace.
func joinExistingNamespaces(namespaces []configs.Namespace) error { func joinExistingNamespaces(namespaces []configs.Namespace) error {
@ -309,7 +332,7 @@ func killCgroupProcesses(m cgroups.Manager) error {
if err := m.Freeze(configs.Frozen); err != nil { if err := m.Freeze(configs.Frozen); err != nil {
logrus.Warn(err) logrus.Warn(err)
} }
pids, err := m.GetPids() pids, err := m.GetAllPids()
if err != nil { if err != nil {
m.Freeze(configs.Thawed) m.Freeze(configs.Thawed)
return err return err


@ -0,0 +1,62 @@
// +build linux
package libcontainer
import (
"syscall"
"github.com/vishvananda/netlink/nl"
)
// list of known message types we want to send to bootstrap program
// The number is randomly chosen to not conflict with known netlink types
const (
InitMsg uint16 = 62000
PidAttr uint16 = 27281
ConsolePathAttr uint16 = 27282
// When syscall.NLA_HDRLEN is in gccgo, take this out.
syscall_NLA_HDRLEN = (syscall.SizeofNlAttr + syscall.NLA_ALIGNTO - 1) & ^(syscall.NLA_ALIGNTO - 1)
)
type Int32msg struct {
Type uint16
Value uint32
}
// int32msg has the following representation
// | nlattr len | nlattr type |
// | uint32 value |
func (msg *Int32msg) Serialize() []byte {
buf := make([]byte, msg.Len())
native := nl.NativeEndian()
native.PutUint16(buf[0:2], uint16(msg.Len()))
native.PutUint16(buf[2:4], msg.Type)
native.PutUint32(buf[4:8], msg.Value)
return buf
}
func (msg *Int32msg) Len() int {
return syscall_NLA_HDRLEN + 4
}
// bytemsg has the following representation
// | nlattr len | nlattr type |
// | value | pad |
type Bytemsg struct {
Type uint16
Value []byte
}
func (msg *Bytemsg) Serialize() []byte {
l := msg.Len()
buf := make([]byte, (l+syscall.NLA_ALIGNTO-1) & ^(syscall.NLA_ALIGNTO-1))
native := nl.NativeEndian()
native.PutUint16(buf[0:2], uint16(l))
native.PutUint16(buf[2:4], msg.Type)
copy(buf[4:], msg.Value)
return buf
}
func (msg *Bytemsg) Len() int {
return syscall_NLA_HDRLEN + len(msg.Value) + 1 // null-terminated
}
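As a hedged illustration of the layout described in the comments above (the pid value is made up), an Int32msg serializes into a single 8-byte netlink attribute in the host's native byte order:
// nla_len = 8 in buf[0:2], nla_type = PidAttr in buf[2:4], value in buf[4:8].
buf := (&Int32msg{Type: PidAttr, Value: 12345}).Serialize()
fmt.Printf("% x\n", buf) // e.g. "08 00 91 6a 39 30 00 00" on a little-endian host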


@ -93,7 +93,7 @@ func (l *loopback) create(n *network, nspid int) error {
} }
func (l *loopback) initialize(config *network) error { func (l *loopback) initialize(config *network) error {
return netlink.LinkSetUp(&netlink.Device{netlink.LinkAttrs{Name: "lo"}}) return netlink.LinkSetUp(&netlink.Device{LinkAttrs: netlink.LinkAttrs{Name: "lo"}})
} }
func (l *loopback) attach(n *configs.Network) (err error) { func (l *loopback) attach(n *configs.Network) (err error) {
@ -111,7 +111,7 @@ type veth struct {
} }
func (v *veth) detach(n *configs.Network) (err error) { func (v *veth) detach(n *configs.Network) (err error) {
return netlink.LinkSetMaster(&netlink.Device{netlink.LinkAttrs{Name: n.HostInterfaceName}}, nil) return netlink.LinkSetMaster(&netlink.Device{LinkAttrs: netlink.LinkAttrs{Name: n.HostInterfaceName}}, nil)
} }
// attach a container network interface to an external network // attach a container network interface to an external network


@ -12,31 +12,32 @@ import (
const oomCgroupName = "memory" const oomCgroupName = "memory"
// notifyOnOOM returns channel on which you can expect event about OOM, type PressureLevel uint
// if process died without OOM this channel will be closed.
// s is current *libcontainer.State for container. const (
func notifyOnOOM(paths map[string]string) (<-chan struct{}, error) { LowPressure PressureLevel = iota
dir := paths[oomCgroupName] MediumPressure
if dir == "" { CriticalPressure
return nil, fmt.Errorf("There is no path for %q in state", oomCgroupName) )
}
oomControl, err := os.Open(filepath.Join(dir, "memory.oom_control")) func registerMemoryEvent(cgDir string, evName string, arg string) (<-chan struct{}, error) {
evFile, err := os.Open(filepath.Join(cgDir, evName))
if err != nil { if err != nil {
return nil, err return nil, err
} }
fd, _, syserr := syscall.RawSyscall(syscall.SYS_EVENTFD2, 0, syscall.FD_CLOEXEC, 0) fd, _, syserr := syscall.RawSyscall(syscall.SYS_EVENTFD2, 0, syscall.FD_CLOEXEC, 0)
if syserr != 0 { if syserr != 0 {
oomControl.Close() evFile.Close()
return nil, syserr return nil, syserr
} }
eventfd := os.NewFile(fd, "eventfd") eventfd := os.NewFile(fd, "eventfd")
eventControlPath := filepath.Join(dir, "cgroup.event_control") eventControlPath := filepath.Join(cgDir, "cgroup.event_control")
data := fmt.Sprintf("%d %d", eventfd.Fd(), oomControl.Fd()) data := fmt.Sprintf("%d %d %s", eventfd.Fd(), evFile.Fd(), arg)
if err := ioutil.WriteFile(eventControlPath, []byte(data), 0700); err != nil { if err := ioutil.WriteFile(eventControlPath, []byte(data), 0700); err != nil {
eventfd.Close() eventfd.Close()
oomControl.Close() evFile.Close()
return nil, err return nil, err
} }
ch := make(chan struct{}) ch := make(chan struct{})
@ -44,7 +45,7 @@ func notifyOnOOM(paths map[string]string) (<-chan struct{}, error) {
defer func() { defer func() {
close(ch) close(ch)
eventfd.Close() eventfd.Close()
oomControl.Close() evFile.Close()
}() }()
buf := make([]byte, 8) buf := make([]byte, 8)
for { for {
@ -61,3 +62,28 @@ func notifyOnOOM(paths map[string]string) (<-chan struct{}, error) {
}() }()
return ch, nil return ch, nil
} }
// notifyOnOOM returns channel on which you can expect event about OOM,
// if process died without OOM this channel will be closed.
func notifyOnOOM(paths map[string]string) (<-chan struct{}, error) {
dir := paths[oomCgroupName]
if dir == "" {
return nil, fmt.Errorf("path %q missing", oomCgroupName)
}
return registerMemoryEvent(dir, "memory.oom_control", "")
}
func notifyMemoryPressure(paths map[string]string, level PressureLevel) (<-chan struct{}, error) {
dir := paths[oomCgroupName]
if dir == "" {
return nil, fmt.Errorf("path %q missing", oomCgroupName)
}
if level > CriticalPressure {
return nil, fmt.Errorf("invalid pressure level %d", level)
}
levelStr := []string{"low", "medium", "critical"}[level]
return registerMemoryEvent(dir, "memory.pressure_level", levelStr)
}
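A usage sketch for the new memory-pressure notification API (the container variable and the reaction are placeholders): callers pick a PressureLevel and receive one event per pressure notification on the returned channel.
// Illustrative only; container is a libcontainer.Container.
ch, err := container.NotifyMemoryPressure(libcontainer.CriticalPressure)
if err != nil {
	return err
}
go func() {
	for range ch {
		// react to a critical memory pressure event, e.g. log it or shed load
	}
}()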


@ -17,6 +17,11 @@
#include <sched.h> #include <sched.h>
#include <signal.h> #include <signal.h>
#include <linux/netlink.h>
#include <linux/types.h>
#include <stdint.h>
#include <sys/socket.h>
/* All arguments should be above stack, because it grows down */ /* All arguments should be above stack, because it grows down */
struct clone_arg { struct clone_arg {
/* /*
@ -63,24 +68,33 @@ static int clone_parent(jmp_buf * env)
return child; return child;
} }
static uint32_t readint32(char *buf)
{
return *(uint32_t *) buf;
}
// list of known message types we want to send to bootstrap program
// These are defined in libcontainer/message_linux.go
#define INIT_MSG 62000
#define PID_ATTR 27281
#define CONSOLE_PATH_ATTR 27282
void nsexec() void nsexec()
{ {
char *namespaces[] = { "ipc", "uts", "net", "pid", "mnt", "user" }; char *namespaces[] = { "ipc", "uts", "net", "pid", "mnt", "user" };
const int num = sizeof(namespaces) / sizeof(char *); const int num = sizeof(namespaces) / sizeof(char *);
jmp_buf env; jmp_buf env;
char buf[PATH_MAX], *val; char buf[PATH_MAX], *val;
int i, tfd, self_tfd, child, len, pipenum, consolefd = -1; int i, tfd, self_tfd, child, n, len, pipenum, consolefd = -1;
pid_t pid; pid_t pid = 0;
char *console;
val = getenv("_LIBCONTAINER_INITPID"); // if we dont have INITTYPE or this is the init process, skip the bootstrap process
if (val == NULL) val = getenv("_LIBCONTAINER_INITTYPE");
if (val == NULL || strcmp(val, "standard") == 0) {
return; return;
}
pid = atoi(val); if (strcmp(val, "setns") != 0) {
snprintf(buf, sizeof(buf), "%d", pid); pr_perror("Invalid inittype %s", val);
if (strcmp(val, buf)) {
pr_perror("Unable to parse _LIBCONTAINER_INITPID");
exit(1); exit(1);
} }
@ -89,7 +103,6 @@ void nsexec()
pr_perror("Child pipe not found"); pr_perror("Child pipe not found");
exit(1); exit(1);
} }
pipenum = atoi(val); pipenum = atoi(val);
snprintf(buf, sizeof(buf), "%d", pipenum); snprintf(buf, sizeof(buf), "%d", pipenum);
if (strcmp(val, buf)) { if (strcmp(val, buf)) {
@ -97,13 +110,56 @@ void nsexec()
exit(1); exit(1);
} }
console = getenv("_LIBCONTAINER_CONSOLE_PATH"); char nlbuf[NLMSG_HDRLEN];
if (console != NULL) { struct nlmsghdr *nh;
consolefd = open(console, O_RDWR); if ((n = read(pipenum, nlbuf, NLMSG_HDRLEN)) != NLMSG_HDRLEN) {
if (consolefd < 0) { pr_perror("Failed to read netlink header, got %d", n);
pr_perror("Failed to open console %s", console); exit(1);
exit(1); }
nh = (struct nlmsghdr *)nlbuf;
if (nh->nlmsg_type == NLMSG_ERROR) {
pr_perror("Invalid netlink header message");
exit(1);
}
if (nh->nlmsg_type != INIT_MSG) {
pr_perror("Unexpected netlink message type %d", nh->nlmsg_type);
exit(1);
}
// read the netlink payload
len = NLMSG_PAYLOAD(nh, 0);
char data[len];
if ((n = read(pipenum, data, len)) != len) {
pr_perror("Failed to read netlink payload, got %d", n);
exit(1);
}
int start = 0;
struct nlattr *attr;
while (start < len) {
int payload_len;
attr = (struct nlattr *)((void *)data + start);
start += NLA_HDRLEN;
payload_len = attr->nla_len - NLA_HDRLEN;
switch (attr->nla_type) {
case PID_ATTR:
pid = (pid_t) readint32(data + start);
break;
case CONSOLE_PATH_ATTR:
consolefd = open((char *)data + start, O_RDWR);
if (consolefd < 0) {
pr_perror("Failed to open console %s", (char *)data + start);
exit(1);
}
break;
} }
start += NLA_ALIGN(payload_len);
}
// required pid to be passed
if (pid == 0) {
pr_perror("missing pid");
exit(1);
} }
/* Check that the specified process exists */ /* Check that the specified process exists */
@ -133,15 +189,13 @@ void nsexec()
} }
/* Skip namespaces we're already part of */ /* Skip namespaces we're already part of */
if (fstatat(self_tfd, namespaces[i], &self_st, 0) != -1 && if (fstatat(self_tfd, namespaces[i], &self_st, 0) != -1 && st.st_ino == self_st.st_ino) {
st.st_ino == self_st.st_ino) {
continue; continue;
} }
fd = openat(tfd, namespaces[i], O_RDONLY); fd = openat(tfd, namespaces[i], O_RDONLY);
if (fd == -1) { if (fd == -1) {
pr_perror("Failed to open ns file %s for ns %s", buf, pr_perror("Failed to open ns file %s for ns %s", buf, namespaces[i]);
namespaces[i]);
exit(1); exit(1);
} }
// Set the namespace. // Set the namespace.

View File

@ -78,12 +78,28 @@ func (p Process) Signal(sig os.Signal) error {
return p.ops.signal(sig) return p.ops.signal(sig)
} }
// IO holds the process's STDIO
type IO struct {
Stdin io.WriteCloser
Stdout io.ReadCloser
Stderr io.ReadCloser
}
// NewConsole creates new console for process and returns it // NewConsole creates new console for process and returns it
func (p *Process) NewConsole(rootuid int) (Console, error) { func (p *Process) NewConsole(rootuid int) (Console, error) {
console, err := newConsole(rootuid, rootuid) console, err := NewConsole(rootuid, rootuid)
if err != nil { if err != nil {
return nil, err return nil, err
} }
p.consolePath = console.Path() p.consolePath = console.Path()
return console, nil return console, nil
} }
// ConsoleFromPath sets the process's console with the path provided
func (p *Process) ConsoleFromPath(path string) error {
if p.consolePath != "" {
return newGenericError(fmt.Errorf("console path already exists for process"), ConsoleExists)
}
p.consolePath = path
return nil
}


@ -5,6 +5,7 @@ package libcontainer
import ( import (
"encoding/json" "encoding/json"
"errors" "errors"
"fmt"
"io" "io"
"os" "os"
"os/exec" "os/exec"
@ -15,6 +16,7 @@ import (
"github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/system" "github.com/opencontainers/runc/libcontainer/system"
"github.com/opencontainers/runc/libcontainer/utils"
) )
type parentProcess interface { type parentProcess interface {
@ -41,13 +43,14 @@ type parentProcess interface {
} }
type setnsProcess struct { type setnsProcess struct {
cmd *exec.Cmd cmd *exec.Cmd
parentPipe *os.File parentPipe *os.File
childPipe *os.File childPipe *os.File
cgroupPaths map[string]string cgroupPaths map[string]string
config *initConfig config *initConfig
fds []string fds []string
process *Process process *Process
bootstrapData io.Reader
} }
func (p *setnsProcess) startTime() (string, error) { func (p *setnsProcess) startTime() (string, error) {
@ -64,6 +67,16 @@ func (p *setnsProcess) signal(sig os.Signal) error {
func (p *setnsProcess) start() (err error) { func (p *setnsProcess) start() (err error) {
defer p.parentPipe.Close() defer p.parentPipe.Close()
err = p.cmd.Start()
p.childPipe.Close()
if err != nil {
return newSystemError(err)
}
if p.bootstrapData != nil {
if _, err := io.Copy(p.parentPipe, p.bootstrapData); err != nil {
return newSystemError(err)
}
}
if err = p.execSetns(); err != nil { if err = p.execSetns(); err != nil {
return newSystemError(err) return newSystemError(err)
} }
@ -72,9 +85,10 @@ func (p *setnsProcess) start() (err error) {
return newSystemError(err) return newSystemError(err)
} }
} }
if err := json.NewEncoder(p.parentPipe).Encode(p.config); err != nil { if err := utils.WriteJSON(p.parentPipe, p.config); err != nil {
return newSystemError(err) return newSystemError(err)
} }
if err := syscall.Shutdown(int(p.parentPipe.Fd()), syscall.SHUT_WR); err != nil { if err := syscall.Shutdown(int(p.parentPipe.Fd()), syscall.SHUT_WR); err != nil {
return newSystemError(err) return newSystemError(err)
} }
@ -84,6 +98,7 @@ func (p *setnsProcess) start() (err error) {
if err := json.NewDecoder(p.parentPipe).Decode(&ierr); err != nil && err != io.EOF { if err := json.NewDecoder(p.parentPipe).Decode(&ierr); err != nil && err != io.EOF {
return newSystemError(err) return newSystemError(err)
} }
// Must be done after Shutdown so the child will exit and we can wait for it.
if ierr != nil { if ierr != nil {
p.wait() p.wait()
return newSystemError(ierr) return newSystemError(ierr)
@ -96,11 +111,6 @@ func (p *setnsProcess) start() (err error) {
// before the go runtime boots, we wait on the process to die and receive the child's pid // before the go runtime boots, we wait on the process to die and receive the child's pid
// over the provided pipe. // over the provided pipe.
func (p *setnsProcess) execSetns() error { func (p *setnsProcess) execSetns() error {
err := p.cmd.Start()
p.childPipe.Close()
if err != nil {
return newSystemError(err)
}
status, err := p.cmd.Process.Wait() status, err := p.cmd.Process.Wait()
if err != nil { if err != nil {
p.cmd.Wait() p.cmd.Wait()
@ -192,7 +202,6 @@ func (p *initProcess) start() (err error) {
return newSystemError(err) return newSystemError(err)
} }
p.setExternalDescriptors(fds) p.setExternalDescriptors(fds)
// Do this before syncing with child so that no children // Do this before syncing with child so that no children
// can escape the cgroup // can escape the cgroup
if err := p.manager.Apply(p.pid()); err != nil { if err := p.manager.Apply(p.pid()); err != nil {
@ -223,13 +232,56 @@ func (p *initProcess) start() (err error) {
if err := p.sendConfig(); err != nil { if err := p.sendConfig(); err != nil {
return newSystemError(err) return newSystemError(err)
} }
// wait for the child process to fully complete and receive an error message var (
// if one was encoutered procSync syncT
var ierr *genericError sentRun bool
if err := json.NewDecoder(p.parentPipe).Decode(&ierr); err != nil && err != io.EOF { ierr *genericError
)
loop:
for {
if err := json.NewDecoder(p.parentPipe).Decode(&procSync); err != nil {
if err == io.EOF {
break loop
}
return newSystemError(err)
}
switch procSync.Type {
case procStart:
break loop
case procReady:
if err := p.manager.Set(p.config.Config); err != nil {
return newSystemError(err)
}
// Sync with child.
if err := utils.WriteJSON(p.parentPipe, syncT{procRun}); err != nil {
return newSystemError(err)
}
sentRun = true
case procError:
// wait for the child process to fully complete and receive an error message
// if one was encountered
if err := json.NewDecoder(p.parentPipe).Decode(&ierr); err != nil && err != io.EOF {
return newSystemError(err)
}
if ierr != nil {
break loop
}
// Programmer error.
panic("No error following JSON procError payload.")
default:
return newSystemError(fmt.Errorf("invalid JSON synchronisation payload from child"))
}
}
if !sentRun {
return newSystemError(fmt.Errorf("could not synchronise with container process"))
}
if err := syscall.Shutdown(int(p.parentPipe.Fd()), syscall.SHUT_WR); err != nil {
return newSystemError(err) return newSystemError(err)
} }
// Must be done after Shutdown so the child will exit and we can wait for it.
if ierr != nil { if ierr != nil {
p.wait()
return newSystemError(ierr) return newSystemError(ierr)
} }
return nil return nil
@ -264,11 +316,7 @@ func (p *initProcess) startTime() (string, error) {
func (p *initProcess) sendConfig() error { func (p *initProcess) sendConfig() error {
// send the state to the container's init process then shutdown writes for the parent // send the state to the container's init process then shutdown writes for the parent
if err := json.NewEncoder(p.parentPipe).Encode(p.config); err != nil { return utils.WriteJSON(p.parentPipe, p.config)
return err
}
// shutdown writes for the parent side of the pipe
return syscall.Shutdown(int(p.parentPipe.Fd()), syscall.SHUT_WR)
} }
func (p *initProcess) createNetworkInterfaces() error { func (p *initProcess) createNetworkInterfaces() error {
@ -314,3 +362,44 @@ func getPipeFds(pid int) ([]string, error) {
} }
return fds, nil return fds, nil
} }
// InitializeIO creates pipes for use with the process's STDIO
// and returns the opposite side for each
func (p *Process) InitializeIO(rootuid int) (i *IO, err error) {
var fds []uintptr
i = &IO{}
// cleanup in case of an error
defer func() {
if err != nil {
for _, fd := range fds {
syscall.Close(int(fd))
}
}
}()
// STDIN
r, w, err := os.Pipe()
if err != nil {
return nil, err
}
fds = append(fds, r.Fd(), w.Fd())
p.Stdin, i.Stdin = r, w
// STDOUT
if r, w, err = os.Pipe(); err != nil {
return nil, err
}
fds = append(fds, r.Fd(), w.Fd())
p.Stdout, i.Stdout = w, r
// STDERR
if r, w, err = os.Pipe(); err != nil {
return nil, err
}
fds = append(fds, r.Fd(), w.Fd())
p.Stderr, i.Stderr = w, r
// change ownership of the pipes in case we are in a user namespace
for _, fd := range fds {
if err := syscall.Fchown(int(fd), rootuid, rootuid); err != nil {
return nil, err
}
}
return i, nil
}
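A sketch of how a caller might use the new InitializeIO helper: the process keeps one end of each pipe, while the parent keeps the opposite ends returned in the IO struct and streams them. Running with rootuid 0 and /bin/true as the command are assumptions.

package main

import (
	"io"
	"log"
	"os"

	"github.com/opencontainers/runc/libcontainer"
)

func main() {
	p := &libcontainer.Process{Args: []string{"/bin/true"}}
	pio, err := p.InitializeIO(0) // pipes are chowned to rootuid inside
	if err != nil {
		log.Fatal(err)
	}
	// Relay the process's output on the parent side of the pipes.
	go io.Copy(os.Stdout, pio.Stdout)
	go io.Copy(os.Stderr, pio.Stderr)
	pio.Stdin.Close() // nothing to feed on stdin in this sketch
}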

View File

@ -18,6 +18,7 @@ import (
"github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/label" "github.com/opencontainers/runc/libcontainer/label"
"github.com/opencontainers/runc/libcontainer/system"
) )
const defaultMountFlags = syscall.MS_NOEXEC | syscall.MS_NOSUID | syscall.MS_NODEV const defaultMountFlags = syscall.MS_NOEXEC | syscall.MS_NOSUID | syscall.MS_NODEV
@ -299,6 +300,24 @@ func checkMountDestination(rootfs, dest string) error {
invalidDestinations := []string{ invalidDestinations := []string{
"/proc", "/proc",
} }
// Whitelist: these should be subdirectories of the invalid destinations above.
validDestinations := []string{
// These entries can be bind mounted with files emulated by fuse,
// so commands like top and free display stats inside the container.
"/proc/cpuinfo",
"/proc/diskstats",
"/proc/meminfo",
"/proc/stats",
}
for _, valid := range validDestinations {
path, err := filepath.Rel(filepath.Join(rootfs, valid), dest)
if err != nil {
return err
}
if path == "." {
return nil
}
}
for _, invalid := range invalidDestinations { for _, invalid := range invalidDestinations {
path, err := filepath.Rel(filepath.Join(rootfs, invalid), dest) path, err := filepath.Rel(filepath.Join(rootfs, invalid), dest)
if err != nil { if err != nil {
@ -365,11 +384,12 @@ func reOpenDevNull() error {
// Create the device nodes in the container. // Create the device nodes in the container.
func createDevices(config *configs.Config) error { func createDevices(config *configs.Config) error {
useBindMount := system.RunningInUserNS() || config.Namespaces.Contains(configs.NEWUSER)
oldMask := syscall.Umask(0000) oldMask := syscall.Umask(0000)
for _, node := range config.Devices { for _, node := range config.Devices {
// containers running in a user namespace are not allowed to mknod // containers running in a user namespace are not allowed to mknod
// devices so we can just bind mount it from the host. // devices so we can just bind mount it from the host.
if err := createDeviceNode(config.Rootfs, node, config.Namespaces.Contains(configs.NEWUSER)); err != nil { if err := createDeviceNode(config.Rootfs, node, useBindMount); err != nil {
syscall.Umask(oldMask) syscall.Umask(oldMask)
return err return err
} }
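The whitelist and blacklist checks both lean on the same filepath.Rel trick: a destination matches a listed path exactly when the relative path between them is ".". A standalone sketch of that check, with illustrative paths:

package main

import (
	"fmt"
	"path/filepath"
)

// isExactMatch reports whether dest is exactly rootfs+listed, using the same
// filepath.Rel comparison as checkMountDestination above.
func isExactMatch(rootfs, listed, dest string) bool {
	rel, err := filepath.Rel(filepath.Join(rootfs, listed), dest)
	return err == nil && rel == "."
}

func main() {
	fmt.Println(isExactMatch("/rootfs", "/proc/meminfo", "/rootfs/proc/meminfo")) // true: whitelisted
	fmt.Println(isExactMatch("/rootfs", "/proc/meminfo", "/rootfs/proc"))         // false
}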

View File

@ -0,0 +1,47 @@
Name: cat
State: R (running)
Tgid: 19383
Ngid: 0
Pid: 19383
PPid: 19275
TracerPid: 0
Uid: 1000 1000 1000 1000
Gid: 1000 1000 1000 1000
FDSize: 256
Groups: 24 25 27 29 30 44 46 102 104 108 111 1000 1001
NStgid: 19383
NSpid: 19383
NSpgid: 19383
NSsid: 19275
VmPeak: 5944 kB
VmSize: 5944 kB
VmLck: 0 kB
VmPin: 0 kB
VmHWM: 744 kB
VmRSS: 744 kB
VmData: 324 kB
VmStk: 136 kB
VmExe: 48 kB
VmLib: 1776 kB
VmPTE: 32 kB
VmPMD: 12 kB
VmSwap: 0 kB
Threads: 1
SigQ: 0/30067
SigPnd: 0000000000000000
ShdPnd: 0000000000000000
SigBlk: 0000000000000000
SigIgn: 0000000000000080
SigCgt: 0000000000000000
CapInh: 0000000000000000
CapPrm: 0000000000000000
CapEff: 0000000000000000
CapBnd: 0000003fffffffff
CapAmb: 0000000000000000
Seccomp: 0
Cpus_allowed: f
Cpus_allowed_list: 0-3
Mems_allowed: 00000000,00000001
Mems_allowed_list: 0
voluntary_ctxt_switches: 0
nonvoluntary_ctxt_switches: 1

View File

@ -3,8 +3,11 @@
package seccomp package seccomp
import ( import (
"bufio"
"fmt" "fmt"
"log" "log"
"os"
"strings"
"syscall" "syscall"
"github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runc/libcontainer/configs"
@ -17,6 +20,9 @@ var (
actKill = libseccomp.ActKill actKill = libseccomp.ActKill
actTrace = libseccomp.ActTrace.SetReturnCode(int16(syscall.EPERM)) actTrace = libseccomp.ActTrace.SetReturnCode(int16(syscall.EPERM))
actErrno = libseccomp.ActErrno.SetReturnCode(int16(syscall.EPERM)) actErrno = libseccomp.ActErrno.SetReturnCode(int16(syscall.EPERM))
// SeccompModeFilter refers to the syscall argument SECCOMP_MODE_FILTER.
SeccompModeFilter = uintptr(2)
) )
// Filters given syscalls in a container, preventing them from being used // Filters given syscalls in a container, preventing them from being used
@ -73,6 +79,24 @@ func InitSeccomp(config *configs.Seccomp) error {
return nil return nil
} }
// IsEnabled returns whether the kernel has been configured to support seccomp.
func IsEnabled() bool {
// Try to read from /proc/self/status for kernels > 3.8
s, err := parseStatusFile("/proc/self/status")
if err != nil {
// Check if Seccomp is supported, via CONFIG_SECCOMP.
if _, _, err := syscall.RawSyscall(syscall.SYS_PRCTL, syscall.PR_GET_SECCOMP, 0, 0); err != syscall.EINVAL {
// Make sure the kernel has CONFIG_SECCOMP_FILTER.
if _, _, err := syscall.RawSyscall(syscall.SYS_PRCTL, syscall.PR_SET_SECCOMP, SeccompModeFilter, 0); err != syscall.EINVAL {
return true
}
}
return false
}
_, ok := s["Seccomp"]
return ok
}
// Convert Libcontainer Action to Libseccomp ScmpAction // Convert Libcontainer Action to Libseccomp ScmpAction
func getAction(act configs.Action) (libseccomp.ScmpAction, error) { func getAction(act configs.Action) (libseccomp.ScmpAction, error) {
switch act { switch act {
@ -178,3 +202,30 @@ func matchCall(filter *libseccomp.ScmpFilter, call *configs.Syscall) error {
return nil return nil
} }
func parseStatusFile(path string) (map[string]string, error) {
f, err := os.Open(path)
if err != nil {
return nil, err
}
defer f.Close()
s := bufio.NewScanner(f)
status := make(map[string]string)
for s.Scan() {
if err := s.Err(); err != nil {
return nil, err
}
text := s.Text()
parts := strings.Split(text, ":")
if len(parts) <= 1 {
continue
}
status[parts[0]] = parts[1]
}
return status, nil
}
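A minimal usage sketch for the new IsEnabled helper, assuming the package is built with the seccomp build tag and libseccomp available: callers can probe kernel support before constructing a filter.

package main

import (
	"fmt"

	"github.com/opencontainers/runc/libcontainer/seccomp"
)

func main() {
	if seccomp.IsEnabled() {
		fmt.Println("kernel supports seccomp filtering")
	} else {
		fmt.Println("seccomp unavailable; skipping syscall filtering")
	}
}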

View File

@ -17,3 +17,8 @@ func InitSeccomp(config *configs.Seccomp) error {
} }
return nil return nil
} }
// IsEnabled returns false, because it is not supported.
func IsEnabled() bool {
return false
}

View File

@ -231,10 +231,14 @@ func ReserveLabel(scon string) {
} }
} }
func selinuxEnforcePath() string {
return fmt.Sprintf("%s/enforce", selinuxPath)
}
func SelinuxGetEnforce() int { func SelinuxGetEnforce() int {
var enforce int var enforce int
enforceS, err := readCon(fmt.Sprintf("%s/enforce", selinuxPath)) enforceS, err := readCon(selinuxEnforcePath())
if err != nil { if err != nil {
return -1 return -1
} }
@ -246,6 +250,10 @@ func SelinuxGetEnforce() int {
return enforce return enforce
} }
func SelinuxSetEnforce(mode int) error {
return writeCon(selinuxEnforcePath(), fmt.Sprintf("%d", mode))
}
func SelinuxGetEnforceMode() int { func SelinuxGetEnforceMode() int {
switch readConfig(selinuxTag) { switch readConfig(selinuxTag) {
case "enforcing": case "enforcing":
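A hedged sketch of the new enforce helpers in use: read the current mode, then switch to permissive (0). Whether the caller may write the enforce file is an assumption.

package main

import (
	"fmt"

	"github.com/opencontainers/runc/libcontainer/selinux"
)

func main() {
	fmt.Println("current enforce mode:", selinux.SelinuxGetEnforce())
	if err := selinux.SelinuxSetEnforce(0); err != nil { // 0 = permissive
		fmt.Println("could not change enforcement:", err)
	}
}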

View File

@ -3,6 +3,7 @@
package libcontainer package libcontainer
import ( import (
"io"
"os" "os"
"syscall" "syscall"
@ -14,6 +15,7 @@ import (
) )
type linuxStandardInit struct { type linuxStandardInit struct {
pipe io.ReadWriter
parentPid int parentPid int
config *initConfig config *initConfig
} }
@ -50,7 +52,6 @@ func (l *linuxStandardInit) Init() error {
if err := setOomScoreAdj(l.config.Config.OomScoreAdj); err != nil { if err := setOomScoreAdj(l.config.Config.OomScoreAdj); err != nil {
return err return err
} }
label.Init() label.Init()
// InitializeMountNamespace() can be executed only for a new mount namespace // InitializeMountNamespace() can be executed only for a new mount namespace
if l.config.Config.Namespaces.Contains(configs.NEWNS) { if l.config.Config.Namespaces.Contains(configs.NEWNS) {
@ -75,7 +76,6 @@ func (l *linuxStandardInit) Init() error {
return err return err
} }
} }
for _, path := range l.config.Config.ReadonlyPaths { for _, path := range l.config.Config.ReadonlyPaths {
if err := remountReadonly(path); err != nil { if err := remountReadonly(path); err != nil {
return err return err
@ -90,6 +90,12 @@ func (l *linuxStandardInit) Init() error {
if err != nil { if err != nil {
return err return err
} }
// Tell our parent that we're ready to Execv. This must be done before the
// Seccomp rules have been applied, because we need to be able to read and
// write to a socket.
if err := syncParentReady(l.pipe); err != nil {
return err
}
if l.config.Config.Seccomp != nil { if l.config.Config.Seccomp != nil {
if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil { if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
return err return err
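syncParentReady itself is not shown in this diff; as a hedged sketch, the child side of the handshake presumably announces procReady over the pipe and blocks until the parent answers procRun, mirroring the parent loop earlier in this commit. The names and payload shape below are assumptions.

package syncsketch

import (
	"encoding/json"
	"fmt"
	"io"
)

// Assumed payload shape; the real syncT, procReady and procRun live in libcontainer.
type syncType int

const (
	procReady syncType = iota
	procRun
)

type syncT struct {
	Type syncType `json:"type"`
}

// parentReady mirrors what syncParentReady presumably does: announce readiness
// and wait for the parent's go-ahead before applying seccomp and exec'ing.
func parentReady(pipe io.ReadWriter) error {
	if err := json.NewEncoder(pipe).Encode(syncT{procReady}); err != nil {
		return err
	}
	var resp syncT
	if err := json.NewDecoder(pipe).Decode(&resp); err != nil {
		return err
	}
	if resp.Type != procRun {
		return fmt.Errorf("unexpected synchronisation flag from parent: %v", resp.Type)
	}
	return nil
}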

View File

@ -0,0 +1,223 @@
// +build linux
package libcontainer
import (
"fmt"
"os"
"path/filepath"
"github.com/Sirupsen/logrus"
"github.com/opencontainers/runc/libcontainer/configs"
)
func newStateTransitionError(from, to containerState) error {
return &stateTransitionError{
From: from.status().String(),
To: to.status().String(),
}
}
// stateTransitionError is returned when an invalid state transition happens from one
// state to another.
type stateTransitionError struct {
From string
To string
}
func (s *stateTransitionError) Error() string {
return fmt.Sprintf("invalid state transition from %s to %s", s.From, s.To)
}
type containerState interface {
transition(containerState) error
destroy() error
status() Status
}
func destroy(c *linuxContainer) error {
if !c.config.Namespaces.Contains(configs.NEWPID) {
if err := killCgroupProcesses(c.cgroupManager); err != nil {
logrus.Warn(err)
}
}
err := c.cgroupManager.Destroy()
if rerr := os.RemoveAll(c.root); err == nil {
err = rerr
}
c.initProcess = nil
if herr := runPoststopHooks(c); err == nil {
err = herr
}
c.state = &stoppedState{c: c}
return err
}
func runPoststopHooks(c *linuxContainer) error {
if c.config.Hooks != nil {
s := configs.HookState{
Version: c.config.Version,
ID: c.id,
Root: c.config.Rootfs,
}
for _, hook := range c.config.Hooks.Poststop {
if err := hook.Run(s); err != nil {
return err
}
}
}
return nil
}
// stoppedState represents a container in a stopped/destroyed state.
type stoppedState struct {
c *linuxContainer
}
func (b *stoppedState) status() Status {
return Destroyed
}
func (b *stoppedState) transition(s containerState) error {
switch s.(type) {
case *runningState:
b.c.state = s
return nil
case *restoredState:
b.c.state = s
return nil
case *stoppedState:
return nil
}
return newStateTransitionError(b, s)
}
func (b *stoppedState) destroy() error {
return destroy(b.c)
}
// runningState represents a container that is currently running.
type runningState struct {
c *linuxContainer
}
func (r *runningState) status() Status {
return Running
}
func (r *runningState) transition(s containerState) error {
switch s.(type) {
case *stoppedState:
running, err := r.c.isRunning()
if err != nil {
return err
}
if running {
return newGenericError(fmt.Errorf("container still running"), ContainerNotStopped)
}
r.c.state = s
return nil
case *pausedState:
r.c.state = s
return nil
case *runningState:
return nil
}
return newStateTransitionError(r, s)
}
func (r *runningState) destroy() error {
running, err := r.c.isRunning()
if err != nil {
return err
}
if running {
return newGenericError(fmt.Errorf("container is not destroyed"), ContainerNotStopped)
}
return destroy(r.c)
}
// pausedState represents a container that is currently paused. It cannot be destroyed in a
// paused state and must transition back to running first.
type pausedState struct {
c *linuxContainer
}
func (p *pausedState) status() Status {
return Paused
}
func (p *pausedState) transition(s containerState) error {
switch s.(type) {
case *runningState, *stoppedState:
p.c.state = s
return nil
case *pausedState:
return nil
}
return newStateTransitionError(p, s)
}
func (p *pausedState) destroy() error {
isRunning, err := p.c.isRunning()
if err != nil {
return err
}
if !isRunning {
if err := p.c.cgroupManager.Freeze(configs.Thawed); err != nil {
return err
}
return destroy(p.c)
}
return newGenericError(fmt.Errorf("container is paused"), ContainerPaused)
}
// restoredState is the same as the running state but also has associated checkpoint
// information that may need to be destroyed when the container is stopped and destroy is called.
type restoredState struct {
imageDir string
c *linuxContainer
}
func (r *restoredState) status() Status {
return Running
}
func (r *restoredState) transition(s containerState) error {
switch s.(type) {
case *stoppedState:
return nil
case *runningState:
return nil
}
return newStateTransitionError(r, s)
}
func (r *restoredState) destroy() error {
if _, err := os.Stat(filepath.Join(r.c.root, "checkpoint")); err != nil {
if !os.IsNotExist(err) {
return err
}
}
return destroy(r.c)
}
// createdState is used whenever a container is restored, loaded, or setting up additional
// processes inside it; it should not be destroyed when it exits.
type createdState struct {
c *linuxContainer
s Status
}
func (n *createdState) status() Status {
return n.s
}
func (n *createdState) transition(s containerState) error {
n.c.state = s
return nil
}
func (n *createdState) destroy() error {
return nil
}
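For reference, the transitions the switch statements above allow, summarised in one place (derived by reading the file; not part of the package):

package statesketch

// Keys are the current state, values are the states it may transition to.
var validTransitions = map[string][]string{
	"stopped":  {"running", "restored", "stopped"},
	"running":  {"stopped (only once the process has exited)", "paused", "running"},
	"paused":   {"running", "stopped", "paused"},
	"restored": {"stopped", "running"},
	"created":  {"any state"},
}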

View File

@ -3,6 +3,9 @@
package system package system
import ( import (
"bufio"
"fmt"
"os"
"os/exec" "os/exec"
"syscall" "syscall"
"unsafe" "unsafe"
@ -75,3 +78,37 @@ func Setctty() error {
} }
return nil return nil
} }
/*
* Detect whether we are currently running in a user namespace.
* Copied from github.com/lxc/lxd/shared/util.go
*/
func RunningInUserNS() bool {
file, err := os.Open("/proc/self/uid_map")
if err != nil {
/*
* This kernel-provided file only exists if user namespaces are
* supported
*/
return false
}
defer file.Close()
buf := bufio.NewReader(file)
l, _, err := buf.ReadLine()
if err != nil {
return false
}
line := string(l)
var a, b, c int64
fmt.Sscanf(line, "%d %d %d", &a, &b, &c)
/*
* We assume we are in the initial user namespace if we have a full
* range - 4294967295 uids starting at uid 0.
*/
if a == 0 && b == 0 && c == 4294967295 {
return false
}
return true
}
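A trivial usage sketch: RunningInUserNS can be called from any process to decide, as createDevices now does, whether mknod must be replaced by bind mounts.

package main

import (
	"fmt"

	"github.com/opencontainers/runc/libcontainer/system"
)

func main() {
	fmt.Println("running inside a user namespace:", system.RunningInUserNS())
}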

View File

@ -3,6 +3,7 @@ package utils
import ( import (
"crypto/rand" "crypto/rand"
"encoding/hex" "encoding/hex"
"encoding/json"
"io" "io"
"path/filepath" "path/filepath"
"syscall" "syscall"
@ -36,10 +37,20 @@ func ResolveRootfs(uncleanRootfs string) (string, error) {
} }
// ExitStatus returns the correct exit status for a process based on if it // ExitStatus returns the correct exit status for a process based on if it
// was signaled or exited cleanly. // was signaled or exited cleanly
func ExitStatus(status syscall.WaitStatus) int { func ExitStatus(status syscall.WaitStatus) int {
if status.Signaled() { if status.Signaled() {
return exitSignalOffset + int(status.Signal()) return exitSignalOffset + int(status.Signal())
} }
return status.ExitStatus() return status.ExitStatus()
} }
// WriteJSON writes the provided struct v to w using standard json marshaling
func WriteJSON(w io.Writer, v interface{}) error {
data, err := json.Marshal(v)
if err != nil {
return err
}
_, err = w.Write(data)
return err
}
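WriteJSON replaces the scattered json.NewEncoder(...).Encode(...) calls in the hunks above; any io.Writer works. A sketch with os.Stdout and an illustrative payload:

package main

import (
	"os"

	"github.com/opencontainers/runc/libcontainer/utils"
)

func main() {
	payload := map[string]interface{}{"hostname": "demo", "readonly": true}
	if err := utils.WriteJSON(os.Stdout, payload); err != nil {
		panic(err)
	}
}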

View File

@ -22,7 +22,7 @@ import (
type NodeInfo struct { type NodeInfo struct {
Cores int Cores int
Mem int64 // in bytes Mem uint64 // in bytes
} }
func nodeInfo(si *mesos.SlaveInfo, ei *mesos.ExecutorInfo) NodeInfo { func nodeInfo(si *mesos.SlaveInfo, ei *mesos.ExecutorInfo) NodeInfo {
@ -57,13 +57,13 @@ func nodeInfo(si *mesos.SlaveInfo, ei *mesos.ExecutorInfo) NodeInfo {
// TODO(sttts): switch to float64 when "Machine Allocables" are implemented // TODO(sttts): switch to float64 when "Machine Allocables" are implemented
ni.Cores += int(r.GetScalar().GetValue()) ni.Cores += int(r.GetScalar().GetValue())
case "mem": case "mem":
ni.Mem += int64(r.GetScalar().GetValue()) * 1024 * 1024 ni.Mem += uint64(r.GetScalar().GetValue()) * 1024 * 1024
} }
} }
// TODO(sttts): subtract executorCPU/Mem from static pod resources before subtracting them from the capacity // TODO(sttts): subtract executorCPU/Mem from static pod resources before subtracting them from the capacity
ni.Cores -= int(executorCPU) ni.Cores -= int(executorCPU)
ni.Mem -= int64(executorMem) * 1024 * 1024 ni.Mem -= uint64(executorMem) * 1024 * 1024
return ni return ni
} }

View File

@ -25,10 +25,10 @@ import (
type MesosCadvisor struct { type MesosCadvisor struct {
cadvisor.Interface cadvisor.Interface
cores int cores int
mem int64 mem uint64
} }
func NewMesosCadvisor(cores int, mem int64, port uint) (*MesosCadvisor, error) { func NewMesosCadvisor(cores int, mem uint64, port uint) (*MesosCadvisor, error) {
c, err := cadvisor.New(port) c, err := cadvisor.New(port)
if err != nil { if err != nil {
return nil, err return nil, err

View File

@ -28,7 +28,7 @@ func CapacityFromMachineInfo(info *cadvisorApi.MachineInfo) api.ResourceList {
int64(info.NumCores*1000), int64(info.NumCores*1000),
resource.DecimalSI), resource.DecimalSI),
api.ResourceMemory: *resource.NewQuantity( api.ResourceMemory: *resource.NewQuantity(
info.MemoryCapacity, int64(info.MemoryCapacity),
resource.BinarySI), resource.BinarySI),
} }
return c return c

View File

@ -126,8 +126,11 @@ func NewContainerManager(mountUtil mount.Interface, cadvisorInterface cadvisor.I
func createManager(containerName string) *fs.Manager { func createManager(containerName string) *fs.Manager {
return &fs.Manager{ return &fs.Manager{
Cgroups: &configs.Cgroup{ Cgroups: &configs.Cgroup{
Name: containerName, Parent: "/",
AllowAllDevices: true, Name: containerName,
Resources: &configs.Resources{
AllowAllDevices: true,
},
}, },
} }
} }
@ -208,10 +211,13 @@ func (cm *containerManagerImpl) setupNode() error {
dockerContainer := &fs.Manager{ dockerContainer := &fs.Manager{
Cgroups: &configs.Cgroup{ Cgroups: &configs.Cgroup{
Name: cm.DockerDaemonContainerName, Parent: "/",
Memory: memoryLimit, Name: cm.DockerDaemonContainerName,
MemorySwap: -1, Resources: &configs.Resources{
AllowAllDevices: true, Memory: memoryLimit,
MemorySwap: -1,
AllowAllDevices: true,
},
}, },
} }
cont.ensureStateFunc = func(manager *fs.Manager) error { cont.ensureStateFunc = func(manager *fs.Manager) error {
@ -227,7 +233,8 @@ func (cm *containerManagerImpl) setupNode() error {
rootContainer := &fs.Manager{ rootContainer := &fs.Manager{
Cgroups: &configs.Cgroup{ Cgroups: &configs.Cgroup{
Name: "/", Parent: "/",
Name: "/",
}, },
} }
manager := createManager(cm.SystemContainerName) manager := createManager(cm.SystemContainerName)
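All of these call sites now follow the same configs.Cgroup shape: Parent is set explicitly to "/" and the resource knobs live under a nested Resources struct. A sketch of a manager built that way, with an illustrative name and memory limit:

package main

import (
	"fmt"

	"github.com/opencontainers/runc/libcontainer/cgroups/fs"
	"github.com/opencontainers/runc/libcontainer/configs"
)

func main() {
	m := &fs.Manager{
		Cgroups: &configs.Cgroup{
			Parent: "/",
			Name:   "/example-container",
			Resources: &configs.Resources{
				AllowAllDevices: true,
				Memory:          512 * 1024 * 1024, // 512 MiB, illustrative
				MemorySwap:      -1,                // unlimited swap
			},
		},
	}
	fmt.Println("managing cgroup:", m.Cgroups.Name)
}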
@ -377,40 +384,23 @@ func ensureSystemContainer(rootContainer *fs.Manager, manager *fs.Manager) error
continue continue
} }
// Get PIDs already in target group so we can remove them from the list of // Remove kernel pids and other protected PIDs (pid 1, PIDs already in system & kubelet containers)
// PIDs to move.
systemCgroupPIDs, err := manager.GetPids()
if err != nil {
errs = append(errs, fmt.Errorf("failed to list PIDs for %s: %v", manager.Cgroups.Name, err))
continue
}
systemCgroupPIDMap := make(map[int]struct{}, len(systemCgroupPIDs))
for _, pid := range systemCgroupPIDs {
systemCgroupPIDMap[pid] = struct{}{}
}
// Remove kernel pids and process 1
pids := make([]int, 0, len(allPids)) pids := make([]int, 0, len(allPids))
for _, pid := range allPids { for _, pid := range allPids {
if isKernelPid(pid) { if isKernelPid(pid) {
continue continue
} }
if _, ok := systemCgroupPIDMap[pid]; ok {
continue
}
pids = append(pids, pid) pids = append(pids, pid)
} }
glog.Infof("Found %d PIDs in root, %d of them are kernel related", len(allPids), len(allPids)-len(pids)) glog.Infof("Found %d PIDs in root, %d of them are not to be moved", len(allPids), len(allPids)-len(pids))
// Check if we moved all the non-kernel PIDs. // Check if we have moved all the non-kernel PIDs.
if len(pids) == 0 { if len(pids) == 0 {
break break
} }
glog.Infof("Moving non-kernel threads: %v", pids) glog.Infof("Moving non-kernel processes: %v", pids)
for _, pid := range pids { for _, pid := range pids {
err := manager.Apply(pid) err := manager.Apply(pid)
if err != nil { if err != nil {

View File

@ -1572,7 +1572,7 @@ func (dm *DockerManager) runContainerInPod(pod *api.Pod, container *api.Containe
if container.Name == PodInfraContainerName { if container.Name == PodInfraContainerName {
oomScoreAdj = qos.PodInfraOOMAdj oomScoreAdj = qos.PodInfraOOMAdj
} else { } else {
oomScoreAdj = qos.GetContainerOOMScoreAdjust(container, dm.machineInfo.MemoryCapacity) oomScoreAdj = qos.GetContainerOOMScoreAdjust(container, int64(dm.machineInfo.MemoryCapacity))
} }
cgroupName, err := dm.procFs.GetFullContainerName(containerInfo.State.Pid) cgroupName, err := dm.procFs.GetFullContainerName(containerInfo.State.Pid)
if err != nil { if err != nil {

View File

@ -41,7 +41,8 @@ func NewOOMAdjuster() *OOMAdjuster {
func getPids(cgroupName string) ([]int, error) { func getPids(cgroupName string) ([]int, error) {
fsManager := fs.Manager{ fsManager := fs.Manager{
Cgroups: &configs.Cgroup{ Cgroups: &configs.Cgroup{
Name: cgroupName, Parent: "/",
Name: cgroupName,
}, },
} }
return fsManager.GetPids() return fsManager.GetPids()

View File

@ -33,8 +33,11 @@ import (
func RunInResourceContainer(containerName string) error { func RunInResourceContainer(containerName string) error {
manager := fs.Manager{ manager := fs.Manager{
Cgroups: &configs.Cgroup{ Cgroups: &configs.Cgroup{
Name: containerName, Parent: "/",
AllowAllDevices: true, Name: containerName,
Resources: &configs.Resources{
AllowAllDevices: true,
},
}, },
} }