diff --git a/Godeps/Godeps.json b/Godeps/Godeps.json index 5aa98147444..a2ec9c25b28 100644 --- a/Godeps/Godeps.json +++ b/Godeps/Godeps.json @@ -579,93 +579,93 @@ }, { "ImportPath": "github.com/google/cadvisor/api", - "Comment": "v0.20.4", - "Rev": "59488ce2c4197f501283739c6a4dd3169999f317" + "Comment": "v0.20.5", + "Rev": "9aa348ff5e191fcf3eccd59e5a434022aca77b87" }, { "ImportPath": "github.com/google/cadvisor/cache/memory", - "Comment": "v0.20.4", - "Rev": "59488ce2c4197f501283739c6a4dd3169999f317" + "Comment": "v0.20.5", + "Rev": "9aa348ff5e191fcf3eccd59e5a434022aca77b87" }, { "ImportPath": "github.com/google/cadvisor/collector", - "Comment": "v0.20.4", - "Rev": "59488ce2c4197f501283739c6a4dd3169999f317" + "Comment": "v0.20.5", + "Rev": "9aa348ff5e191fcf3eccd59e5a434022aca77b87" }, { "ImportPath": "github.com/google/cadvisor/container", - "Comment": "v0.20.4", - "Rev": "59488ce2c4197f501283739c6a4dd3169999f317" + "Comment": "v0.20.5", + "Rev": "9aa348ff5e191fcf3eccd59e5a434022aca77b87" }, { "ImportPath": "github.com/google/cadvisor/events", - "Comment": "v0.20.4", - "Rev": "59488ce2c4197f501283739c6a4dd3169999f317" + "Comment": "v0.20.5", + "Rev": "9aa348ff5e191fcf3eccd59e5a434022aca77b87" }, { "ImportPath": "github.com/google/cadvisor/fs", - "Comment": "v0.20.4", - "Rev": "59488ce2c4197f501283739c6a4dd3169999f317" + "Comment": "v0.20.5", + "Rev": "9aa348ff5e191fcf3eccd59e5a434022aca77b87" }, { "ImportPath": "github.com/google/cadvisor/healthz", - "Comment": "v0.20.4", - "Rev": "59488ce2c4197f501283739c6a4dd3169999f317" + "Comment": "v0.20.5", + "Rev": "9aa348ff5e191fcf3eccd59e5a434022aca77b87" }, { "ImportPath": "github.com/google/cadvisor/http", - "Comment": "v0.20.4", - "Rev": "59488ce2c4197f501283739c6a4dd3169999f317" + "Comment": "v0.20.5", + "Rev": "9aa348ff5e191fcf3eccd59e5a434022aca77b87" }, { "ImportPath": "github.com/google/cadvisor/info/v1", - "Comment": "v0.20.4", - "Rev": "59488ce2c4197f501283739c6a4dd3169999f317" + "Comment": "v0.20.5", + "Rev": 
"9aa348ff5e191fcf3eccd59e5a434022aca77b87" }, { "ImportPath": "github.com/google/cadvisor/info/v2", - "Comment": "v0.20.4", - "Rev": "59488ce2c4197f501283739c6a4dd3169999f317" + "Comment": "v0.20.5", + "Rev": "9aa348ff5e191fcf3eccd59e5a434022aca77b87" }, { "ImportPath": "github.com/google/cadvisor/manager", - "Comment": "v0.20.4", - "Rev": "59488ce2c4197f501283739c6a4dd3169999f317" + "Comment": "v0.20.5", + "Rev": "9aa348ff5e191fcf3eccd59e5a434022aca77b87" }, { "ImportPath": "github.com/google/cadvisor/metrics", - "Comment": "v0.20.4", - "Rev": "59488ce2c4197f501283739c6a4dd3169999f317" + "Comment": "v0.20.5", + "Rev": "9aa348ff5e191fcf3eccd59e5a434022aca77b87" }, { "ImportPath": "github.com/google/cadvisor/pages", - "Comment": "v0.20.4", - "Rev": "59488ce2c4197f501283739c6a4dd3169999f317" + "Comment": "v0.20.5", + "Rev": "9aa348ff5e191fcf3eccd59e5a434022aca77b87" }, { "ImportPath": "github.com/google/cadvisor/storage", - "Comment": "v0.20.4", - "Rev": "59488ce2c4197f501283739c6a4dd3169999f317" + "Comment": "v0.20.5", + "Rev": "9aa348ff5e191fcf3eccd59e5a434022aca77b87" }, { "ImportPath": "github.com/google/cadvisor/summary", - "Comment": "v0.20.4", - "Rev": "59488ce2c4197f501283739c6a4dd3169999f317" + "Comment": "v0.20.5", + "Rev": "9aa348ff5e191fcf3eccd59e5a434022aca77b87" }, { "ImportPath": "github.com/google/cadvisor/utils", - "Comment": "v0.20.4", - "Rev": "59488ce2c4197f501283739c6a4dd3169999f317" + "Comment": "v0.20.5", + "Rev": "9aa348ff5e191fcf3eccd59e5a434022aca77b87" }, { "ImportPath": "github.com/google/cadvisor/validate", - "Comment": "v0.20.4", - "Rev": "59488ce2c4197f501283739c6a4dd3169999f317" + "Comment": "v0.20.5", + "Rev": "9aa348ff5e191fcf3eccd59e5a434022aca77b87" }, { "ImportPath": "github.com/google/cadvisor/version", - "Comment": "v0.20.4", - "Rev": "59488ce2c4197f501283739c6a4dd3169999f317" + "Comment": "v0.20.5", + "Rev": "9aa348ff5e191fcf3eccd59e5a434022aca77b87" }, { "ImportPath": "github.com/google/gofuzz", @@ -793,8 +793,8 @@ }, { 
"ImportPath": "github.com/opencontainers/runc/libcontainer", - "Comment": "v0.0.5", - "Rev": "97bc9a7faf3dd660d9be90a2880b2e37f3cdbf38" + "Comment": "v0.0.7", + "Rev": "7ca2aa4873aea7cb4265b1726acb24b90d8726c6" }, { "ImportPath": "github.com/pborman/uuid", diff --git a/Godeps/_workspace/src/github.com/google/cadvisor/container/docker/handler.go b/Godeps/_workspace/src/github.com/google/cadvisor/container/docker/handler.go index 89d443edc54..9222d8df46a 100644 --- a/Godeps/_workspace/src/github.com/google/cadvisor/container/docker/handler.go +++ b/Godeps/_workspace/src/github.com/google/cadvisor/container/docker/handler.go @@ -139,6 +139,7 @@ func newDockerContainerHandler( rootFs: rootFs, rootfsStorageDir: rootfsStorageDir, fsHandler: newFsHandler(time.Minute, rootfsStorageDir, otherStorageDir, fsInfo), + envs: make(map[string]string), } // We assume that if Inspect fails then the container is not known to docker. @@ -206,36 +207,31 @@ func libcontainerConfigToContainerSpec(config *libcontainerconfigs.Config, mi *i spec.HasMemory = true spec.Memory.Limit = math.MaxUint64 spec.Memory.SwapLimit = math.MaxUint64 - if config.Cgroups.Memory > 0 { - spec.Memory.Limit = uint64(config.Cgroups.Memory) - } - if config.Cgroups.MemorySwap > 0 { - spec.Memory.SwapLimit = uint64(config.Cgroups.MemorySwap) - } - // Get CPU info - spec.HasCpu = true - spec.Cpu.Limit = 1024 - if config.Cgroups.CpuShares != 0 { - spec.Cpu.Limit = uint64(config.Cgroups.CpuShares) + if config.Cgroups.Resources != nil { + if config.Cgroups.Resources.Memory > 0 { + spec.Memory.Limit = uint64(config.Cgroups.Resources.Memory) + } + if config.Cgroups.Resources.MemorySwap > 0 { + spec.Memory.SwapLimit = uint64(config.Cgroups.Resources.MemorySwap) + } + + // Get CPU info + spec.HasCpu = true + spec.Cpu.Limit = 1024 + if config.Cgroups.Resources.CpuShares != 0 { + spec.Cpu.Limit = uint64(config.Cgroups.Resources.CpuShares) + } + spec.Cpu.Mask = utils.FixCpuMask(config.Cgroups.Resources.CpusetCpus, 
mi.NumCores) } - spec.Cpu.Mask = utils.FixCpuMask(config.Cgroups.CpusetCpus, mi.NumCores) spec.HasDiskIo = true return spec } -var ( - hasNetworkModes = map[string]bool{ - "host": true, - "bridge": true, - "default": true, - } -) - func hasNet(networkMode string) bool { - return hasNetworkModes[networkMode] + return !strings.HasPrefix(networkMode, "container:") } func (self *dockerContainerHandler) GetSpec() (info.ContainerSpec, error) { diff --git a/Godeps/_workspace/src/github.com/google/cadvisor/container/libcontainer/compatibility.go b/Godeps/_workspace/src/github.com/google/cadvisor/container/libcontainer/compatibility.go index b14d3b36b59..7d484170e4b 100644 --- a/Godeps/_workspace/src/github.com/google/cadvisor/container/libcontainer/compatibility.go +++ b/Godeps/_workspace/src/github.com/google/cadvisor/container/libcontainer/compatibility.go @@ -292,31 +292,32 @@ func convertOldConfigToNew(config v1Config) *configs.Config { result.Routes = config.Config.Routes var newCgroup = &configs.Cgroup{ - Name: old.Name, - Parent: old.Parent, - AllowAllDevices: old.AllowAllDevices, - AllowedDevices: old.AllowedDevices, - DeniedDevices: old.DeniedDevices, - Memory: old.Memory, - MemoryReservation: old.MemoryReservation, - MemorySwap: old.MemorySwap, - KernelMemory: old.KernelMemory, - CpuShares: old.CpuShares, - CpuQuota: old.CpuQuota, - CpuPeriod: old.CpuPeriod, - CpuRtRuntime: old.CpuRtRuntime, - CpuRtPeriod: old.CpuRtPeriod, - CpusetCpus: old.CpusetCpus, - CpusetMems: old.CpusetMems, - BlkioWeight: old.BlkioWeight, - BlkioLeafWeight: old.BlkioLeafWeight, - Freezer: old.Freezer, - HugetlbLimit: old.HugetlbLimit, - Slice: old.Slice, - OomKillDisable: old.OomKillDisable, - MemorySwappiness: old.MemorySwappiness, - NetPrioIfpriomap: old.NetPrioIfpriomap, - NetClsClassid: old.NetClsClassid, + Name: old.Name, + Parent: old.Parent, + Resources: &configs.Resources{ + AllowAllDevices: old.Resources.AllowAllDevices, + AllowedDevices: old.Resources.AllowedDevices, + 
DeniedDevices: old.Resources.DeniedDevices, + Memory: old.Resources.Memory, + MemoryReservation: old.Resources.MemoryReservation, + MemorySwap: old.Resources.MemorySwap, + KernelMemory: old.Resources.KernelMemory, + CpuShares: old.Resources.CpuShares, + CpuQuota: old.Resources.CpuQuota, + CpuPeriod: old.Resources.CpuPeriod, + CpuRtRuntime: old.Resources.CpuRtRuntime, + CpuRtPeriod: old.Resources.CpuRtPeriod, + CpusetCpus: old.Resources.CpusetCpus, + CpusetMems: old.Resources.CpusetMems, + BlkioWeight: old.Resources.BlkioWeight, + BlkioLeafWeight: old.Resources.BlkioLeafWeight, + Freezer: old.Resources.Freezer, + HugetlbLimit: old.Resources.HugetlbLimit, + OomKillDisable: old.Resources.OomKillDisable, + MemorySwappiness: old.Resources.MemorySwappiness, + NetPrioIfpriomap: old.Resources.NetPrioIfpriomap, + NetClsClassid: old.Resources.NetClsClassid, + }, } result.Cgroups = newCgroup diff --git a/Godeps/_workspace/src/github.com/google/cadvisor/fs/fs.go b/Godeps/_workspace/src/github.com/google/cadvisor/fs/fs.go index 32acef535ec..c08fe445930 100644 --- a/Godeps/_workspace/src/github.com/google/cadvisor/fs/fs.go +++ b/Godeps/_workspace/src/github.com/google/cadvisor/fs/fs.go @@ -54,6 +54,8 @@ type RealFsInfo struct { // Map from label to block device path. // Labels are intent-specific tags that are auto-detected. labels map[string]string + + dmsetup dmsetupClient } type Context struct { @@ -67,9 +69,11 @@ func NewFsInfo(context Context) (FsInfo, error) { if err != nil { return nil, err } - partitions := make(map[string]partition, 0) - fsInfo := &RealFsInfo{} - fsInfo.labels = make(map[string]string, 0) + fsInfo := &RealFsInfo{ + partitions: make(map[string]partition, 0), + labels: make(map[string]string, 0), + dmsetup: &defaultDmsetupClient{}, + } supportedFsType := map[string]bool{ // all ext systems are checked through prefix. "btrfs": true, @@ -82,49 +86,87 @@ func NewFsInfo(context Context) (FsInfo, error) { continue } // Avoid bind mounts. 
- if _, ok := partitions[mount.Source]; ok { + if _, ok := fsInfo.partitions[mount.Source]; ok { continue } if mount.Fstype == "zfs" { Fstype = mount.Fstype } - partitions[mount.Source] = partition{ + fsInfo.partitions[mount.Source] = partition{ fsType: Fstype, mountpoint: mount.Mountpoint, major: uint(mount.Major), minor: uint(mount.Minor), } } - if storageDriver, ok := context.DockerInfo["Driver"]; ok && storageDriver == "devicemapper" { - dev, major, minor, blockSize, err := dockerDMDevice(context.DockerInfo["DriverStatus"]) - if err != nil { - glog.Warningf("Could not get Docker devicemapper device: %v", err) - } else { - partitions[dev] = partition{ - fsType: "devicemapper", - major: major, - minor: minor, - blockSize: blockSize, - } - fsInfo.labels[LabelDockerImages] = dev - } - } - glog.Infof("Filesystem partitions: %+v", partitions) - fsInfo.partitions = partitions - fsInfo.addLabels(context) + + // need to call this before the log line below printing out the partitions, as this function may + // add a "partition" for devicemapper to fsInfo.partitions + fsInfo.addDockerImagesLabel(context) + + glog.Infof("Filesystem partitions: %+v", fsInfo.partitions) + fsInfo.addSystemRootLabel() return fsInfo, nil } -func (self *RealFsInfo) addLabels(context Context) { - dockerPaths := getDockerImagePaths(context) +// getDockerDeviceMapperInfo returns information about the devicemapper device and "partition" if +// docker is using devicemapper for its storage driver. If a loopback device is being used, don't +// return any information or error, as we want to report based on the actual partition where the +// loopback file resides, instead of the loopback file itself. 
+func (self *RealFsInfo) getDockerDeviceMapperInfo(dockerInfo map[string]string) (string, *partition, error) { + if storageDriver, ok := dockerInfo["Driver"]; ok && storageDriver != "devicemapper" { + return "", nil, nil + } + + var driverStatus [][]string + if err := json.Unmarshal([]byte(dockerInfo["DriverStatus"]), &driverStatus); err != nil { + return "", nil, err + } + + dataLoopFile := dockerStatusValue(driverStatus, "Data loop file") + if len(dataLoopFile) > 0 { + return "", nil, nil + } + + dev, major, minor, blockSize, err := dockerDMDevice(driverStatus, self.dmsetup) + if err != nil { + return "", nil, err + } + + return dev, &partition{ + fsType: "devicemapper", + major: major, + minor: minor, + blockSize: blockSize, + }, nil +} + +// addSystemRootLabel attempts to determine which device contains the mount for /. +func (self *RealFsInfo) addSystemRootLabel() { for src, p := range self.partitions { if p.mountpoint == "/" { if _, ok := self.labels[LabelSystemRoot]; !ok { self.labels[LabelSystemRoot] = src } } - self.updateDockerImagesPath(src, p.mountpoint, dockerPaths) - // TODO(rjnagal): Add label for docker devicemapper pool. + } +} + +// addDockerImagesLabel attempts to determine which device contains the mount for docker images. 
+func (self *RealFsInfo) addDockerImagesLabel(context Context) { + dockerDev, dockerPartition, err := self.getDockerDeviceMapperInfo(context.DockerInfo) + if err != nil { + glog.Warningf("Could not get Docker devicemapper device: %v", err) + } + if len(dockerDev) > 0 && dockerPartition != nil { + self.partitions[dockerDev] = *dockerPartition + self.labels[LabelDockerImages] = dockerDev + } else { + dockerPaths := getDockerImagePaths(context) + + for src, p := range self.partitions { + self.updateDockerImagesPath(src, p.mountpoint, dockerPaths) + } } } @@ -345,20 +387,30 @@ func dockerStatusValue(status [][]string, target string) string { return "" } +// dmsetupClient knows how to interact with dmsetup to retrieve information about devicemapper. +type dmsetupClient interface { + table(poolName string) ([]byte, error) + //TODO add status(poolName string) ([]byte, error) and use it in getDMStats so we can unit test +} + +// defaultDmsetupClient implements the standard behavior for interacting with dmsetup. 
+type defaultDmsetupClient struct{} + +var _ dmsetupClient = &defaultDmsetupClient{} + +func (*defaultDmsetupClient) table(poolName string) ([]byte, error) { + return exec.Command("dmsetup", "table", poolName).Output() +} + // Devicemapper thin provisioning is detailed at // https://www.kernel.org/doc/Documentation/device-mapper/thin-provisioning.txt -func dockerDMDevice(driverStatus string) (string, uint, uint, uint, error) { - var config [][]string - err := json.Unmarshal([]byte(driverStatus), &config) - if err != nil { - return "", 0, 0, 0, err - } - poolName := dockerStatusValue(config, "Pool Name") +func dockerDMDevice(driverStatus [][]string, dmsetup dmsetupClient) (string, uint, uint, uint, error) { + poolName := dockerStatusValue(driverStatus, "Pool Name") if len(poolName) == 0 { return "", 0, 0, 0, fmt.Errorf("Could not get dm pool name") } - out, err := exec.Command("dmsetup", "table", poolName).Output() + out, err := dmsetup.table(poolName) if err != nil { return "", 0, 0, 0, err } diff --git a/Godeps/_workspace/src/github.com/google/cadvisor/info/v1/machine.go b/Godeps/_workspace/src/github.com/google/cadvisor/info/v1/machine.go index 7f50ce3b244..f26291c1125 100644 --- a/Godeps/_workspace/src/github.com/google/cadvisor/info/v1/machine.go +++ b/Godeps/_workspace/src/github.com/google/cadvisor/info/v1/machine.go @@ -136,7 +136,7 @@ type MachineInfo struct { CpuFrequency uint64 `json:"cpu_frequency_khz"` // The amount of memory (in bytes) in this machine - MemoryCapacity int64 `json:"memory_capacity"` + MemoryCapacity uint64 `json:"memory_capacity"` // The machine id MachineID string `json:"machine_id"` diff --git a/Godeps/_workspace/src/github.com/google/cadvisor/info/v2/machine.go b/Godeps/_workspace/src/github.com/google/cadvisor/info/v2/machine.go index 50fbee24410..b6c3f24f359 100644 --- a/Godeps/_workspace/src/github.com/google/cadvisor/info/v2/machine.go +++ b/Godeps/_workspace/src/github.com/google/cadvisor/info/v2/machine.go @@ -41,7 +41,7 @@ 
type Attributes struct { CpuFrequency uint64 `json:"cpu_frequency_khz"` // The amount of memory (in bytes) in this machine - MemoryCapacity int64 `json:"memory_capacity"` + MemoryCapacity uint64 `json:"memory_capacity"` // The machine id MachineID string `json:"machine_id"` diff --git a/Godeps/_workspace/src/github.com/google/cadvisor/utils/machine/machine.go b/Godeps/_workspace/src/github.com/google/cadvisor/utils/machine/machine.go index 547f1077dbd..fb125816934 100644 --- a/Godeps/_workspace/src/github.com/google/cadvisor/utils/machine/machine.go +++ b/Godeps/_workspace/src/github.com/google/cadvisor/utils/machine/machine.go @@ -82,8 +82,8 @@ func GetClockSpeed(procInfo []byte) (uint64, error) { } // GetMachineMemoryCapacity returns the machine's total memory from /proc/meminfo. -// Returns the total memory capacity as an int64 (number of bytes). -func GetMachineMemoryCapacity() (int64, error) { +// Returns the total memory capacity as an uint64 (number of bytes). +func GetMachineMemoryCapacity() (uint64, error) { out, err := ioutil.ReadFile("/proc/meminfo") if err != nil { return 0, err @@ -97,8 +97,8 @@ func GetMachineMemoryCapacity() (int64, error) { } // GetMachineSwapCapacity returns the machine's total swap from /proc/meminfo. -// Returns the total swap capacity as an int64 (number of bytes). -func GetMachineSwapCapacity() (int64, error) { +// Returns the total swap capacity as an uint64 (number of bytes). +func GetMachineSwapCapacity() (uint64, error) { out, err := ioutil.ReadFile("/proc/meminfo") if err != nil { return 0, err @@ -113,14 +113,14 @@ func GetMachineSwapCapacity() (int64, error) { // parseCapacity matches a Regexp in a []byte, returning the resulting value in bytes. // Assumes that the value matched by the Regexp is in KB. 
-func parseCapacity(b []byte, r *regexp.Regexp) (int64, error) { +func parseCapacity(b []byte, r *regexp.Regexp) (uint64, error) { matches := r.FindSubmatch(b) if len(matches) != 2 { - return -1, fmt.Errorf("failed to match regexp in output: %q", string(b)) + return 0, fmt.Errorf("failed to match regexp in output: %q", string(b)) } - m, err := strconv.ParseInt(string(matches[1]), 10, 64) + m, err := strconv.ParseUint(string(matches[1]), 10, 64) if err != nil { - return -1, err + return 0, err } // Convert to bytes. diff --git a/Godeps/_workspace/src/github.com/google/cadvisor/version/VERSION b/Godeps/_workspace/src/github.com/google/cadvisor/version/VERSION index a4e543eba38..58d8f8d4a98 100644 --- a/Godeps/_workspace/src/github.com/google/cadvisor/version/VERSION +++ b/Godeps/_workspace/src/github.com/google/cadvisor/version/VERSION @@ -1 +1 @@ -0.20.4 \ No newline at end of file +0.20.5 \ No newline at end of file diff --git a/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/README.md b/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/README.md index 295edb4f7e5..fc6b4b0b184 100644 --- a/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/README.md +++ b/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/README.md @@ -10,80 +10,165 @@ host system and which is (optionally) isolated from other containers in the syst #### Using libcontainer -To create a container you first have to initialize an instance of a factory -that will handle the creation and initialization for a container. - -Because containers are spawned in a two step process you will need to provide -arguments to a binary that will be executed as the init process for the container. -To use the current binary that is spawning the containers and acting as the parent -you can use `os.Args[0]` and we have a command called `init` setup. 
+Because containers are spawned in a two-step process, you will need a binary that +will be executed as the init process for the container. In libcontainer, the +current binary (/proc/self/exe) is executed as the init process with the argument +"init". We call this first step "bootstrap", so you always need an "init" +function as the entry point of "bootstrap". ```go -root, err := libcontainer.New("/var/lib/container", libcontainer.InitArgs(os.Args[0], "init")) +func init() { + if len(os.Args) > 1 && os.Args[1] == "init" { + runtime.GOMAXPROCS(1) + runtime.LockOSThread() + factory, _ := libcontainer.New("") + if err := factory.StartInitialization(); err != nil { + logrus.Fatal(err) + } + panic("--this line should have never been executed, congratulations--") + } +} +``` + +Then to create a container you first have to initialize an instance of a factory +that will handle the creation and initialization for a container. + +```go +factory, err := libcontainer.New("/var/lib/container", libcontainer.Cgroupfs, libcontainer.InitArgs(os.Args[0], "init")) if err != nil { - log.Fatal(err) + logrus.Fatal(err) + return } ``` Once you have an instance of the factory created we can create a configuration -struct describing how the container is to be created. A sample would look similar to this: +struct describing how the container is to be created. 
A sample would look similar to this: ```go +defaultMountFlags := syscall.MS_NOEXEC | syscall.MS_NOSUID | syscall.MS_NODEV config := &configs.Config{ - Rootfs: rootfs, - Capabilities: []string{ - "CAP_CHOWN", - "CAP_DAC_OVERRIDE", - "CAP_FSETID", - "CAP_FOWNER", - "CAP_MKNOD", - "CAP_NET_RAW", - "CAP_SETGID", - "CAP_SETUID", - "CAP_SETFCAP", - "CAP_SETPCAP", - "CAP_NET_BIND_SERVICE", - "CAP_SYS_CHROOT", - "CAP_KILL", - "CAP_AUDIT_WRITE", - }, - Namespaces: configs.Namespaces([]configs.Namespace{ - {Type: configs.NEWNS}, - {Type: configs.NEWUTS}, - {Type: configs.NEWIPC}, - {Type: configs.NEWPID}, - {Type: configs.NEWNET}, - }), - Cgroups: &configs.Cgroup{ - Name: "test-container", - Parent: "system", - AllowAllDevices: false, - AllowedDevices: configs.DefaultAllowedDevices, - }, - - Devices: configs.DefaultAutoCreatedDevices, - Hostname: "testing", - Networks: []*configs.Network{ - { - Type: "loopback", - Address: "127.0.0.1/0", - Gateway: "localhost", - }, - }, - Rlimits: []configs.Rlimit{ - { - Type: syscall.RLIMIT_NOFILE, - Hard: uint64(1024), - Soft: uint64(1024), - }, - }, + Rootfs: "/your/path/to/rootfs", + Capabilities: []string{ + "CAP_CHOWN", + "CAP_DAC_OVERRIDE", + "CAP_FSETID", + "CAP_FOWNER", + "CAP_MKNOD", + "CAP_NET_RAW", + "CAP_SETGID", + "CAP_SETUID", + "CAP_SETFCAP", + "CAP_SETPCAP", + "CAP_NET_BIND_SERVICE", + "CAP_SYS_CHROOT", + "CAP_KILL", + "CAP_AUDIT_WRITE", + }, + Namespaces: configs.Namespaces([]configs.Namespace{ + {Type: configs.NEWNS}, + {Type: configs.NEWUTS}, + {Type: configs.NEWIPC}, + {Type: configs.NEWPID}, + {Type: configs.NEWUSER}, + {Type: configs.NEWNET}, + }), + Cgroups: &configs.Cgroup{ + Name: "test-container", + Parent: "system", + Resources: &configs.Resources{ + MemorySwappiness: -1, + AllowAllDevices: false, + AllowedDevices: configs.DefaultAllowedDevices, + }, + }, + MaskPaths: []string{ + "/proc/kcore", + }, + ReadonlyPaths: []string{ + "/proc/sys", "/proc/sysrq-trigger", "/proc/irq", "/proc/bus", + }, + Devices: 
configs.DefaultAutoCreatedDevices, + Hostname: "testing", + Mounts: []*configs.Mount{ + { + Source: "proc", + Destination: "/proc", + Device: "proc", + Flags: defaultMountFlags, + }, + { + Source: "tmpfs", + Destination: "/dev", + Device: "tmpfs", + Flags: syscall.MS_NOSUID | syscall.MS_STRICTATIME, + Data: "mode=755", + }, + { + Source: "devpts", + Destination: "/dev/pts", + Device: "devpts", + Flags: syscall.MS_NOSUID | syscall.MS_NOEXEC, + Data: "newinstance,ptmxmode=0666,mode=0620,gid=5", + }, + { + Device: "tmpfs", + Source: "shm", + Destination: "/dev/shm", + Data: "mode=1777,size=65536k", + Flags: defaultMountFlags, + }, + { + Source: "mqueue", + Destination: "/dev/mqueue", + Device: "mqueue", + Flags: defaultMountFlags, + }, + { + Source: "sysfs", + Destination: "/sys", + Device: "sysfs", + Flags: defaultMountFlags | syscall.MS_RDONLY, + }, + }, + UidMappings: []configs.IDMap{ + { + ContainerID: 0, + Host: 1000, + size: 65536, + }, + }, + GidMappings: []configs.IDMap{ + { + ContainerID: 0, + Host: 1000, + size: 65536, + }, + }, + Networks: []*configs.Network{ + { + Type: "loopback", + Address: "127.0.0.1/0", + Gateway: "localhost", + }, + }, + Rlimits: []configs.Rlimit{ + { + Type: syscall.RLIMIT_NOFILE, + Hard: uint64(1025), + Soft: uint64(1025), + }, + }, } ``` Once you have the configuration populated you can create a container: ```go -container, err := root.Create("container-id", config) +container, err := factory.Create("container-id", config) +if err != nil { + logrus.Fatal(err) + return +} ``` To spawn bash as the initial process inside the container and have the @@ -91,23 +176,25 @@ processes pid returned in order to wait, signal, or kill the process: ```go process := &libcontainer.Process{ - Args: []string{"/bin/bash"}, - Env: []string{"PATH=/bin"}, - User: "daemon", - Stdin: os.Stdin, - Stdout: os.Stdout, - Stderr: os.Stderr, + Args: []string{"/bin/bash"}, + Env: []string{"PATH=/bin"}, + User: "daemon", + Stdin: os.Stdin, + Stdout: os.Stdout, + 
Stderr: os.Stderr, } err := container.Start(process) if err != nil { - log.Fatal(err) + logrus.Fatal(err) + container.Destroy() + return } // wait for the process to finish. -status, err := process.Wait() +_, err := process.Wait() if err != nil { - log.Fatal(err) + logrus.Fatal(err) } // destroy the container. @@ -124,7 +211,6 @@ processes, err := container.Processes() // it's processes. stats, err := container.Stats() - // pause all processes inside the container. container.Pause() diff --git a/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/SPEC.md b/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/SPEC.md index fad1dd72a26..615111252f9 100644 --- a/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/SPEC.md +++ b/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/SPEC.md @@ -60,7 +60,7 @@ are required to be mounted within the rootfs that the runtime will setup. After a container's filesystems are mounted within the newly created mount namespace `/dev` will need to be populated with a set of device nodes. It is expected that a rootfs does not need to have any device nodes specified -for `/dev` witin the rootfs as the container will setup the correct devices +for `/dev` within the rootfs as the container will setup the correct devices that are required for executing a container's process. 
| Path | Mode | Access | diff --git a/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/apparmor/apparmor_disabled.go b/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/apparmor/apparmor_disabled.go index 937bf915c75..d4110cf0bc6 100644 --- a/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/apparmor/apparmor_disabled.go +++ b/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/apparmor/apparmor_disabled.go @@ -2,10 +2,19 @@ package apparmor +import ( + "errors" +) + +var ErrApparmorNotEnabled = errors.New("apparmor: config provided but apparmor not supported") + func IsEnabled() bool { return false } func ApplyProfile(name string) error { + if name != "" { + return ErrApparmorNotEnabled + } return nil } diff --git a/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/cgroups/cgroups.go b/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/cgroups/cgroups.go index a08e905caaa..c8f7796567b 100644 --- a/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/cgroups/cgroups.go +++ b/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/cgroups/cgroups.go @@ -15,6 +15,9 @@ type Manager interface { // Returns the PIDs inside the cgroup set GetPids() ([]int, error) + // Returns the PIDs inside the cgroup set & all sub-cgroups + GetAllPids() ([]int, error) + // Returns statistics for the cgroup set GetStats() (*Stats, error) diff --git a/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/apply_raw.go b/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/apply_raw.go index 68ebcfefbf9..4da3b734e18 100644 --- a/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/apply_raw.go +++ b/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/apply_raw.go @@ -23,6 +23,7 @@ var ( &MemoryGroup{}, &CpuGroup{}, &CpuacctGroup{}, + &PidsGroup{}, &BlkioGroup{}, &HugetlbGroup{}, 
&NetClsGroup{}, @@ -112,6 +113,22 @@ func (m *Manager) Apply(pid int) (err error) { return err } + if c.Paths != nil { + paths := make(map[string]string) + for name, path := range c.Paths { + _, err := d.path(name) + if err != nil { + if cgroups.IsNotFound(err) { + continue + } + return err + } + paths[name] = path + } + m.Paths = paths + return cgroups.EnterPid(m.Paths, pid) + } + paths := make(map[string]string) defer func() { if err != nil { @@ -135,17 +152,13 @@ func (m *Manager) Apply(pid int) (err error) { paths[sys.Name()] = p } m.Paths = paths - - if paths["cpu"] != "" { - if err := CheckCpushares(paths["cpu"], c.CpuShares); err != nil { - return err - } - } - return nil } func (m *Manager) Destroy() error { + if m.Cgroups.Paths != nil { + return nil + } m.mu.Lock() defer m.mu.Unlock() if err := cgroups.RemovePaths(m.Paths); err != nil { @@ -179,15 +192,28 @@ func (m *Manager) GetStats() (*cgroups.Stats, error) { } func (m *Manager) Set(container *configs.Config) error { - for name, path := range m.Paths { - sys, err := subsystems.Get(name) - if err == errSubsystemDoesNotExist || !cgroups.PathExists(path) { - continue + for _, sys := range subsystems { + // Generate fake cgroup data. + d, err := getCgroupData(container.Cgroups, -1) + if err != nil { + return err } + // Get the path, but don't error out if the cgroup wasn't found. 
+ path, err := d.path(sys.Name()) + if err != nil && !cgroups.IsNotFound(err) { + return err + } + if err := sys.Set(path, container.Cgroups); err != nil { return err } } + + if m.Paths["cpu"] != "" { + if err := CheckCpushares(m.Paths["cpu"], container.Cgroups.Resources.CpuShares); err != nil { + return err + } + } return nil } @@ -202,40 +228,78 @@ func (m *Manager) Freeze(state configs.FreezerState) error { if err != nil { return err } - prevState := m.Cgroups.Freezer - m.Cgroups.Freezer = state + prevState := m.Cgroups.Resources.Freezer + m.Cgroups.Resources.Freezer = state freezer, err := subsystems.Get("freezer") if err != nil { return err } err = freezer.Set(dir, m.Cgroups) if err != nil { - m.Cgroups.Freezer = prevState + m.Cgroups.Resources.Freezer = prevState return err } return nil } func (m *Manager) GetPids() ([]int, error) { - d, err := getCgroupData(m.Cgroups, 0) + dir, err := getCgroupPath(m.Cgroups) if err != nil { return nil, err } - - dir, err := d.path("devices") - if err != nil { - return nil, err - } - return cgroups.GetPids(dir) } +func (m *Manager) GetAllPids() ([]int, error) { + dir, err := getCgroupPath(m.Cgroups) + if err != nil { + return nil, err + } + return cgroups.GetAllPids(dir) +} + +func getCgroupPath(c *configs.Cgroup) (string, error) { + d, err := getCgroupData(c, 0) + if err != nil { + return "", err + } + + return d.path("devices") +} + +// pathClean makes a path safe for use with filepath.Join. This is done by not +// only cleaning the path, but also (if the path is relative) adding a leading +// '/' and cleaning it (then removing the leading '/'). This ensures that a +// path resulting from prepending another path will always resolve to lexically +// be a subdirectory of the prefixed path. This is all done lexically, so paths +// that include symlinks won't be safe as a result of using pathClean. 
+func pathClean(path string) string { + // Ensure that all paths are cleaned (especially problematic ones like + // "/../../../../../" which can cause lots of issues). + path = filepath.Clean(path) + + // If the path isn't absolute, we need to do more processing to fix paths + // such as "../../../..//some/path". We also shouldn't convert absolute + // paths to relative ones. + if !filepath.IsAbs(path) { + path = filepath.Clean(string(os.PathSeparator) + path) + // This can't fail, as (by definition) all paths are relative to root. + path, _ = filepath.Rel(string(os.PathSeparator), path) + } + + // Clean the path again for good measure. + return filepath.Clean(path) +} + func getCgroupData(c *configs.Cgroup, pid int) (*cgroupData, error) { root, err := getCgroupRoot() if err != nil { return nil, err } + // Clean the parent slice path. + c.Parent = pathClean(c.Parent) + return &cgroupData{ root: root, parent: c.Parent, diff --git a/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/blkio.go b/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/blkio.go index eddba0bf8bd..a142cb991df 100644 --- a/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/blkio.go +++ b/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/blkio.go @@ -22,31 +22,26 @@ func (s *BlkioGroup) Name() string { } func (s *BlkioGroup) Apply(d *cgroupData) error { - dir, err := d.join("blkio") + _, err := d.join("blkio") if err != nil && !cgroups.IsNotFound(err) { return err } - - if err := s.Set(dir, d.config); err != nil { - return err - } - return nil } func (s *BlkioGroup) Set(path string, cgroup *configs.Cgroup) error { - if cgroup.BlkioWeight != 0 { - if err := writeFile(path, "blkio.weight", strconv.FormatUint(uint64(cgroup.BlkioWeight), 10)); err != nil { + if cgroup.Resources.BlkioWeight != 0 { + if err := writeFile(path, "blkio.weight", strconv.FormatUint(uint64(cgroup.Resources.BlkioWeight), 
10)); err != nil { return err } } - if cgroup.BlkioLeafWeight != 0 { - if err := writeFile(path, "blkio.leaf_weight", strconv.FormatUint(uint64(cgroup.BlkioLeafWeight), 10)); err != nil { + if cgroup.Resources.BlkioLeafWeight != 0 { + if err := writeFile(path, "blkio.leaf_weight", strconv.FormatUint(uint64(cgroup.Resources.BlkioLeafWeight), 10)); err != nil { return err } } - for _, wd := range cgroup.BlkioWeightDevice { + for _, wd := range cgroup.Resources.BlkioWeightDevice { if err := writeFile(path, "blkio.weight_device", wd.WeightString()); err != nil { return err } @@ -54,22 +49,22 @@ func (s *BlkioGroup) Set(path string, cgroup *configs.Cgroup) error { return err } } - for _, td := range cgroup.BlkioThrottleReadBpsDevice { + for _, td := range cgroup.Resources.BlkioThrottleReadBpsDevice { if err := writeFile(path, "blkio.throttle.read_bps_device", td.String()); err != nil { return err } } - for _, td := range cgroup.BlkioThrottleWriteBpsDevice { + for _, td := range cgroup.Resources.BlkioThrottleWriteBpsDevice { if err := writeFile(path, "blkio.throttle.write_bps_device", td.String()); err != nil { return err } } - for _, td := range cgroup.BlkioThrottleReadIOPSDevice { + for _, td := range cgroup.Resources.BlkioThrottleReadIOPSDevice { if err := writeFile(path, "blkio.throttle.read_iops_device", td.String()); err != nil { return err } } - for _, td := range cgroup.BlkioThrottleWriteIOPSDevice { + for _, td := range cgroup.Resources.BlkioThrottleWriteIOPSDevice { if err := writeFile(path, "blkio.throttle.write_iops_device", td.String()); err != nil { return err } diff --git a/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpu.go b/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpu.go index 762a68fe807..a4ef28a60f8 100644 --- a/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpu.go +++ b/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpu.go 
@@ -22,41 +22,36 @@ func (s *CpuGroup) Name() string { func (s *CpuGroup) Apply(d *cgroupData) error { // We always want to join the cpu group, to allow fair cpu scheduling // on a container basis - dir, err := d.join("cpu") + _, err := d.join("cpu") if err != nil && !cgroups.IsNotFound(err) { return err } - - if err := s.Set(dir, d.config); err != nil { - return err - } - return nil } func (s *CpuGroup) Set(path string, cgroup *configs.Cgroup) error { - if cgroup.CpuShares != 0 { - if err := writeFile(path, "cpu.shares", strconv.FormatInt(cgroup.CpuShares, 10)); err != nil { + if cgroup.Resources.CpuShares != 0 { + if err := writeFile(path, "cpu.shares", strconv.FormatInt(cgroup.Resources.CpuShares, 10)); err != nil { return err } } - if cgroup.CpuPeriod != 0 { - if err := writeFile(path, "cpu.cfs_period_us", strconv.FormatInt(cgroup.CpuPeriod, 10)); err != nil { + if cgroup.Resources.CpuPeriod != 0 { + if err := writeFile(path, "cpu.cfs_period_us", strconv.FormatInt(cgroup.Resources.CpuPeriod, 10)); err != nil { return err } } - if cgroup.CpuQuota != 0 { - if err := writeFile(path, "cpu.cfs_quota_us", strconv.FormatInt(cgroup.CpuQuota, 10)); err != nil { + if cgroup.Resources.CpuQuota != 0 { + if err := writeFile(path, "cpu.cfs_quota_us", strconv.FormatInt(cgroup.Resources.CpuQuota, 10)); err != nil { return err } } - if cgroup.CpuRtPeriod != 0 { - if err := writeFile(path, "cpu.rt_period_us", strconv.FormatInt(cgroup.CpuRtPeriod, 10)); err != nil { + if cgroup.Resources.CpuRtPeriod != 0 { + if err := writeFile(path, "cpu.rt_period_us", strconv.FormatInt(cgroup.Resources.CpuRtPeriod, 10)); err != nil { return err } } - if cgroup.CpuRtRuntime != 0 { - if err := writeFile(path, "cpu.rt_runtime_us", strconv.FormatInt(cgroup.CpuRtRuntime, 10)); err != nil { + if cgroup.Resources.CpuRtRuntime != 0 { + if err := writeFile(path, "cpu.rt_runtime_us", strconv.FormatInt(cgroup.Resources.CpuRtRuntime, 10)); err != nil { return err } } diff --git 
a/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpuset.go b/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpuset.go index 088a665b97b..ed1002316e4 100644 --- a/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpuset.go +++ b/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpuset.go @@ -4,6 +4,7 @@ package fs import ( "bytes" + "fmt" "io/ioutil" "os" "path/filepath" @@ -29,13 +30,13 @@ func (s *CpusetGroup) Apply(d *cgroupData) error { } func (s *CpusetGroup) Set(path string, cgroup *configs.Cgroup) error { - if cgroup.CpusetCpus != "" { - if err := writeFile(path, "cpuset.cpus", cgroup.CpusetCpus); err != nil { + if cgroup.Resources.CpusetCpus != "" { + if err := writeFile(path, "cpuset.cpus", cgroup.Resources.CpusetCpus); err != nil { return err } } - if cgroup.CpusetMems != "" { - if err := writeFile(path, "cpuset.mems", cgroup.CpusetMems); err != nil { + if cgroup.Resources.CpusetMems != "" { + if err := writeFile(path, "cpuset.mems", cgroup.Resources.CpusetMems); err != nil { return err } } @@ -63,11 +64,6 @@ func (s *CpusetGroup) ApplyDir(dir string, cgroup *configs.Cgroup, pid int) erro if err := s.ensureParent(dir, root); err != nil { return err } - // the default values inherit from parent cgroup are already set in - // s.ensureParent, cover these if we have our own - if err := s.Set(dir, cgroup); err != nil { - return err - } // because we are not using d.join we need to place the pid into the procs file // unlike the other subsystems if err := writeFile(dir, "cgroup.procs", strconv.Itoa(pid)); err != nil { @@ -95,6 +91,10 @@ func (s *CpusetGroup) ensureParent(current, root string) error { if filepath.Clean(parent) == root { return nil } + // Avoid infinite recursion. 
+ if parent == current { + return fmt.Errorf("cpuset: cgroup parent path outside cgroup root") + } if err := s.ensureParent(parent, root); err != nil { return err } diff --git a/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/devices.go b/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/devices.go index 1e39618a460..a41ce801ffd 100644 --- a/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/devices.go +++ b/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/devices.go @@ -15,27 +15,22 @@ func (s *DevicesGroup) Name() string { } func (s *DevicesGroup) Apply(d *cgroupData) error { - dir, err := d.join("devices") + _, err := d.join("devices") if err != nil { // We will return error even it's `not found` error, devices // cgroup is hard requirement for container's security. return err } - - if err := s.Set(dir, d.config); err != nil { - return err - } - return nil } func (s *DevicesGroup) Set(path string, cgroup *configs.Cgroup) error { - if !cgroup.AllowAllDevices { + if !cgroup.Resources.AllowAllDevices { if err := writeFile(path, "devices.deny", "a"); err != nil { return err } - for _, dev := range cgroup.AllowedDevices { + for _, dev := range cgroup.Resources.AllowedDevices { if err := writeFile(path, "devices.allow", dev.CgroupString()); err != nil { return err } @@ -47,7 +42,7 @@ func (s *DevicesGroup) Set(path string, cgroup *configs.Cgroup) error { return err } - for _, dev := range cgroup.DeniedDevices { + for _, dev := range cgroup.Resources.DeniedDevices { if err := writeFile(path, "devices.deny", dev.CgroupString()); err != nil { return err } diff --git a/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/freezer.go b/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/freezer.go index 8960ec789c9..e70dfe3b950 100644 --- 
a/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/freezer.go +++ b/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/freezer.go @@ -19,22 +19,17 @@ func (s *FreezerGroup) Name() string { } func (s *FreezerGroup) Apply(d *cgroupData) error { - dir, err := d.join("freezer") + _, err := d.join("freezer") if err != nil && !cgroups.IsNotFound(err) { return err } - - if err := s.Set(dir, d.config); err != nil { - return err - } - return nil } func (s *FreezerGroup) Set(path string, cgroup *configs.Cgroup) error { - switch cgroup.Freezer { + switch cgroup.Resources.Freezer { case configs.Frozen, configs.Thawed: - if err := writeFile(path, "freezer.state", string(cgroup.Freezer)); err != nil { + if err := writeFile(path, "freezer.state", string(cgroup.Resources.Freezer)); err != nil { return err } @@ -43,7 +38,7 @@ func (s *FreezerGroup) Set(path string, cgroup *configs.Cgroup) error { if err != nil { return err } - if strings.TrimSpace(state) == string(cgroup.Freezer) { + if strings.TrimSpace(state) == string(cgroup.Resources.Freezer) { break } time.Sleep(1 * time.Millisecond) @@ -51,7 +46,7 @@ func (s *FreezerGroup) Set(path string, cgroup *configs.Cgroup) error { case configs.Undefined: return nil default: - return fmt.Errorf("Invalid argument '%s' to freezer.state", string(cgroup.Freezer)) + return fmt.Errorf("Invalid argument '%s' to freezer.state", string(cgroup.Resources.Freezer)) } return nil diff --git a/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/hugetlb.go b/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/hugetlb.go index b11365370fb..2f9727719d0 100644 --- a/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/hugetlb.go +++ b/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/hugetlb.go @@ -19,20 +19,15 @@ func (s *HugetlbGroup) Name() string { } func (s *HugetlbGroup) Apply(d *cgroupData) error 
{ - dir, err := d.join("hugetlb") + _, err := d.join("hugetlb") if err != nil && !cgroups.IsNotFound(err) { return err } - - if err := s.Set(dir, d.config); err != nil { - return err - } - return nil } func (s *HugetlbGroup) Set(path string, cgroup *configs.Cgroup) error { - for _, hugetlb := range cgroup.HugetlbLimit { + for _, hugetlb := range cgroup.Resources.HugetlbLimit { if err := writeFile(path, strings.Join([]string{"hugetlb", hugetlb.Pagesize, "limit_in_bytes"}, "."), strconv.FormatUint(hugetlb.Limit, 10)); err != nil { return err } diff --git a/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/memory.go b/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/memory.go index e5ffde4b6d0..8c3e963fe01 100644 --- a/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/memory.go +++ b/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/memory.go @@ -32,8 +32,9 @@ func (s *MemoryGroup) Apply(d *cgroupData) (err error) { return err } } - - if err := s.Set(path, d.config); err != nil { + // We have to set kernel memory here, as we can't change it once + // processes have been attached. + if err := s.SetKernelMemory(path, d.config); err != nil { return err } } @@ -50,45 +51,49 @@ func (s *MemoryGroup) Apply(d *cgroupData) (err error) { if err != nil && !cgroups.IsNotFound(err) { return err } + return nil +} +func (s *MemoryGroup) SetKernelMemory(path string, cgroup *configs.Cgroup) error { + // This has to be done separately because it has special constraints (it + // can't be done after there are processes attached to the cgroup). 
+ if cgroup.Resources.KernelMemory > 0 { + if err := writeFile(path, "memory.kmem.limit_in_bytes", strconv.FormatInt(cgroup.Resources.KernelMemory, 10)); err != nil { + return err + } + } return nil } func (s *MemoryGroup) Set(path string, cgroup *configs.Cgroup) error { - if cgroup.Memory != 0 { - if err := writeFile(path, "memory.limit_in_bytes", strconv.FormatInt(cgroup.Memory, 10)); err != nil { + if cgroup.Resources.Memory != 0 { + if err := writeFile(path, "memory.limit_in_bytes", strconv.FormatInt(cgroup.Resources.Memory, 10)); err != nil { return err } } - if cgroup.MemoryReservation != 0 { - if err := writeFile(path, "memory.soft_limit_in_bytes", strconv.FormatInt(cgroup.MemoryReservation, 10)); err != nil { + if cgroup.Resources.MemoryReservation != 0 { + if err := writeFile(path, "memory.soft_limit_in_bytes", strconv.FormatInt(cgroup.Resources.MemoryReservation, 10)); err != nil { return err } } - if cgroup.MemorySwap > 0 { - if err := writeFile(path, "memory.memsw.limit_in_bytes", strconv.FormatInt(cgroup.MemorySwap, 10)); err != nil { + if cgroup.Resources.MemorySwap > 0 { + if err := writeFile(path, "memory.memsw.limit_in_bytes", strconv.FormatInt(cgroup.Resources.MemorySwap, 10)); err != nil { return err } } - if cgroup.KernelMemory > 0 { - if err := writeFile(path, "memory.kmem.limit_in_bytes", strconv.FormatInt(cgroup.KernelMemory, 10)); err != nil { - return err - } - } - - if cgroup.OomKillDisable { + if cgroup.Resources.OomKillDisable { if err := writeFile(path, "memory.oom_control", "1"); err != nil { return err } } - if cgroup.MemorySwappiness >= 0 && cgroup.MemorySwappiness <= 100 { - if err := writeFile(path, "memory.swappiness", strconv.FormatInt(cgroup.MemorySwappiness, 10)); err != nil { + if cgroup.Resources.MemorySwappiness >= 0 && cgroup.Resources.MemorySwappiness <= 100 { + if err := writeFile(path, "memory.swappiness", strconv.FormatInt(cgroup.Resources.MemorySwappiness, 10)); err != nil { return err } - } else if 
cgroup.MemorySwappiness == -1 { + } else if cgroup.Resources.MemorySwappiness == -1 { return nil } else { - return fmt.Errorf("invalid value:%d. valid memory swappiness range is 0-100", cgroup.MemorySwappiness) + return fmt.Errorf("invalid value:%d. valid memory swappiness range is 0-100", cgroup.Resources.MemorySwappiness) } return nil @@ -139,12 +144,12 @@ func (s *MemoryGroup) GetStats(path string, stats *cgroups.Stats) error { } func memoryAssigned(cgroup *configs.Cgroup) bool { - return cgroup.Memory != 0 || - cgroup.MemoryReservation != 0 || - cgroup.MemorySwap > 0 || - cgroup.KernelMemory > 0 || - cgroup.OomKillDisable || - cgroup.MemorySwappiness != -1 + return cgroup.Resources.Memory != 0 || + cgroup.Resources.MemoryReservation != 0 || + cgroup.Resources.MemorySwap > 0 || + cgroup.Resources.KernelMemory > 0 || + cgroup.Resources.OomKillDisable || + cgroup.Resources.MemorySwappiness != -1 } func getMemoryData(path, name string) (cgroups.MemoryData, error) { diff --git a/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/net_cls.go b/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/net_cls.go index b09a1760e47..8a4054ba877 100644 --- a/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/net_cls.go +++ b/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/net_cls.go @@ -15,21 +15,16 @@ func (s *NetClsGroup) Name() string { } func (s *NetClsGroup) Apply(d *cgroupData) error { - dir, err := d.join("net_cls") + _, err := d.join("net_cls") if err != nil && !cgroups.IsNotFound(err) { return err } - - if err := s.Set(dir, d.config); err != nil { - return err - } - return nil } func (s *NetClsGroup) Set(path string, cgroup *configs.Cgroup) error { - if cgroup.NetClsClassid != "" { - if err := writeFile(path, "net_cls.classid", cgroup.NetClsClassid); err != nil { + if cgroup.Resources.NetClsClassid != "" { + if err := writeFile(path, "net_cls.classid", 
cgroup.Resources.NetClsClassid); err != nil { return err } } diff --git a/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/net_prio.go b/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/net_prio.go index 59117cad91b..d0ab2af894f 100644 --- a/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/net_prio.go +++ b/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/net_prio.go @@ -15,20 +15,15 @@ func (s *NetPrioGroup) Name() string { } func (s *NetPrioGroup) Apply(d *cgroupData) error { - dir, err := d.join("net_prio") + _, err := d.join("net_prio") if err != nil && !cgroups.IsNotFound(err) { return err } - - if err := s.Set(dir, d.config); err != nil { - return err - } - return nil } func (s *NetPrioGroup) Set(path string, cgroup *configs.Cgroup) error { - for _, prioMap := range cgroup.NetPrioIfpriomap { + for _, prioMap := range cgroup.Resources.NetPrioIfpriomap { if err := writeFile(path, "net_prio.ifpriomap", prioMap.CgroupString()); err != nil { return err } diff --git a/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/pids.go b/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/pids.go new file mode 100644 index 00000000000..96cbb896cb1 --- /dev/null +++ b/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/cgroups/fs/pids.go @@ -0,0 +1,57 @@ +// +build linux + +package fs + +import ( + "fmt" + "strconv" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/configs" +) + +type PidsGroup struct { +} + +func (s *PidsGroup) Name() string { + return "pids" +} + +func (s *PidsGroup) Apply(d *cgroupData) error { + _, err := d.join("pids") + if err != nil && !cgroups.IsNotFound(err) { + return err + } + return nil +} + +func (s *PidsGroup) Set(path string, cgroup *configs.Cgroup) error { + if cgroup.Resources.PidsLimit != 0 { + // "max" is the 
fallback value. + limit := "max" + + if cgroup.Resources.PidsLimit > 0 { + limit = strconv.FormatInt(cgroup.Resources.PidsLimit, 10) + } + + if err := writeFile(path, "pids.max", limit); err != nil { + return err + } + } + + return nil +} + +func (s *PidsGroup) Remove(d *cgroupData) error { + return removePath(d.path("pids")) +} + +func (s *PidsGroup) GetStats(path string, stats *cgroups.Stats) error { + value, err := getCgroupParamUint(path, "pids.current") + if err != nil { + return fmt.Errorf("failed to parse pids.current - %s", err) + } + + stats.PidsStats.Current = value + return nil +} diff --git a/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/cgroups/stats.go b/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/cgroups/stats.go index bda32b20c3b..74c65abf13b 100644 --- a/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/cgroups/stats.go +++ b/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/cgroups/stats.go @@ -49,6 +49,11 @@ type MemoryStats struct { Stats map[string]uint64 `json:"stats,omitempty"` } +type PidsStats struct { + // number of pids in the cgroup + Current uint64 `json:"current,omitempty"` +} + type BlkioStatEntry struct { Major uint64 `json:"major,omitempty"` Minor uint64 `json:"minor,omitempty"` @@ -80,6 +85,7 @@ type HugetlbStats struct { type Stats struct { CpuStats CpuStats `json:"cpu_stats,omitempty"` MemoryStats MemoryStats `json:"memory_stats,omitempty"` + PidsStats PidsStats `json:"pids_stats,omitempty"` BlkioStats BlkioStats `json:"blkio_stats,omitempty"` // the map is in the format "size of hugepage: stats of the hugepage" HugetlbStats map[string]HugetlbStats `json:"hugetlb_stats,omitempty"` diff --git a/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/cgroups/systemd/apply_nosystemd.go b/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/cgroups/systemd/apply_nosystemd.go index fa3485f1c0c..7de9ae6050b 100644 --- 
a/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/cgroups/systemd/apply_nosystemd.go +++ b/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/cgroups/systemd/apply_nosystemd.go @@ -26,6 +26,10 @@ func (m *Manager) GetPids() ([]int, error) { return nil, fmt.Errorf("Systemd not supported") } +func (m *Manager) GetAllPids() ([]int, error) { + return nil, fmt.Errorf("Systemd not supported") +} + func (m *Manager) Destroy() error { return fmt.Errorf("Systemd not supported") } diff --git a/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/cgroups/systemd/apply_systemd.go b/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/cgroups/systemd/apply_systemd.go index 7a422b3c794..db020a971fe 100644 --- a/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/cgroups/systemd/apply_systemd.go +++ b/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/cgroups/systemd/apply_systemd.go @@ -55,6 +55,7 @@ var subsystems = subsystemSet{ &fs.MemoryGroup{}, &fs.CpuGroup{}, &fs.CpuacctGroup{}, + &fs.PidsGroup{}, &fs.BlkioGroup{}, &fs.HugetlbGroup{}, &fs.PerfEventGroup{}, @@ -167,8 +168,25 @@ func (m *Manager) Apply(pid int) error { properties []systemdDbus.Property ) - if c.Slice != "" { - slice = c.Slice + if c.Paths != nil { + paths := make(map[string]string) + for name, path := range c.Paths { + _, err := getSubsystemPath(m.Cgroups, name) + if err != nil { + // Don't fail if a cgroup hierarchy was not found, just skip this subsystem + if cgroups.IsNotFound(err) { + continue + } + return err + } + paths[name] = path + } + m.Paths = paths + return cgroups.EnterPid(m.Paths, pid) + } + + if c.Parent != "" { + slice = c.Parent } properties = append(properties, @@ -189,26 +207,26 @@ func (m *Manager) Apply(pid int) error { newProp("DefaultDependencies", false)) } - if c.Memory != 0 { + if c.Resources.Memory != 0 { properties = append(properties, - newProp("MemoryLimit", uint64(c.Memory))) + 
newProp("MemoryLimit", uint64(c.Resources.Memory))) } - if c.CpuShares != 0 { + if c.Resources.CpuShares != 0 { properties = append(properties, - newProp("CPUShares", uint64(c.CpuShares))) + newProp("CPUShares", uint64(c.Resources.CpuShares))) } - if c.BlkioWeight != 0 { + if c.Resources.BlkioWeight != 0 { properties = append(properties, - newProp("BlockIOWeight", uint64(c.BlkioWeight))) + newProp("BlockIOWeight", uint64(c.Resources.BlkioWeight))) } // We need to set kernel memory before processes join cgroup because // kmem.limit_in_bytes can only be set when the cgroup is empty. // And swap memory limit needs to be set after memory limit, only // memory limit is handled by systemd, so it's kind of ugly here. - if c.KernelMemory > 0 { + if c.Resources.KernelMemory > 0 { if err := setKernelMemory(c); err != nil { return err } @@ -233,7 +251,7 @@ func (m *Manager) Apply(pid int) error { return err } - // we need to manually join the freezer, net_cls, net_prio and cpuset cgroup in systemd + // we need to manually join the freezer, net_cls, net_prio, pids and cpuset cgroup in systemd // because it does not currently support it via the dbus api. 
if err := joinFreezer(c, pid); err != nil { return err @@ -246,6 +264,10 @@ func (m *Manager) Apply(pid int) error { return err } + if err := joinPids(c, pid); err != nil { + return err + } + if err := joinCpuset(c, pid); err != nil { return err } @@ -277,17 +299,13 @@ func (m *Manager) Apply(pid int) error { paths[s.Name()] = subsystemPath } m.Paths = paths - - if paths["cpu"] != "" { - if err := fs.CheckCpushares(paths["cpu"], c.CpuShares); err != nil { - return err - } - } - return nil } func (m *Manager) Destroy() error { + if m.Cgroups.Paths != nil { + return nil + } m.mu.Lock() defer m.mu.Unlock() theConn.StopUnit(getUnitName(m.Cgroups), "replace", nil) @@ -330,68 +348,65 @@ func join(c *configs.Cgroup, subsystem string, pid int) (string, error) { } func joinCpu(c *configs.Cgroup, pid int) error { - path, err := getSubsystemPath(c, "cpu") + _, err := join(c, "cpu", pid) if err != nil && !cgroups.IsNotFound(err) { return err } - if c.CpuQuota != 0 { - if err = writeFile(path, "cpu.cfs_quota_us", strconv.FormatInt(c.CpuQuota, 10)); err != nil { - return err - } - } - if c.CpuPeriod != 0 { - if err = writeFile(path, "cpu.cfs_period_us", strconv.FormatInt(c.CpuPeriod, 10)); err != nil { - return err - } - } - if c.CpuRtPeriod != 0 { - if err = writeFile(path, "cpu.rt_period_us", strconv.FormatInt(c.CpuRtPeriod, 10)); err != nil { - return err - } - } - if c.CpuRtRuntime != 0 { - if err = writeFile(path, "cpu.rt_runtime_us", strconv.FormatInt(c.CpuRtRuntime, 10)); err != nil { - return err - } - } - return nil } func joinFreezer(c *configs.Cgroup, pid int) error { - path, err := join(c, "freezer", pid) + _, err := join(c, "freezer", pid) if err != nil && !cgroups.IsNotFound(err) { return err } - freezer, err := subsystems.Get("freezer") - if err != nil { - return err - } - return freezer.Set(path, c) + return nil } func joinNetPrio(c *configs.Cgroup, pid int) error { - path, err := join(c, "net_prio", pid) + _, err := join(c, "net_prio", pid) if err != nil && 
!cgroups.IsNotFound(err) { return err } - netPrio, err := subsystems.Get("net_prio") - if err != nil { - return err - } - return netPrio.Set(path, c) + return nil } func joinNetCls(c *configs.Cgroup, pid int) error { - path, err := join(c, "net_cls", pid) + _, err := join(c, "net_cls", pid) if err != nil && !cgroups.IsNotFound(err) { return err } - netcls, err := subsystems.Get("net_cls") - if err != nil { + return nil +} + +func joinPids(c *configs.Cgroup, pid int) error { + _, err := join(c, "pids", pid) + if err != nil && !cgroups.IsNotFound(err) { return err } - return netcls.Set(path, c) + return nil +} + +// systemd represents slice heirarchy using `-`, so we need to follow suit when +// generating the path of slice. Essentially, test-a-b.slice becomes +// test.slice/test-a.slice/test-a-b.slice. +func expandSlice(slice string) (string, error) { + suffix := ".slice" + sliceName := strings.TrimSuffix(slice, suffix) + + var path, prefix string + for _, component := range strings.Split(sliceName, "-") { + // test--a.slice isn't permitted, nor is -test.slice. + if component == "" { + return "", fmt.Errorf("invalid slice name: %s", slice) + } + + // Append the component to the path and to the prefix. 
+ path += prefix + component + suffix + "/" + prefix += component + "-" + } + + return path, nil } func getSubsystemPath(c *configs.Cgroup, subsystem string) (string, error) { @@ -406,8 +421,13 @@ func getSubsystemPath(c *configs.Cgroup, subsystem string) (string, error) { } slice := "system.slice" - if c.Slice != "" { - slice = c.Slice + if c.Parent != "" { + slice = c.Parent + } + + slice, err = expandSlice(slice) + if err != nil { + return "", err } return filepath.Join(mountpoint, initPath, slice, getUnitName(c)), nil @@ -418,15 +438,15 @@ func (m *Manager) Freeze(state configs.FreezerState) error { if err != nil { return err } - prevState := m.Cgroups.Freezer - m.Cgroups.Freezer = state + prevState := m.Cgroups.Resources.Freezer + m.Cgroups.Resources.Freezer = state freezer, err := subsystems.Get("freezer") if err != nil { return err } err = freezer.Set(path, m.Cgroups) if err != nil { - m.Cgroups.Freezer = prevState + m.Cgroups.Resources.Freezer = prevState return err } return nil @@ -440,6 +460,14 @@ func (m *Manager) GetPids() ([]int, error) { return cgroups.GetPids(path) } +func (m *Manager) GetAllPids() ([]int, error) { + path, err := getSubsystemPath(m.Cgroups, "devices") + if err != nil { + return nil, err + } + return cgroups.GetAllPids(path) +} + func (m *Manager) GetStats() (*cgroups.Stats, error) { m.mu.Lock() defer m.mu.Unlock() @@ -458,21 +486,28 @@ func (m *Manager) GetStats() (*cgroups.Stats, error) { } func (m *Manager) Set(container *configs.Config) error { - for name, path := range m.Paths { - sys, err := subsystems.Get(name) - if err == errSubsystemDoesNotExist || !cgroups.PathExists(path) { - continue + for _, sys := range subsystems { + // Get the subsystem path, but don't error out for not found cgroups. 
+ path, err := getSubsystemPath(container.Cgroups, sys.Name()) + if err != nil && !cgroups.IsNotFound(err) { + return err } + if err := sys.Set(path, container.Cgroups); err != nil { return err } } + if m.Paths["cpu"] != "" { + if err := fs.CheckCpushares(m.Paths["cpu"], container.Cgroups.Resources.CpuShares); err != nil { + return err + } + } return nil } func getUnitName(c *configs.Cgroup) string { - return fmt.Sprintf("%s-%s.scope", c.Parent, c.Name) + return fmt.Sprintf("%s-%s.scope", c.ScopePrefix, c.Name) } // Atm we can't use the systemd device support because of two missing things: @@ -487,17 +522,13 @@ func getUnitName(c *configs.Cgroup) string { // because systemd will re-write the device settings if it needs to re-apply the cgroup context. // This happens at least for v208 when any sibling unit is started. func joinDevices(c *configs.Cgroup, pid int) error { - path, err := join(c, "devices", pid) + _, err := join(c, "devices", pid) // Even if it's `not found` error, we'll return err because devices cgroup // is hard requirement for container security. if err != nil { return err } - devices, err := subsystems.Get("devices") - if err != nil { - return err - } - return devices.Set(path, c) + return nil } func setKernelMemory(c *configs.Cgroup) error { @@ -510,52 +541,16 @@ func setKernelMemory(c *configs.Cgroup) error { return err } - if c.KernelMemory > 0 { - err = writeFile(path, "memory.kmem.limit_in_bytes", strconv.FormatInt(c.KernelMemory, 10)) - if err != nil { - return err - } - } - - return nil + // This doesn't get called by manager.Set, so we need to do it here. 
+ s := &fs.MemoryGroup{} + return s.SetKernelMemory(path, c) } func joinMemory(c *configs.Cgroup, pid int) error { - path, err := getSubsystemPath(c, "memory") + _, err := join(c, "memory", pid) if err != nil && !cgroups.IsNotFound(err) { return err } - - // -1 disables memoryswap - if c.MemorySwap > 0 { - err = writeFile(path, "memory.memsw.limit_in_bytes", strconv.FormatInt(c.MemorySwap, 10)) - if err != nil { - return err - } - } - if c.MemoryReservation > 0 { - err = writeFile(path, "memory.soft_limit_in_bytes", strconv.FormatInt(c.MemoryReservation, 10)) - if err != nil { - return err - } - } - if c.OomKillDisable { - if err := writeFile(path, "memory.oom_control", "1"); err != nil { - return err - } - } - - if c.MemorySwappiness >= 0 && c.MemorySwappiness <= 100 { - err = writeFile(path, "memory.swappiness", strconv.FormatInt(c.MemorySwappiness, 10)) - if err != nil { - return err - } - } else if c.MemorySwappiness == -1 { - return nil - } else { - return fmt.Errorf("invalid value:%d. valid memory swappiness range is 0-100", c.MemorySwappiness) - } - return nil } @@ -577,68 +572,25 @@ func joinCpuset(c *configs.Cgroup, pid int) error { // expects device path instead of major minor numbers, which is also confusing // for users. So we use fs work around for now. 
func joinBlkio(c *configs.Cgroup, pid int) error { - path, err := getSubsystemPath(c, "blkio") + _, err := join(c, "blkio", pid) if err != nil { return err } - // systemd doesn't directly support this in the dbus properties - if c.BlkioLeafWeight != 0 { - if err := writeFile(path, "blkio.leaf_weight", strconv.FormatUint(uint64(c.BlkioLeafWeight), 10)); err != nil { - return err - } - } - for _, wd := range c.BlkioWeightDevice { - if err := writeFile(path, "blkio.weight_device", wd.WeightString()); err != nil { - return err - } - if err := writeFile(path, "blkio.leaf_weight_device", wd.LeafWeightString()); err != nil { - return err - } - } - for _, td := range c.BlkioThrottleReadBpsDevice { - if err := writeFile(path, "blkio.throttle.read_bps_device", td.String()); err != nil { - return err - } - } - for _, td := range c.BlkioThrottleWriteBpsDevice { - if err := writeFile(path, "blkio.throttle.write_bps_device", td.String()); err != nil { - return err - } - } - for _, td := range c.BlkioThrottleReadIOPSDevice { - if err := writeFile(path, "blkio.throttle.read_iops_device", td.String()); err != nil { - return err - } - } - for _, td := range c.BlkioThrottleWriteIOPSDevice { - if err := writeFile(path, "blkio.throttle.write_iops_device", td.String()); err != nil { - return err - } - } - return nil } func joinHugetlb(c *configs.Cgroup, pid int) error { - path, err := join(c, "hugetlb", pid) + _, err := join(c, "hugetlb", pid) if err != nil && !cgroups.IsNotFound(err) { return err } - hugetlb, err := subsystems.Get("hugetlb") - if err != nil { - return err - } - return hugetlb.Set(path, c) + return nil } func joinPerfEvent(c *configs.Cgroup, pid int) error { - path, err := join(c, "perf_event", pid) + _, err := join(c, "perf_event", pid) if err != nil && !cgroups.IsNotFound(err) { return err } - perfEvent, err := subsystems.Get("perf_event") - if err != nil { - return err - } - return perfEvent.Set(path, c) + return nil } diff --git 
a/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/cgroups/utils.go b/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/cgroups/utils.go index d00e0e2ea58..88620aaee93 100644 --- a/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/cgroups/utils.go +++ b/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/cgroups/utils.go @@ -13,7 +13,7 @@ import ( "time" "github.com/docker/docker/pkg/mount" - "github.com/docker/docker/pkg/units" + "github.com/docker/go-units" ) const cgroupNamePrefix = "name=" @@ -84,10 +84,19 @@ func FindCgroupMountpointDir() (string, error) { // Safe as mountinfo encodes mountpoints with spaces as \040. index := strings.Index(text, " - ") postSeparatorFields := strings.Fields(text[index+3:]) - if len(postSeparatorFields) < 3 { - return "", fmt.Errorf("Error found less than 3 fields post '-' in %q", text) + numPostFields := len(postSeparatorFields) + + // This is an error as we can't detect if the mount is for "cgroup" + if numPostFields == 0 { + return "", fmt.Errorf("Found no fields post '-' in %q", text) } + if postSeparatorFields[0] == "cgroup" { + // Check that the mount is properly formated. + if numPostFields < 3 { + return "", fmt.Errorf("Error found less than 3 fields post '-' in %q", text) + } + return filepath.Dir(fields[4]), nil } } @@ -323,9 +332,14 @@ func GetHugePageSize() ([]string, error) { return pageSizes, nil } -// GetPids returns all pids, that were added to cgroup at path and to all its -// subcgroups. +// GetPids returns all pids, that were added to cgroup at path. func GetPids(path string) ([]int, error) { + return readProcsFile(path) +} + +// GetAllPids returns all pids, that were added to cgroup at path and to all its +// subcgroups. 
+func GetAllPids(path string) ([]int, error) { var pids []int // collect pids from all sub-cgroups err := filepath.Walk(path, func(p string, info os.FileInfo, iErr error) error { diff --git a/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/configs/cgroup_unix.go b/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/configs/cgroup_unix.go index 24f93c1ad6e..c186d289aca 100644 --- a/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/configs/cgroup_unix.go +++ b/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/configs/cgroup_unix.go @@ -16,6 +16,17 @@ type Cgroup struct { // name of parent cgroup or slice Parent string `json:"parent"` + // ScopePrefix decribes prefix for the scope name + ScopePrefix string `json:"scope_prefix"` + + // Paths represent the cgroups paths to join + Paths map[string]string + + // Resources contains various cgroups settings to apply + *Resources +} + +type Resources struct { // If this is true allow access to any kind of device within the container. If false, allow access only to devices explicitly listed in the allowed_devices list. AllowAllDevices bool `json:"allow_all_devices"` @@ -29,7 +40,7 @@ type Cgroup struct { // Memory reservation or soft_limit (in bytes) MemoryReservation int64 `json:"memory_reservation"` - // Total memory usage (memory + swap); set `-1' to disable swap + // Total memory usage (memory + swap); set `-1` to enable unlimited swap MemorySwap int64 `json:"memory_swap"` // Kernel memory limit (in bytes) @@ -56,6 +67,9 @@ type Cgroup struct { // MEM to use CpusetMems string `json:"cpuset_mems"` + // Process limit; set <= `0' to disable limit. + PidsLimit int64 `json:"pids_limit"` + // Specifies per cgroup weight, range is from 10 to 1000. 
BlkioWeight uint16 `json:"blkio_weight"` @@ -83,9 +97,6 @@ type Cgroup struct { // Hugetlb limit (in bytes) HugetlbLimit []*HugepageLimit `json:"hugetlb_limit"` - // Parent slice to use for systemd TODO: remove in favor or parent - Slice string `json:"slice"` - // Whether to disable OOM Killer OomKillDisable bool `json:"oom_kill_disable"` diff --git a/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/configs/cgroup_unsupported.go b/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/configs/cgroup_unsupported.go new file mode 100644 index 00000000000..95e2830a436 --- /dev/null +++ b/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/configs/cgroup_unsupported.go @@ -0,0 +1,6 @@ +// +build !windows,!linux,!freebsd + +package configs + +type Cgroup struct { +} diff --git a/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/configs/device_defaults.go b/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/configs/device_defaults.go index 0ce040fd342..e45299264c8 100644 --- a/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/configs/device_defaults.go +++ b/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/configs/device_defaults.go @@ -82,20 +82,6 @@ var ( Minor: 1, Permissions: "rwm", }, - { - Path: "/dev/tty0", - Type: 'c', - Major: 4, - Minor: 0, - Permissions: "rwm", - }, - { - Path: "/dev/tty1", - Type: 'c', - Major: 4, - Minor: 1, - Permissions: "rwm", - }, // /dev/pts/ - pts namespaces are "coming soon" { Path: "", diff --git a/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/console_freebsd.go b/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/console_freebsd.go index 4d20b8da406..3c89eda0799 100644 --- a/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/console_freebsd.go +++ b/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/console_freebsd.go @@ -6,8 +6,8 @@ import ( "errors" ) -// 
newConsole returns an initalized console that can be used within a container by copying bytes +// NewConsole returns an initalized console that can be used within a container by copying bytes // from the master side to the slave that is attached as the tty for the container's init process. -func newConsole(uid, gid int) (Console, error) { +func NewConsole(uid, gid int) (Console, error) { return nil, errors.New("libcontainer console is not supported on FreeBSD") } diff --git a/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/console_linux.go b/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/console_linux.go index f345f572b3f..7af771b65e3 100644 --- a/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/console_linux.go +++ b/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/console_linux.go @@ -10,9 +10,9 @@ import ( "github.com/opencontainers/runc/libcontainer/label" ) -// newConsole returns an initalized console that can be used within a container by copying bytes +// NewConsole returns an initalized console that can be used within a container by copying bytes // from the master side to the slave that is attached as the tty for the container's init process. 
-func newConsole(uid, gid int) (Console, error) { +func NewConsole(uid, gid int) (Console, error) { master, err := os.OpenFile("/dev/ptmx", syscall.O_RDWR|syscall.O_NOCTTY|syscall.O_CLOEXEC, 0) if err != nil { return nil, err diff --git a/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/console_windows.go b/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/console_windows.go index 80c7463bc41..a68c02f66b4 100644 --- a/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/console_windows.go +++ b/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/console_windows.go @@ -1,7 +1,7 @@ package libcontainer -// newConsole returns an initalized console that can be used within a container -func newConsole(uid, gid int) (Console, error) { +// NewConsole returns an initalized console that can be used within a container +func NewConsole(uid, gid int) (Console, error) { return &windowsConsole{}, nil } diff --git a/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/container.go b/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/container.go index 6292fd1852d..03c8c559322 100644 --- a/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/container.go +++ b/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/container.go @@ -14,8 +14,11 @@ import ( type Status int const ( + // The container exists but has not been run yet + Created Status = iota + // The container exists and is running. - Running Status = iota + 1 + Running // The container exists, it is in the process of being paused. 
Pausing @@ -30,6 +33,25 @@ const ( Destroyed ) +func (s Status) String() string { + switch s { + case Created: + return "created" + case Running: + return "running" + case Pausing: + return "pausing" + case Paused: + return "paused" + case Checkpointed: + return "checkpointed" + case Destroyed: + return "destroyed" + default: + return "unknown" + } +} + // BaseState represents the platform agnostic pieces relating to a // running container's state type BaseState struct { diff --git a/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/container_linux.go b/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/container_linux.go index 912673a34b5..4015c957628 100644 --- a/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/container_linux.go +++ b/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/container_linux.go @@ -3,8 +3,10 @@ package libcontainer import ( + "bytes" "encoding/json" "fmt" + "io" "io/ioutil" "os" "os/exec" @@ -19,6 +21,8 @@ import ( "github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runc/libcontainer/criurpc" + "github.com/opencontainers/runc/libcontainer/utils" + "github.com/vishvananda/netlink/nl" ) const stdioFdCount = 3 @@ -34,6 +38,7 @@ type linuxContainer struct { criuPath string m sync.Mutex criuVersion int + state containerState } // State represents a running container's state @@ -100,6 +105,12 @@ type Container interface { // errors: // Systemerror - System error. NotifyOOM() (<-chan struct{}, error) + + // NotifyMemoryPressure returns a read-only channel signaling when the container reaches a given pressure level + // + // errors: + // Systemerror - System error. 
+ NotifyMemoryPressure(level PressureLevel) (<-chan struct{}, error) } // ID returns the container's unique ID @@ -125,7 +136,7 @@ func (c *linuxContainer) State() (*State, error) { } func (c *linuxContainer) Processes() ([]int, error) { - pids, err := c.cgroupManager.GetPids() + pids, err := c.cgroupManager.GetAllPids() if err != nil { return nil, newSystemError(err) } @@ -179,22 +190,27 @@ func (c *linuxContainer) Start(process *Process) error { } return newSystemError(err) } - if doInit { - c.updateState(parent) + c.state = &runningState{ + c: c, } - if c.config.Hooks != nil { - s := configs.HookState{ - Version: c.config.Version, - ID: c.id, - Pid: parent.pid(), - Root: c.config.Rootfs, + if doInit { + if err := c.updateState(parent); err != nil { + return err } - for _, hook := range c.config.Hooks.Poststart { - if err := hook.Run(s); err != nil { - if err := parent.terminate(); err != nil { - logrus.Warn(err) + if c.config.Hooks != nil { + s := configs.HookState{ + Version: c.config.Version, + ID: c.id, + Pid: parent.pid(), + Root: c.config.Rootfs, + } + for _, hook := range c.config.Hooks.Poststart { + if err := hook.Run(s); err != nil { + if err := parent.terminate(); err != nil { + logrus.Warn(err) + } + return newSystemError(err) } - return newSystemError(err) } } } @@ -218,7 +234,7 @@ func (c *linuxContainer) newParentProcess(p *Process, doInit bool) (parentProces return nil, newSystemError(err) } if !doInit { - return c.newSetnsProcess(p, cmd, parentPipe, childPipe), nil + return c.newSetnsProcess(p, cmd, parentPipe, childPipe) } return c.newInitProcess(p, cmd, parentPipe, childPipe) } @@ -247,7 +263,7 @@ func (c *linuxContainer) commandTemplate(p *Process, childPipe *os.File) (*exec. 
} func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe *os.File) (*initProcess, error) { - t := "_LIBCONTAINER_INITTYPE=standard" + t := "_LIBCONTAINER_INITTYPE=" + string(initStandard) cloneFlags := c.config.Namespaces.CloneFlags() if cloneFlags&syscall.CLONE_NEWUSER != 0 { if err := c.addUidGidMappings(cmd.SysProcAttr); err != nil { @@ -273,23 +289,24 @@ func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, c }, nil } -func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe *os.File) *setnsProcess { - cmd.Env = append(cmd.Env, - fmt.Sprintf("_LIBCONTAINER_INITPID=%d", c.initProcess.pid()), - "_LIBCONTAINER_INITTYPE=setns", - ) - if p.consolePath != "" { - cmd.Env = append(cmd.Env, "_LIBCONTAINER_CONSOLE_PATH="+p.consolePath) +func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe *os.File) (*setnsProcess, error) { + cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initSetns)) + // for setns process, we dont have to set cloneflags as the process namespaces + // will only be set via setns syscall + data, err := c.bootstrapData(0, c.initProcess.pid(), p.consolePath) + if err != nil { + return nil, err } // TODO: set on container for process management return &setnsProcess{ - cmd: cmd, - cgroupPaths: c.cgroupManager.GetPaths(), - childPipe: childPipe, - parentPipe: parentPipe, - config: c.newInitConfig(p), - process: p, - } + cmd: cmd, + cgroupPaths: c.cgroupManager.GetPaths(), + childPipe: childPipe, + parentPipe: parentPipe, + config: c.newInitConfig(p), + process: p, + bootstrapData: data, + }, nil } func (c *linuxContainer) newInitConfig(process *Process) *initConfig { @@ -316,54 +333,53 @@ func newPipe() (parent *os.File, child *os.File, err error) { func (c *linuxContainer) Destroy() error { c.m.Lock() defer c.m.Unlock() - status, err := c.currentStatus() - if err != nil { - return err - } - if status != Destroyed { - 
return newGenericError(fmt.Errorf("container is not destroyed"), ContainerNotStopped) - } - if !c.config.Namespaces.Contains(configs.NEWPID) { - if err := killCgroupProcesses(c.cgroupManager); err != nil { - logrus.Warn(err) - } - } - err = c.cgroupManager.Destroy() - if rerr := os.RemoveAll(c.root); err == nil { - err = rerr - } - c.initProcess = nil - if c.config.Hooks != nil { - s := configs.HookState{ - Version: c.config.Version, - ID: c.id, - Root: c.config.Rootfs, - } - for _, hook := range c.config.Hooks.Poststop { - if err := hook.Run(s); err != nil { - return err - } - } - } - return err + return c.state.destroy() } func (c *linuxContainer) Pause() error { c.m.Lock() defer c.m.Unlock() - return c.cgroupManager.Freeze(configs.Frozen) + status, err := c.currentStatus() + if err != nil { + return err + } + if status != Running { + return newGenericError(fmt.Errorf("container not running"), ContainerNotRunning) + } + if err := c.cgroupManager.Freeze(configs.Frozen); err != nil { + return err + } + return c.state.transition(&pausedState{ + c: c, + }) } func (c *linuxContainer) Resume() error { c.m.Lock() defer c.m.Unlock() - return c.cgroupManager.Freeze(configs.Thawed) + status, err := c.currentStatus() + if err != nil { + return err + } + if status != Paused { + return newGenericError(fmt.Errorf("container not paused"), ContainerNotPaused) + } + if err := c.cgroupManager.Freeze(configs.Thawed); err != nil { + return err + } + return c.state.transition(&runningState{ + c: c, + }) } func (c *linuxContainer) NotifyOOM() (<-chan struct{}, error) { return notifyOnOOM(c.cgroupManager.GetPaths()) } +func (c *linuxContainer) NotifyMemoryPressure(level PressureLevel) (<-chan struct{}, error) { + return notifyMemoryPressure(c.cgroupManager.GetPaths(), level) +} + // XXX debug support, remove when debugging done. 
func addArgsFromEnv(evar string, args *[]string) { if e := os.Getenv(evar); e != "" { @@ -455,7 +471,7 @@ func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error { } if criuOpts.ImagesDirectory == "" { - criuOpts.ImagesDirectory = filepath.Join(c.root, "criu.image") + return fmt.Errorf("invalid directory to save checkpoint") } // Since a container can be C/R'ed multiple times, @@ -574,11 +590,9 @@ func (c *linuxContainer) addCriuRestoreMount(req *criurpc.CriuReq, m *configs.Mo func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error { c.m.Lock() defer c.m.Unlock() - if err := c.checkCriuVersion("1.5.2"); err != nil { return err } - if criuOpts.WorkDirectory == "" { criuOpts.WorkDirectory = filepath.Join(c.root, "criu.work") } @@ -587,22 +601,19 @@ func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error { if err := os.Mkdir(criuOpts.WorkDirectory, 0655); err != nil && !os.IsExist(err) { return err } - workDir, err := os.Open(criuOpts.WorkDirectory) if err != nil { return err } defer workDir.Close() - if criuOpts.ImagesDirectory == "" { - criuOpts.ImagesDirectory = filepath.Join(c.root, "criu.image") + return fmt.Errorf("invalid directory to restore checkpoint") } imageDir, err := os.Open(criuOpts.ImagesDirectory) if err != nil { return err } defer imageDir.Close() - // CRIU has a few requirements for a root directory: // * it must be a mount point // * its parent must not be overmounted @@ -613,18 +624,15 @@ func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error { return err } defer os.Remove(root) - root, err = filepath.EvalSymlinks(root) if err != nil { return err } - err = syscall.Mount(c.config.Rootfs, root, "", syscall.MS_BIND|syscall.MS_REC, "") if err != nil { return err } defer syscall.Unmount(root, syscall.MNT_DETACH) - t := criurpc.CriuReqType_RESTORE req := &criurpc.CriuReq{ Type: &t, @@ -692,15 +700,13 @@ func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error { fds 
[]string fdJSON []byte ) - if fdJSON, err = ioutil.ReadFile(filepath.Join(criuOpts.ImagesDirectory, descriptorsFilename)); err != nil { return err } - if err = json.Unmarshal(fdJSON, &fds); err != nil { + if err := json.Unmarshal(fdJSON, &fds); err != nil { return err } - for i := range fds { if s := fds[i]; strings.Contains(s, "pipe:") { inheritFd := new(criurpc.InheritFd) @@ -709,12 +715,7 @@ func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error { req.Opts.InheritFd = append(req.Opts.InheritFd, inheritFd) } } - - err = c.criuSwrk(process, req, criuOpts, true) - if err != nil { - return err - } - return nil + return c.criuSwrk(process, req, criuOpts, true) } func (c *linuxContainer) criuApplyCgroups(pid int, req *criurpc.CriuReq) error { @@ -909,46 +910,43 @@ func (c *linuxContainer) criuNotifications(resp *criurpc.CriuResp, process *Proc if notify == nil { return fmt.Errorf("invalid response: %s", resp.String()) } - switch { case notify.GetScript() == "post-dump": - if !opts.LeaveRunning { - f, err := os.Create(filepath.Join(c.root, "checkpoint")) - if err != nil { - return err - } - f.Close() + f, err := os.Create(filepath.Join(c.root, "checkpoint")) + if err != nil { + return err } - break - + f.Close() case notify.GetScript() == "network-unlock": if err := unlockNetwork(c.config); err != nil { return err } - break - case notify.GetScript() == "network-lock": if err := lockNetwork(c.config); err != nil { return err } - break - case notify.GetScript() == "post-restore": pid := notify.GetPid() r, err := newRestoredProcess(int(pid), fds) if err != nil { return err } - - // TODO: crosbymichael restore previous process information by saving the init process information in - // the container's state file or separate process state files. 
+ process.ops = r + if err := c.state.transition(&restoredState{ + imageDir: opts.ImagesDirectory, + c: c, + }); err != nil { + return err + } if err := c.updateState(r); err != nil { return err } - process.ops = r - break + if err := os.Remove(filepath.Join(c.root, "checkpoint")); err != nil { + if !os.IsNotExist(err) { + logrus.Error(err) + } + } } - return nil } @@ -958,66 +956,130 @@ func (c *linuxContainer) updateState(process parentProcess) error { if err != nil { return err } + return c.saveState(state) +} + +func (c *linuxContainer) saveState(s *State) error { f, err := os.Create(filepath.Join(c.root, stateFilename)) if err != nil { return err } defer f.Close() - os.Remove(filepath.Join(c.root, "checkpoint")) - return json.NewEncoder(f).Encode(state) + return utils.WriteJSON(f, s) +} + +func (c *linuxContainer) deleteState() error { + return os.Remove(filepath.Join(c.root, stateFilename)) } func (c *linuxContainer) currentStatus() (Status, error) { - if _, err := os.Stat(filepath.Join(c.root, "checkpoint")); err == nil { - return Checkpointed, nil + if err := c.refreshState(); err != nil { + return -1, err } + return c.state.status(), nil +} + +// refreshState needs to be called to verify that the current state on the +// container is what is true. Because consumers of libcontainer can use it +// out of process we need to verify the container's status based on runtime +// information and not rely on our in process info. 
+func (c *linuxContainer) refreshState() error { + paused, err := c.isPaused() + if err != nil { + return err + } + if paused { + return c.state.transition(&pausedState{c: c}) + } + running, err := c.isRunning() + if err != nil { + return err + } + if running { + return c.state.transition(&runningState{c: c}) + } + return c.state.transition(&stoppedState{c: c}) +} + +func (c *linuxContainer) isRunning() (bool, error) { if c.initProcess == nil { - return Destroyed, nil + return false, nil } // return Running if the init process is alive if err := syscall.Kill(c.initProcess.pid(), 0); err != nil { if err == syscall.ESRCH { - return Destroyed, nil + return false, nil } - return 0, newSystemError(err) + return false, newSystemError(err) } - if c.config.Cgroups != nil && c.config.Cgroups.Freezer == configs.Frozen { - return Paused, nil + return true, nil +} + +func (c *linuxContainer) isPaused() (bool, error) { + data, err := ioutil.ReadFile(filepath.Join(c.cgroupManager.GetPaths()["freezer"], "freezer.state")) + if err != nil { + if os.IsNotExist(err) { + return false, nil + } + return false, newSystemError(err) } - return Running, nil + return bytes.Equal(bytes.TrimSpace(data), []byte("FROZEN")), nil } func (c *linuxContainer) currentState() (*State, error) { - status, err := c.currentStatus() - if err != nil { - return nil, err - } - if status == Destroyed { - return nil, newGenericError(fmt.Errorf("container destroyed"), ContainerNotExists) - } - startTime, err := c.initProcess.startTime() - if err != nil { - return nil, newSystemError(err) + var ( + startTime string + externalDescriptors []string + pid = -1 + ) + if c.initProcess != nil { + pid = c.initProcess.pid() + startTime, _ = c.initProcess.startTime() + externalDescriptors = c.initProcess.externalDescriptors() } state := &State{ BaseState: BaseState{ ID: c.ID(), Config: *c.config, - InitProcessPid: c.initProcess.pid(), + InitProcessPid: pid, InitProcessStartTime: startTime, }, CgroupPaths: 
c.cgroupManager.GetPaths(), NamespacePaths: make(map[configs.NamespaceType]string), - ExternalDescriptors: c.initProcess.externalDescriptors(), + ExternalDescriptors: externalDescriptors, } - for _, ns := range c.config.Namespaces { - state.NamespacePaths[ns.Type] = ns.GetPath(c.initProcess.pid()) - } - for _, nsType := range configs.NamespaceTypes() { - if _, ok := state.NamespacePaths[nsType]; !ok { - ns := configs.Namespace{Type: nsType} - state.NamespacePaths[ns.Type] = ns.GetPath(c.initProcess.pid()) + if pid > 0 { + for _, ns := range c.config.Namespaces { + state.NamespacePaths[ns.Type] = ns.GetPath(pid) + } + for _, nsType := range configs.NamespaceTypes() { + if _, ok := state.NamespacePaths[nsType]; !ok { + ns := configs.Namespace{Type: nsType} + state.NamespacePaths[ns.Type] = ns.GetPath(pid) + } } } return state, nil } + +// bootstrapData encodes the necessary data in netlink binary format as a io.Reader. +// Consumer can write the data to a bootstrap program such as one that uses +// nsenter package to bootstrap the container's init process correctly, i.e. with +// correct namespaces, uid/gid mapping etc. 
+func (c *linuxContainer) bootstrapData(cloneFlags uintptr, pid int, consolePath string) (io.Reader, error) { + // create the netlink message + r := nl.NewNetlinkRequest(int(InitMsg), 0) + // write pid + r.AddData(&Int32msg{ + Type: PidAttr, + Value: uint32(pid), + }) + // write console path + if consolePath != "" { + r.AddData(&Bytemsg{ + Type: ConsolePathAttr, + Value: []byte(consolePath), + }) + } + return bytes.NewReader(r.Serialize()), nil +} diff --git a/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/error.go b/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/error.go index 6c266620e7b..378ef469434 100644 --- a/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/error.go +++ b/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/error.go @@ -16,12 +16,14 @@ const ( ContainerPaused ContainerNotStopped ContainerNotRunning + ContainerNotPaused // Process errors ProcessNotExecuted // Common errors ConfigInvalid + ConsoleExists SystemError ) @@ -43,6 +45,10 @@ func (c ErrorCode) String() string { return "Container is not stopped" case ContainerNotRunning: return "Container is not running" + case ConsoleExists: + return "Console exists for process" + case ContainerNotPaused: + return "Container is not paused" default: return "Unknown error" } diff --git a/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/factory_linux.go b/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/factory_linux.go index 70513f7b671..0e4e9dfd5ab 100644 --- a/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/factory_linux.go +++ b/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/factory_linux.go @@ -5,7 +5,6 @@ package libcontainer import ( "encoding/json" "fmt" - "io/ioutil" "os" "os/exec" "path/filepath" @@ -19,6 +18,7 @@ import ( "github.com/opencontainers/runc/libcontainer/cgroups/systemd" "github.com/opencontainers/runc/libcontainer/configs" 
"github.com/opencontainers/runc/libcontainer/configs/validate" + "github.com/opencontainers/runc/libcontainer/utils" ) const ( @@ -166,7 +166,7 @@ func (l *LinuxFactory) Create(id string, config *configs.Config) (Container, err if err := os.MkdirAll(containerRoot, 0700); err != nil { return nil, newGenericError(err, SystemError) } - return &linuxContainer{ + c := &linuxContainer{ id: id, root: containerRoot, config: config, @@ -174,7 +174,9 @@ func (l *LinuxFactory) Create(id string, config *configs.Config) (Container, err initArgs: l.InitArgs, criuPath: l.CriuPath, cgroupManager: l.NewCgroupsManager(config.Cgroups, nil), - }, nil + } + c.state = &stoppedState{c: c} + return c, nil } func (l *LinuxFactory) Load(id string) (Container, error) { @@ -191,7 +193,7 @@ func (l *LinuxFactory) Load(id string) (Container, error) { processStartTime: state.InitProcessStartTime, fds: state.ExternalDescriptors, } - return &linuxContainer{ + c := &linuxContainer{ initProcess: r, id: id, config: &state.Config, @@ -200,7 +202,12 @@ func (l *LinuxFactory) Load(id string) (Container, error) { criuPath: l.CriuPath, cgroupManager: l.NewCgroupsManager(state.Config.Cgroups, state.CgroupPaths), root: containerRoot, - }, nil + } + c.state = &createdState{c: c, s: Created} + if err := c.refreshState(); err != nil { + return nil, err + } + return c, nil } func (l *LinuxFactory) Type() string { @@ -222,21 +229,29 @@ func (l *LinuxFactory) StartInitialization() (err error) { // clear the current process's environment to clean any libcontainer // specific env vars. os.Clearenv() + var i initer defer func() { // if we have an error during the initialization of the container's init then send it back to the // parent process in the form of an initError. if err != nil { - // ensure that any data sent from the parent is consumed so it doesn't - // receive ECONNRESET when the child writes to the pipe. 
- ioutil.ReadAll(pipe) - if err := json.NewEncoder(pipe).Encode(newSystemError(err)); err != nil { + if _, ok := i.(*linuxStandardInit); ok { + // Synchronisation only necessary for standard init. + if err := utils.WriteJSON(pipe, syncT{procError}); err != nil { + panic(err) + } + } + if err := utils.WriteJSON(pipe, newSystemError(err)); err != nil { + panic(err) + } + } else { + if err := utils.WriteJSON(pipe, syncT{procStart}); err != nil { panic(err) } } // ensure that this pipe is always closed pipe.Close() }() - i, err := newContainerInit(it, pipe) + i, err = newContainerInit(it, pipe) if err != nil { return err } diff --git a/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/generic_error.go b/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/generic_error.go index 6fbc2d75a58..924d637b25d 100644 --- a/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/generic_error.go +++ b/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/generic_error.go @@ -9,6 +9,19 @@ import ( "github.com/opencontainers/runc/libcontainer/stacktrace" ) +type syncType uint8 + +const ( + procReady syncType = iota + procError + procStart + procRun +) + +type syncT struct { + Type syncType `json:"type"` +} + var errorTemplate = template.Must(template.New("error").Parse(`Timestamp: {{.Timestamp}} Code: {{.ECode}} {{if .Message }} diff --git a/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/hack/validate.sh b/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/hack/validate.sh deleted file mode 100644 index 0bf1541ff38..00000000000 --- a/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/hack/validate.sh +++ /dev/null @@ -1,16 +0,0 @@ -#!/usr/bin/env bash -set -e - -# This script runs all validations - -validate() { - export MAKEDIR=/go/src/github.com/docker/docker/hack/make - sed -i 's!docker/docker!opencontainers/runc/libcontainer!' 
/go/src/github.com/docker/docker/hack/make/.validate - bash /go/src/github.com/docker/docker/hack/make/validate-dco - bash /go/src/github.com/docker/docker/hack/make/validate-gofmt - go get golang.org/x/tools/cmd/vet - bash /go/src/github.com/docker/docker/hack/make/validate-vet -} - -# run validations -validate diff --git a/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/init_linux.go b/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/init_linux.go index ddb11865958..918f1030167 100644 --- a/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/init_linux.go +++ b/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/init_linux.go @@ -5,6 +5,7 @@ package libcontainer import ( "encoding/json" "fmt" + "io" "io/ioutil" "net" "os" @@ -73,6 +74,7 @@ func newContainerInit(t initType, pipe *os.File) (initer, error) { }, nil case initStandard: return &linuxStandardInit{ + pipe: pipe, parentPid: syscall.Getppid(), config: config, }, nil @@ -140,6 +142,27 @@ func finalizeNamespace(config *initConfig) error { return nil } +// syncParentReady sends to the given pipe a JSON payload which indicates that +// the init is ready to Exec the child process. It then waits for the parent to +// indicate that it is cleared to Exec. +func syncParentReady(pipe io.ReadWriter) error { + // Tell parent. + if err := utils.WriteJSON(pipe, syncT{procReady}); err != nil { + return err + } + // Wait for parent to give the all-clear. + var procSync syncT + if err := json.NewDecoder(pipe).Decode(&procSync); err != nil { + if err == io.EOF { + return fmt.Errorf("parent closed synchronisation channel") + } + if procSync.Type != procRun { + return fmt.Errorf("invalid synchronisation flag from parent") + } + } + return nil +} + // joinExistingNamespaces gets all the namespace paths specified for the container and // does a setns on the namespace fd so that the current process joins the namespace. 
func joinExistingNamespaces(namespaces []configs.Namespace) error { @@ -309,7 +332,7 @@ func killCgroupProcesses(m cgroups.Manager) error { if err := m.Freeze(configs.Frozen); err != nil { logrus.Warn(err) } - pids, err := m.GetPids() + pids, err := m.GetAllPids() if err != nil { m.Freeze(configs.Thawed) return err diff --git a/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/message_linux.go b/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/message_linux.go new file mode 100644 index 00000000000..0c3301f2bb5 --- /dev/null +++ b/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/message_linux.go @@ -0,0 +1,62 @@ +// +build linux + +package libcontainer + +import ( + "syscall" + + "github.com/vishvananda/netlink/nl" +) + +// list of known message types we want to send to bootstrap program +// The number is randomly chosen to not conflict with known netlink types +const ( + InitMsg uint16 = 62000 + PidAttr uint16 = 27281 + ConsolePathAttr uint16 = 27282 + // When syscall.NLA_HDRLEN is in gccgo, take this out. 
+ syscall_NLA_HDRLEN = (syscall.SizeofNlAttr + syscall.NLA_ALIGNTO - 1) & ^(syscall.NLA_ALIGNTO - 1) +) + +type Int32msg struct { + Type uint16 + Value uint32 +} + +// int32msg has the following representation +// | nlattr len | nlattr type | +// | uint32 value | +func (msg *Int32msg) Serialize() []byte { + buf := make([]byte, msg.Len()) + native := nl.NativeEndian() + native.PutUint16(buf[0:2], uint16(msg.Len())) + native.PutUint16(buf[2:4], msg.Type) + native.PutUint32(buf[4:8], msg.Value) + return buf +} + +func (msg *Int32msg) Len() int { + return syscall_NLA_HDRLEN + 4 +} + +// bytemsg has the following representation +// | nlattr len | nlattr type | +// | value | pad | +type Bytemsg struct { + Type uint16 + Value []byte +} + +func (msg *Bytemsg) Serialize() []byte { + l := msg.Len() + buf := make([]byte, (l+syscall.NLA_ALIGNTO-1) & ^(syscall.NLA_ALIGNTO-1)) + native := nl.NativeEndian() + native.PutUint16(buf[0:2], uint16(l)) + native.PutUint16(buf[2:4], msg.Type) + copy(buf[4:], msg.Value) + return buf +} + +func (msg *Bytemsg) Len() int { + return syscall_NLA_HDRLEN + len(msg.Value) + 1 // null-terminated +} diff --git a/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/network_linux.go b/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/network_linux.go index ce93277a522..5075bee4db5 100644 --- a/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/network_linux.go +++ b/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/network_linux.go @@ -93,7 +93,7 @@ func (l *loopback) create(n *network, nspid int) error { } func (l *loopback) initialize(config *network) error { - return netlink.LinkSetUp(&netlink.Device{netlink.LinkAttrs{Name: "lo"}}) + return netlink.LinkSetUp(&netlink.Device{LinkAttrs: netlink.LinkAttrs{Name: "lo"}}) } func (l *loopback) attach(n *configs.Network) (err error) { @@ -111,7 +111,7 @@ type veth struct { } func (v *veth) detach(n *configs.Network) (err error) { - return 
netlink.LinkSetMaster(&netlink.Device{netlink.LinkAttrs{Name: n.HostInterfaceName}}, nil) + return netlink.LinkSetMaster(&netlink.Device{LinkAttrs: netlink.LinkAttrs{Name: n.HostInterfaceName}}, nil) } // attach a container network interface to an external network diff --git a/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/notify_linux.go b/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/notify_linux.go index cf81e24d442..839a50c55a3 100644 --- a/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/notify_linux.go +++ b/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/notify_linux.go @@ -12,31 +12,32 @@ import ( const oomCgroupName = "memory" -// notifyOnOOM returns channel on which you can expect event about OOM, -// if process died without OOM this channel will be closed. -// s is current *libcontainer.State for container. -func notifyOnOOM(paths map[string]string) (<-chan struct{}, error) { - dir := paths[oomCgroupName] - if dir == "" { - return nil, fmt.Errorf("There is no path for %q in state", oomCgroupName) - } - oomControl, err := os.Open(filepath.Join(dir, "memory.oom_control")) +type PressureLevel uint + +const ( + LowPressure PressureLevel = iota + MediumPressure + CriticalPressure +) + +func registerMemoryEvent(cgDir string, evName string, arg string) (<-chan struct{}, error) { + evFile, err := os.Open(filepath.Join(cgDir, evName)) if err != nil { return nil, err } fd, _, syserr := syscall.RawSyscall(syscall.SYS_EVENTFD2, 0, syscall.FD_CLOEXEC, 0) if syserr != 0 { - oomControl.Close() + evFile.Close() return nil, syserr } eventfd := os.NewFile(fd, "eventfd") - eventControlPath := filepath.Join(dir, "cgroup.event_control") - data := fmt.Sprintf("%d %d", eventfd.Fd(), oomControl.Fd()) + eventControlPath := filepath.Join(cgDir, "cgroup.event_control") + data := fmt.Sprintf("%d %d %s", eventfd.Fd(), evFile.Fd(), arg) if err := ioutil.WriteFile(eventControlPath, []byte(data), 0700); err != 
nil { eventfd.Close() - oomControl.Close() + evFile.Close() return nil, err } ch := make(chan struct{}) @@ -44,7 +45,7 @@ func notifyOnOOM(paths map[string]string) (<-chan struct{}, error) { defer func() { close(ch) eventfd.Close() - oomControl.Close() + evFile.Close() }() buf := make([]byte, 8) for { @@ -61,3 +62,28 @@ func notifyOnOOM(paths map[string]string) (<-chan struct{}, error) { }() return ch, nil } + +// notifyOnOOM returns channel on which you can expect event about OOM, +// if process died without OOM this channel will be closed. +func notifyOnOOM(paths map[string]string) (<-chan struct{}, error) { + dir := paths[oomCgroupName] + if dir == "" { + return nil, fmt.Errorf("path %q missing", oomCgroupName) + } + + return registerMemoryEvent(dir, "memory.oom_control", "") +} + +func notifyMemoryPressure(paths map[string]string, level PressureLevel) (<-chan struct{}, error) { + dir := paths[oomCgroupName] + if dir == "" { + return nil, fmt.Errorf("path %q missing", oomCgroupName) + } + + if level > CriticalPressure { + return nil, fmt.Errorf("invalid pressure level %d", level) + } + + levelStr := []string{"low", "medium", "critical"}[level] + return registerMemoryEvent(dir, "memory.pressure_level", levelStr) +} diff --git a/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/nsenter/nsexec.c b/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/nsenter/nsexec.c index 01450a90a6b..27e6e53d4da 100644 --- a/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/nsenter/nsexec.c +++ b/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/nsenter/nsexec.c @@ -17,6 +17,11 @@ #include #include +#include +#include +#include +#include + /* All arguments should be above stack, because it grows down */ struct clone_arg { /* @@ -63,24 +68,33 @@ static int clone_parent(jmp_buf * env) return child; } +static uint32_t readint32(char *buf) +{ + return *(uint32_t *) buf; +} + +// list of known message types we want to send to 
bootstrap program +// These are defined in libcontainer/message_linux.go +#define INIT_MSG 62000 +#define PID_ATTR 27281 +#define CONSOLE_PATH_ATTR 27282 + void nsexec() { char *namespaces[] = { "ipc", "uts", "net", "pid", "mnt", "user" }; const int num = sizeof(namespaces) / sizeof(char *); jmp_buf env; char buf[PATH_MAX], *val; - int i, tfd, self_tfd, child, len, pipenum, consolefd = -1; - pid_t pid; - char *console; + int i, tfd, self_tfd, child, n, len, pipenum, consolefd = -1; + pid_t pid = 0; - val = getenv("_LIBCONTAINER_INITPID"); - if (val == NULL) + // if we dont have INITTYPE or this is the init process, skip the bootstrap process + val = getenv("_LIBCONTAINER_INITTYPE"); + if (val == NULL || strcmp(val, "standard") == 0) { return; - - pid = atoi(val); - snprintf(buf, sizeof(buf), "%d", pid); - if (strcmp(val, buf)) { - pr_perror("Unable to parse _LIBCONTAINER_INITPID"); + } + if (strcmp(val, "setns") != 0) { + pr_perror("Invalid inittype %s", val); exit(1); } @@ -89,7 +103,6 @@ void nsexec() pr_perror("Child pipe not found"); exit(1); } - pipenum = atoi(val); snprintf(buf, sizeof(buf), "%d", pipenum); if (strcmp(val, buf)) { @@ -97,13 +110,56 @@ void nsexec() exit(1); } - console = getenv("_LIBCONTAINER_CONSOLE_PATH"); - if (console != NULL) { - consolefd = open(console, O_RDWR); - if (consolefd < 0) { - pr_perror("Failed to open console %s", console); - exit(1); + char nlbuf[NLMSG_HDRLEN]; + struct nlmsghdr *nh; + if ((n = read(pipenum, nlbuf, NLMSG_HDRLEN)) != NLMSG_HDRLEN) { + pr_perror("Failed to read netlink header, got %d", n); + exit(1); + } + + nh = (struct nlmsghdr *)nlbuf; + if (nh->nlmsg_type == NLMSG_ERROR) { + pr_perror("Invalid netlink header message"); + exit(1); + } + if (nh->nlmsg_type != INIT_MSG) { + pr_perror("Unexpected netlink message type %d", nh->nlmsg_type); + exit(1); + } + // read the netlink payload + len = NLMSG_PAYLOAD(nh, 0); + char data[len]; + if ((n = read(pipenum, data, len)) != len) { + pr_perror("Failed to read 
netlink payload, got %d", n); + exit(1); + } + + int start = 0; + struct nlattr *attr; + while (start < len) { + int payload_len; + attr = (struct nlattr *)((void *)data + start); + start += NLA_HDRLEN; + payload_len = attr->nla_len - NLA_HDRLEN; + switch (attr->nla_type) { + case PID_ATTR: + pid = (pid_t) readint32(data + start); + break; + case CONSOLE_PATH_ATTR: + consolefd = open((char *)data + start, O_RDWR); + if (consolefd < 0) { + pr_perror("Failed to open console %s", (char *)data + start); + exit(1); + } + break; } + start += NLA_ALIGN(payload_len); + } + + // required pid to be passed + if (pid == 0) { + pr_perror("missing pid"); + exit(1); } /* Check that the specified process exists */ @@ -133,15 +189,13 @@ void nsexec() } /* Skip namespaces we're already part of */ - if (fstatat(self_tfd, namespaces[i], &self_st, 0) != -1 && - st.st_ino == self_st.st_ino) { + if (fstatat(self_tfd, namespaces[i], &self_st, 0) != -1 && st.st_ino == self_st.st_ino) { continue; } fd = openat(tfd, namespaces[i], O_RDONLY); if (fd == -1) { - pr_perror("Failed to open ns file %s for ns %s", buf, - namespaces[i]); + pr_perror("Failed to open ns file %s for ns %s", buf, namespaces[i]); exit(1); } // Set the namespace. 
diff --git a/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/process.go b/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/process.go index 7902d08ce4c..9661df80a81 100644 --- a/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/process.go +++ b/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/process.go @@ -78,12 +78,28 @@ func (p Process) Signal(sig os.Signal) error { return p.ops.signal(sig) } +// IO holds the process's STDIO +type IO struct { + Stdin io.WriteCloser + Stdout io.ReadCloser + Stderr io.ReadCloser +} + // NewConsole creates new console for process and returns it func (p *Process) NewConsole(rootuid int) (Console, error) { - console, err := newConsole(rootuid, rootuid) + console, err := NewConsole(rootuid, rootuid) if err != nil { return nil, err } p.consolePath = console.Path() return console, nil } + +// ConsoleFromPath sets the process's console with the path provided +func (p *Process) ConsoleFromPath(path string) error { + if p.consolePath != "" { + return newGenericError(fmt.Errorf("console path already exists for process"), ConsoleExists) + } + p.consolePath = path + return nil +} diff --git a/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/process_linux.go b/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/process_linux.go index 4d17cbc5768..ac457c26c8a 100644 --- a/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/process_linux.go +++ b/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/process_linux.go @@ -5,6 +5,7 @@ package libcontainer import ( "encoding/json" "errors" + "fmt" "io" "os" "os/exec" @@ -15,6 +16,7 @@ import ( "github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runc/libcontainer/system" + "github.com/opencontainers/runc/libcontainer/utils" ) type parentProcess interface { @@ -41,13 +43,14 @@ type parentProcess 
interface { } type setnsProcess struct { - cmd *exec.Cmd - parentPipe *os.File - childPipe *os.File - cgroupPaths map[string]string - config *initConfig - fds []string - process *Process + cmd *exec.Cmd + parentPipe *os.File + childPipe *os.File + cgroupPaths map[string]string + config *initConfig + fds []string + process *Process + bootstrapData io.Reader } func (p *setnsProcess) startTime() (string, error) { @@ -64,6 +67,16 @@ func (p *setnsProcess) signal(sig os.Signal) error { func (p *setnsProcess) start() (err error) { defer p.parentPipe.Close() + err = p.cmd.Start() + p.childPipe.Close() + if err != nil { + return newSystemError(err) + } + if p.bootstrapData != nil { + if _, err := io.Copy(p.parentPipe, p.bootstrapData); err != nil { + return newSystemError(err) + } + } if err = p.execSetns(); err != nil { return newSystemError(err) } @@ -72,9 +85,10 @@ func (p *setnsProcess) start() (err error) { return newSystemError(err) } } - if err := json.NewEncoder(p.parentPipe).Encode(p.config); err != nil { + if err := utils.WriteJSON(p.parentPipe, p.config); err != nil { return newSystemError(err) } + if err := syscall.Shutdown(int(p.parentPipe.Fd()), syscall.SHUT_WR); err != nil { return newSystemError(err) } @@ -84,6 +98,7 @@ func (p *setnsProcess) start() (err error) { if err := json.NewDecoder(p.parentPipe).Decode(&ierr); err != nil && err != io.EOF { return newSystemError(err) } + // Must be done after Shutdown so the child will exit and we can wait for it. if ierr != nil { p.wait() return newSystemError(ierr) @@ -96,11 +111,6 @@ func (p *setnsProcess) start() (err error) { // before the go runtime boots, we wait on the process to die and receive the child's pid // over the provided pipe. 
func (p *setnsProcess) execSetns() error { - err := p.cmd.Start() - p.childPipe.Close() - if err != nil { - return newSystemError(err) - } status, err := p.cmd.Process.Wait() if err != nil { p.cmd.Wait() @@ -192,7 +202,6 @@ func (p *initProcess) start() (err error) { return newSystemError(err) } p.setExternalDescriptors(fds) - // Do this before syncing with child so that no children // can escape the cgroup if err := p.manager.Apply(p.pid()); err != nil { @@ -223,13 +232,56 @@ func (p *initProcess) start() (err error) { if err := p.sendConfig(); err != nil { return newSystemError(err) } - // wait for the child process to fully complete and receive an error message - // if one was encoutered - var ierr *genericError - if err := json.NewDecoder(p.parentPipe).Decode(&ierr); err != nil && err != io.EOF { + var ( + procSync syncT + sentRun bool + ierr *genericError + ) + +loop: + for { + if err := json.NewDecoder(p.parentPipe).Decode(&procSync); err != nil { + if err == io.EOF { + break loop + } + return newSystemError(err) + } + switch procSync.Type { + case procStart: + break loop + case procReady: + if err := p.manager.Set(p.config.Config); err != nil { + return newSystemError(err) + } + // Sync with child. + if err := utils.WriteJSON(p.parentPipe, syncT{procRun}); err != nil { + return newSystemError(err) + } + sentRun = true + case procError: + // wait for the child process to fully complete and receive an error message + // if one was encoutered + if err := json.NewDecoder(p.parentPipe).Decode(&ierr); err != nil && err != io.EOF { + return newSystemError(err) + } + if ierr != nil { + break loop + } + // Programmer error. 
+ panic("No error following JSON procError payload.") + default: + return newSystemError(fmt.Errorf("invalid JSON synchronisation payload from child")) + } + } + if !sentRun { + return newSystemError(fmt.Errorf("could not synchronise with container process")) + } + if err := syscall.Shutdown(int(p.parentPipe.Fd()), syscall.SHUT_WR); err != nil { return newSystemError(err) } + // Must be done after Shutdown so the child will exit and we can wait for it. if ierr != nil { + p.wait() return newSystemError(ierr) } return nil @@ -264,11 +316,7 @@ func (p *initProcess) startTime() (string, error) { func (p *initProcess) sendConfig() error { // send the state to the container's init process then shutdown writes for the parent - if err := json.NewEncoder(p.parentPipe).Encode(p.config); err != nil { - return err - } - // shutdown writes for the parent side of the pipe - return syscall.Shutdown(int(p.parentPipe.Fd()), syscall.SHUT_WR) + return utils.WriteJSON(p.parentPipe, p.config) } func (p *initProcess) createNetworkInterfaces() error { @@ -314,3 +362,44 @@ func getPipeFds(pid int) ([]string, error) { } return fds, nil } + +// InitializeIO creates pipes for use with the process's STDIO +// and returns the opposite side for each +func (p *Process) InitializeIO(rootuid int) (i *IO, err error) { + var fds []uintptr + i = &IO{} + // cleanup in case of an error + defer func() { + if err != nil { + for _, fd := range fds { + syscall.Close(int(fd)) + } + } + }() + // STDIN + r, w, err := os.Pipe() + if err != nil { + return nil, err + } + fds = append(fds, r.Fd(), w.Fd()) + p.Stdin, i.Stdin = r, w + // STDOUT + if r, w, err = os.Pipe(); err != nil { + return nil, err + } + fds = append(fds, r.Fd(), w.Fd()) + p.Stdout, i.Stdout = w, r + // STDERR + if r, w, err = os.Pipe(); err != nil { + return nil, err + } + fds = append(fds, r.Fd(), w.Fd()) + p.Stderr, i.Stderr = w, r + // change ownership of the pipes incase we are in a user namespace + for _, fd := range fds { + if err := 
syscall.Fchown(int(fd), rootuid, rootuid); err != nil { + return nil, err + } + } + return i, nil +} diff --git a/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/rootfs_linux.go b/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/rootfs_linux.go index 5a2fad88183..aa061ab0817 100644 --- a/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/rootfs_linux.go +++ b/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/rootfs_linux.go @@ -18,6 +18,7 @@ import ( "github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runc/libcontainer/label" + "github.com/opencontainers/runc/libcontainer/system" ) const defaultMountFlags = syscall.MS_NOEXEC | syscall.MS_NOSUID | syscall.MS_NODEV @@ -299,6 +300,24 @@ func checkMountDestination(rootfs, dest string) error { invalidDestinations := []string{ "/proc", } + // White list, it should be sub directories of invalid destinations + validDestinations := []string{ + // These entries can be bind mounted by files emulated by fuse, + // so commands like top, free displays stats in container. + "/proc/cpuinfo", + "/proc/diskstats", + "/proc/meminfo", + "/proc/stats", + } + for _, valid := range validDestinations { + path, err := filepath.Rel(filepath.Join(rootfs, valid), dest) + if err != nil { + return err + } + if path == "." { + return nil + } + } for _, invalid := range invalidDestinations { path, err := filepath.Rel(filepath.Join(rootfs, invalid), dest) if err != nil { @@ -365,11 +384,12 @@ func reOpenDevNull() error { // Create the device nodes in the container. func createDevices(config *configs.Config) error { + useBindMount := system.RunningInUserNS() || config.Namespaces.Contains(configs.NEWUSER) oldMask := syscall.Umask(0000) for _, node := range config.Devices { // containers running in a user namespace are not allowed to mknod // devices so we can just bind mount it from the host. 
- if err := createDeviceNode(config.Rootfs, node, config.Namespaces.Contains(configs.NEWUSER)); err != nil { + if err := createDeviceNode(config.Rootfs, node, useBindMount); err != nil { syscall.Umask(oldMask) return err } diff --git a/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/seccomp/fixtures/proc_self_status b/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/seccomp/fixtures/proc_self_status new file mode 100644 index 00000000000..0e0084f6c28 --- /dev/null +++ b/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/seccomp/fixtures/proc_self_status @@ -0,0 +1,47 @@ +Name: cat +State: R (running) +Tgid: 19383 +Ngid: 0 +Pid: 19383 +PPid: 19275 +TracerPid: 0 +Uid: 1000 1000 1000 1000 +Gid: 1000 1000 1000 1000 +FDSize: 256 +Groups: 24 25 27 29 30 44 46 102 104 108 111 1000 1001 +NStgid: 19383 +NSpid: 19383 +NSpgid: 19383 +NSsid: 19275 +VmPeak: 5944 kB +VmSize: 5944 kB +VmLck: 0 kB +VmPin: 0 kB +VmHWM: 744 kB +VmRSS: 744 kB +VmData: 324 kB +VmStk: 136 kB +VmExe: 48 kB +VmLib: 1776 kB +VmPTE: 32 kB +VmPMD: 12 kB +VmSwap: 0 kB +Threads: 1 +SigQ: 0/30067 +SigPnd: 0000000000000000 +ShdPnd: 0000000000000000 +SigBlk: 0000000000000000 +SigIgn: 0000000000000080 +SigCgt: 0000000000000000 +CapInh: 0000000000000000 +CapPrm: 0000000000000000 +CapEff: 0000000000000000 +CapBnd: 0000003fffffffff +CapAmb: 0000000000000000 +Seccomp: 0 +Cpus_allowed: f +Cpus_allowed_list: 0-3 +Mems_allowed: 00000000,00000001 +Mems_allowed_list: 0 +voluntary_ctxt_switches: 0 +nonvoluntary_ctxt_switches: 1 diff --git a/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/seccomp/seccomp_linux.go b/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/seccomp/seccomp_linux.go index aff1b63a53a..623e227748d 100644 --- a/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/seccomp/seccomp_linux.go +++ b/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/seccomp/seccomp_linux.go @@ -3,8 +3,11 @@ package 
seccomp import ( + "bufio" "fmt" "log" + "os" + "strings" "syscall" "github.com/opencontainers/runc/libcontainer/configs" @@ -17,6 +20,9 @@ var ( actKill = libseccomp.ActKill actTrace = libseccomp.ActTrace.SetReturnCode(int16(syscall.EPERM)) actErrno = libseccomp.ActErrno.SetReturnCode(int16(syscall.EPERM)) + + // SeccompModeFilter refers to the syscall argument SECCOMP_MODE_FILTER. + SeccompModeFilter = uintptr(2) ) // Filters given syscalls in a container, preventing them from being used @@ -73,6 +79,24 @@ func InitSeccomp(config *configs.Seccomp) error { return nil } +// IsEnabled returns if the kernel has been configured to support seccomp. +func IsEnabled() bool { + // Try to read from /proc/self/status for kernels > 3.8 + s, err := parseStatusFile("/proc/self/status") + if err != nil { + // Check if Seccomp is supported, via CONFIG_SECCOMP. + if _, _, err := syscall.RawSyscall(syscall.SYS_PRCTL, syscall.PR_GET_SECCOMP, 0, 0); err != syscall.EINVAL { + // Make sure the kernel has CONFIG_SECCOMP_FILTER. 
+ if _, _, err := syscall.RawSyscall(syscall.SYS_PRCTL, syscall.PR_SET_SECCOMP, SeccompModeFilter, 0); err != syscall.EINVAL { + return true + } + } + return false + } + _, ok := s["Seccomp"] + return ok +} + // Convert Libcontainer Action to Libseccomp ScmpAction func getAction(act configs.Action) (libseccomp.ScmpAction, error) { switch act { @@ -178,3 +202,30 @@ func matchCall(filter *libseccomp.ScmpFilter, call *configs.Syscall) error { return nil } + +func parseStatusFile(path string) (map[string]string, error) { + f, err := os.Open(path) + if err != nil { + return nil, err + } + defer f.Close() + + s := bufio.NewScanner(f) + status := make(map[string]string) + + for s.Scan() { + if err := s.Err(); err != nil { + return nil, err + } + + text := s.Text() + parts := strings.Split(text, ":") + + if len(parts) <= 1 { + continue + } + + status[parts[0]] = parts[1] + } + return status, nil +} diff --git a/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/seccomp/seccomp_unsupported.go b/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/seccomp/seccomp_unsupported.go index 87d3abbc645..888483e7687 100644 --- a/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/seccomp/seccomp_unsupported.go +++ b/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/seccomp/seccomp_unsupported.go @@ -17,3 +17,8 @@ func InitSeccomp(config *configs.Seccomp) error { } return nil } + +// IsEnabled returns false, because it is not supported. 
+func IsEnabled() bool { + return false +} diff --git a/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/selinux/selinux.go b/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/selinux/selinux.go index 2771bb50e0b..88d612cade2 100644 --- a/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/selinux/selinux.go +++ b/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/selinux/selinux.go @@ -231,10 +231,14 @@ func ReserveLabel(scon string) { } } +func selinuxEnforcePath() string { + return fmt.Sprintf("%s/enforce", selinuxPath) +} + func SelinuxGetEnforce() int { var enforce int - enforceS, err := readCon(fmt.Sprintf("%s/enforce", selinuxPath)) + enforceS, err := readCon(selinuxEnforcePath()) if err != nil { return -1 } @@ -246,6 +250,10 @@ func SelinuxGetEnforce() int { return enforce } +func SelinuxSetEnforce(mode int) error { + return writeCon(selinuxEnforcePath(), fmt.Sprintf("%d", mode)) +} + func SelinuxGetEnforceMode() int { switch readConfig(selinuxTag) { case "enforcing": diff --git a/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/standard_init_linux.go b/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/standard_init_linux.go index ec1005789c5..d3b50863bd4 100644 --- a/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/standard_init_linux.go +++ b/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/standard_init_linux.go @@ -3,6 +3,7 @@ package libcontainer import ( + "io" "os" "syscall" @@ -14,6 +15,7 @@ import ( ) type linuxStandardInit struct { + pipe io.ReadWriter parentPid int config *initConfig } @@ -50,7 +52,6 @@ func (l *linuxStandardInit) Init() error { if err := setOomScoreAdj(l.config.Config.OomScoreAdj); err != nil { return err } - label.Init() // InitializeMountNamespace() can be executed only for a new mount namespace if l.config.Config.Namespaces.Contains(configs.NEWNS) { @@ -75,7 +76,6 @@ func (l *linuxStandardInit) 
Init() error { return err } } - for _, path := range l.config.Config.ReadonlyPaths { if err := remountReadonly(path); err != nil { return err @@ -90,6 +90,12 @@ func (l *linuxStandardInit) Init() error { if err != nil { return err } + // Tell our parent that we're ready to Execv. This must be done before the + // Seccomp rules have been applied, because we need to be able to read and + // write to a socket. + if err := syncParentReady(l.pipe); err != nil { + return err + } if l.config.Config.Seccomp != nil { if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil { return err diff --git a/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/state_linux.go b/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/state_linux.go new file mode 100644 index 00000000000..fb71ef97d39 --- /dev/null +++ b/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/state_linux.go @@ -0,0 +1,223 @@ +// +build linux + +package libcontainer + +import ( + "fmt" + "os" + "path/filepath" + + "github.com/Sirupsen/logrus" + "github.com/opencontainers/runc/libcontainer/configs" +) + +func newStateTransitionError(from, to containerState) error { + return &stateTransitionError{ + From: from.status().String(), + To: to.status().String(), + } +} + +// stateTransitionError is returned when an invalid state transition happens from one +// state to another. 
+type stateTransitionError struct { + From string + To string +} + +func (s *stateTransitionError) Error() string { + return fmt.Sprintf("invalid state transition from %s to %s", s.From, s.To) +} + +type containerState interface { + transition(containerState) error + destroy() error + status() Status +} + +func destroy(c *linuxContainer) error { + if !c.config.Namespaces.Contains(configs.NEWPID) { + if err := killCgroupProcesses(c.cgroupManager); err != nil { + logrus.Warn(err) + } + } + err := c.cgroupManager.Destroy() + if rerr := os.RemoveAll(c.root); err == nil { + err = rerr + } + c.initProcess = nil + if herr := runPoststopHooks(c); err == nil { + err = herr + } + c.state = &stoppedState{c: c} + return err +} + +func runPoststopHooks(c *linuxContainer) error { + if c.config.Hooks != nil { + s := configs.HookState{ + Version: c.config.Version, + ID: c.id, + Root: c.config.Rootfs, + } + for _, hook := range c.config.Hooks.Poststop { + if err := hook.Run(s); err != nil { + return err + } + } + } + return nil +} + +// stoppedState represents a container is a stopped/destroyed state. +type stoppedState struct { + c *linuxContainer +} + +func (b *stoppedState) status() Status { + return Destroyed +} + +func (b *stoppedState) transition(s containerState) error { + switch s.(type) { + case *runningState: + b.c.state = s + return nil + case *restoredState: + b.c.state = s + return nil + case *stoppedState: + return nil + } + return newStateTransitionError(b, s) +} + +func (b *stoppedState) destroy() error { + return destroy(b.c) +} + +// runningState represents a container that is currently running. 
+type runningState struct { + c *linuxContainer +} + +func (r *runningState) status() Status { + return Running +} + +func (r *runningState) transition(s containerState) error { + switch s.(type) { + case *stoppedState: + running, err := r.c.isRunning() + if err != nil { + return err + } + if running { + return newGenericError(fmt.Errorf("container still running"), ContainerNotStopped) + } + r.c.state = s + return nil + case *pausedState: + r.c.state = s + return nil + case *runningState: + return nil + } + return newStateTransitionError(r, s) +} + +func (r *runningState) destroy() error { + running, err := r.c.isRunning() + if err != nil { + return err + } + if running { + return newGenericError(fmt.Errorf("container is not destroyed"), ContainerNotStopped) + } + return destroy(r.c) +} + +// pausedState represents a container that is currently pause. It cannot be destroyed in a +// paused state and must transition back to running first. +type pausedState struct { + c *linuxContainer +} + +func (p *pausedState) status() Status { + return Paused +} + +func (p *pausedState) transition(s containerState) error { + switch s.(type) { + case *runningState, *stoppedState: + p.c.state = s + return nil + case *pausedState: + return nil + } + return newStateTransitionError(p, s) +} + +func (p *pausedState) destroy() error { + isRunning, err := p.c.isRunning() + if err != nil { + return err + } + if !isRunning { + if err := p.c.cgroupManager.Freeze(configs.Thawed); err != nil { + return err + } + return destroy(p.c) + } + return newGenericError(fmt.Errorf("container is paused"), ContainerPaused) +} + +// restoredState is the same as the running state but also has accociated checkpoint +// information that maybe need destroyed when the container is stopped and destory is called. 
+type restoredState struct { + imageDir string + c *linuxContainer +} + +func (r *restoredState) status() Status { + return Running +} + +func (r *restoredState) transition(s containerState) error { + switch s.(type) { + case *stoppedState: + return nil + case *runningState: + return nil + } + return newStateTransitionError(r, s) +} + +func (r *restoredState) destroy() error { + if _, err := os.Stat(filepath.Join(r.c.root, "checkpoint")); err != nil { + if !os.IsNotExist(err) { + return err + } + } + return destroy(r.c) +} + +// createdState is used whenever a container is restored, loaded, or setting additional +// processes inside and it should not be destroyed when it is exiting. +type createdState struct { + c *linuxContainer + s Status +} + +func (n *createdState) status() Status { + return n.s +} + +func (n *createdState) transition(s containerState) error { + n.c.state = s + return nil +} + +func (n *createdState) destroy() error { + return nil +} diff --git a/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/system/linux.go b/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/system/linux.go index 2cc3ef803a3..6c835e68ec2 100644 --- a/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/system/linux.go +++ b/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/system/linux.go @@ -3,6 +3,9 @@ package system import ( + "bufio" + "fmt" + "os" "os/exec" "syscall" "unsafe" @@ -75,3 +78,37 @@ func Setctty() error { } return nil } + +/* + * Detect whether we are currently running in a user namespace. 
+ * Copied from github.com/lxc/lxd/shared/util.go + */ +func RunningInUserNS() bool { + file, err := os.Open("/proc/self/uid_map") + if err != nil { + /* + * This kernel-provided file only exists if user namespaces are + * supported + */ + return false + } + defer file.Close() + + buf := bufio.NewReader(file) + l, _, err := buf.ReadLine() + if err != nil { + return false + } + + line := string(l) + var a, b, c int64 + fmt.Sscanf(line, "%d %d %d", &a, &b, &c) + /* + * We assume we are in the initial user namespace if we have a full + * range - 4294967295 uids starting at uid 0. + */ + if a == 0 && b == 0 && c == 4294967295 { + return false + } + return true +} diff --git a/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/utils/utils.go b/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/utils/utils.go index 86cf1d65e71..1378006b0a6 100644 --- a/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/utils/utils.go +++ b/Godeps/_workspace/src/github.com/opencontainers/runc/libcontainer/utils/utils.go @@ -3,6 +3,7 @@ package utils import ( "crypto/rand" "encoding/hex" + "encoding/json" "io" "path/filepath" "syscall" @@ -36,10 +37,20 @@ func ResolveRootfs(uncleanRootfs string) (string, error) { } // ExitStatus returns the correct exit status for a process based on if it -// was signaled or exited cleanly. 
+// was signaled or exited cleanly func ExitStatus(status syscall.WaitStatus) int { if status.Signaled() { return exitSignalOffset + int(status.Signal()) } return status.ExitStatus() } + +// WriteJSON writes the provided struct v to w using standard json marshaling +func WriteJSON(w io.Writer, v interface{}) error { + data, err := json.Marshal(v) + if err != nil { + return err + } + _, err = w.Write(data) + return err +} diff --git a/contrib/mesos/pkg/executor/node.go b/contrib/mesos/pkg/executor/node.go index 1fe3662492c..ac0c8590277 100644 --- a/contrib/mesos/pkg/executor/node.go +++ b/contrib/mesos/pkg/executor/node.go @@ -22,7 +22,7 @@ import ( type NodeInfo struct { Cores int - Mem int64 // in bytes + Mem uint64 // in bytes } func nodeInfo(si *mesos.SlaveInfo, ei *mesos.ExecutorInfo) NodeInfo { @@ -57,13 +57,13 @@ func nodeInfo(si *mesos.SlaveInfo, ei *mesos.ExecutorInfo) NodeInfo { // TODO(sttts): switch to float64 when "Machine Allocables" are implemented ni.Cores += int(r.GetScalar().GetValue()) case "mem": - ni.Mem += int64(r.GetScalar().GetValue()) * 1024 * 1024 + ni.Mem += uint64(r.GetScalar().GetValue()) * 1024 * 1024 } } // TODO(sttts): subtract executorCPU/Mem from static pod resources before subtracting them from the capacity ni.Cores -= int(executorCPU) - ni.Mem -= int64(executorMem) * 1024 * 1024 + ni.Mem -= uint64(executorMem) * 1024 * 1024 return ni } diff --git a/contrib/mesos/pkg/executor/service/cadvisor.go b/contrib/mesos/pkg/executor/service/cadvisor.go index a38a1523d02..663dbdbada1 100644 --- a/contrib/mesos/pkg/executor/service/cadvisor.go +++ b/contrib/mesos/pkg/executor/service/cadvisor.go @@ -25,10 +25,10 @@ import ( type MesosCadvisor struct { cadvisor.Interface cores int - mem int64 + mem uint64 } -func NewMesosCadvisor(cores int, mem int64, port uint) (*MesosCadvisor, error) { +func NewMesosCadvisor(cores int, mem uint64, port uint) (*MesosCadvisor, error) { c, err := cadvisor.New(port) if err != nil { return nil, err diff --git 
a/pkg/kubelet/cadvisor/util.go b/pkg/kubelet/cadvisor/util.go index 283349d4d3a..2dac2175652 100644 --- a/pkg/kubelet/cadvisor/util.go +++ b/pkg/kubelet/cadvisor/util.go @@ -28,7 +28,7 @@ func CapacityFromMachineInfo(info *cadvisorApi.MachineInfo) api.ResourceList { int64(info.NumCores*1000), resource.DecimalSI), api.ResourceMemory: *resource.NewQuantity( - info.MemoryCapacity, + int64(info.MemoryCapacity), resource.BinarySI), } return c diff --git a/pkg/kubelet/cm/container_manager_linux.go b/pkg/kubelet/cm/container_manager_linux.go index eb2f4aff778..31d016e9ca7 100644 --- a/pkg/kubelet/cm/container_manager_linux.go +++ b/pkg/kubelet/cm/container_manager_linux.go @@ -126,8 +126,11 @@ func NewContainerManager(mountUtil mount.Interface, cadvisorInterface cadvisor.I func createManager(containerName string) *fs.Manager { return &fs.Manager{ Cgroups: &configs.Cgroup{ - Name: containerName, - AllowAllDevices: true, + Parent: "/", + Name: containerName, + Resources: &configs.Resources{ + AllowAllDevices: true, + }, }, } } @@ -208,10 +211,13 @@ func (cm *containerManagerImpl) setupNode() error { dockerContainer := &fs.Manager{ Cgroups: &configs.Cgroup{ - Name: cm.DockerDaemonContainerName, - Memory: memoryLimit, - MemorySwap: -1, - AllowAllDevices: true, + Parent: "/", + Name: cm.DockerDaemonContainerName, + Resources: &configs.Resources{ + Memory: memoryLimit, + MemorySwap: -1, + AllowAllDevices: true, + }, }, } cont.ensureStateFunc = func(manager *fs.Manager) error { @@ -227,7 +233,8 @@ func (cm *containerManagerImpl) setupNode() error { rootContainer := &fs.Manager{ Cgroups: &configs.Cgroup{ - Name: "/", + Parent: "/", + Name: "/", }, } manager := createManager(cm.SystemContainerName) @@ -377,40 +384,23 @@ func ensureSystemContainer(rootContainer *fs.Manager, manager *fs.Manager) error continue } - // Get PIDs already in target group so we can remove them from the list of - // PIDs to move. 
- systemCgroupPIDs, err := manager.GetPids() - if err != nil { - errs = append(errs, fmt.Errorf("failed to list PIDs for %s: %v", manager.Cgroups.Name, err)) - continue - } - - systemCgroupPIDMap := make(map[int]struct{}, len(systemCgroupPIDs)) - for _, pid := range systemCgroupPIDs { - systemCgroupPIDMap[pid] = struct{}{} - } - - // Remove kernel pids and process 1 + // Remove kernel pids and other protected PIDs (pid 1, PIDs already in system & kubelet containers) pids := make([]int, 0, len(allPids)) for _, pid := range allPids { if isKernelPid(pid) { continue } - if _, ok := systemCgroupPIDMap[pid]; ok { - continue - } - pids = append(pids, pid) } - glog.Infof("Found %d PIDs in root, %d of them are kernel related", len(allPids), len(allPids)-len(pids)) + glog.Infof("Found %d PIDs in root, %d of them are not to be moved", len(allPids), len(allPids)-len(pids)) - // Check if we moved all the non-kernel PIDs. + // Check if we have moved all the non-kernel PIDs. if len(pids) == 0 { break } - glog.Infof("Moving non-kernel threads: %v", pids) + glog.Infof("Moving non-kernel processes: %v", pids) for _, pid := range pids { err := manager.Apply(pid) if err != nil { diff --git a/pkg/kubelet/dockertools/manager.go b/pkg/kubelet/dockertools/manager.go index 35e6f0c0717..2d1fa7d07b1 100644 --- a/pkg/kubelet/dockertools/manager.go +++ b/pkg/kubelet/dockertools/manager.go @@ -1572,7 +1572,7 @@ func (dm *DockerManager) runContainerInPod(pod *api.Pod, container *api.Containe if container.Name == PodInfraContainerName { oomScoreAdj = qos.PodInfraOOMAdj } else { - oomScoreAdj = qos.GetContainerOOMScoreAdjust(container, dm.machineInfo.MemoryCapacity) + oomScoreAdj = qos.GetContainerOOMScoreAdjust(container, int64(dm.machineInfo.MemoryCapacity)) } cgroupName, err := dm.procFs.GetFullContainerName(containerInfo.State.Pid) if err != nil { diff --git a/pkg/util/oom/oom_linux.go b/pkg/util/oom/oom_linux.go index 0c87ba4dd3b..9a89dbde261 100644 --- a/pkg/util/oom/oom_linux.go +++ 
b/pkg/util/oom/oom_linux.go @@ -41,7 +41,8 @@ func NewOOMAdjuster() *OOMAdjuster { func getPids(cgroupName string) ([]int, error) { fsManager := fs.Manager{ Cgroups: &configs.Cgroup{ - Name: cgroupName, + Parent: "/", + Name: cgroupName, }, } return fsManager.GetPids() diff --git a/pkg/util/resource_container_linux.go b/pkg/util/resource_container_linux.go index e0405dd3318..8d166045e04 100644 --- a/pkg/util/resource_container_linux.go +++ b/pkg/util/resource_container_linux.go @@ -33,8 +33,11 @@ import ( func RunInResourceContainer(containerName string) error { manager := fs.Manager{ Cgroups: &configs.Cgroup{ - Name: containerName, - AllowAllDevices: true, + Parent: "/", + Name: containerName, + Resources: &configs.Resources{ + AllowAllDevices: true, + }, }, }