Ensure kubelet pid is not moved to system container

This commit is contained in:
Jimmi Dyson 2016-01-06 23:36:48 +00:00
parent 6e6974a38f
commit 1c289943f5
62 changed files with 1765 additions and 774 deletions

76
Godeps/Godeps.json generated
View File

@ -579,93 +579,93 @@
},
{
"ImportPath": "github.com/google/cadvisor/api",
"Comment": "v0.20.4",
"Rev": "59488ce2c4197f501283739c6a4dd3169999f317"
"Comment": "v0.20.5",
"Rev": "9aa348ff5e191fcf3eccd59e5a434022aca77b87"
},
{
"ImportPath": "github.com/google/cadvisor/cache/memory",
"Comment": "v0.20.4",
"Rev": "59488ce2c4197f501283739c6a4dd3169999f317"
"Comment": "v0.20.5",
"Rev": "9aa348ff5e191fcf3eccd59e5a434022aca77b87"
},
{
"ImportPath": "github.com/google/cadvisor/collector",
"Comment": "v0.20.4",
"Rev": "59488ce2c4197f501283739c6a4dd3169999f317"
"Comment": "v0.20.5",
"Rev": "9aa348ff5e191fcf3eccd59e5a434022aca77b87"
},
{
"ImportPath": "github.com/google/cadvisor/container",
"Comment": "v0.20.4",
"Rev": "59488ce2c4197f501283739c6a4dd3169999f317"
"Comment": "v0.20.5",
"Rev": "9aa348ff5e191fcf3eccd59e5a434022aca77b87"
},
{
"ImportPath": "github.com/google/cadvisor/events",
"Comment": "v0.20.4",
"Rev": "59488ce2c4197f501283739c6a4dd3169999f317"
"Comment": "v0.20.5",
"Rev": "9aa348ff5e191fcf3eccd59e5a434022aca77b87"
},
{
"ImportPath": "github.com/google/cadvisor/fs",
"Comment": "v0.20.4",
"Rev": "59488ce2c4197f501283739c6a4dd3169999f317"
"Comment": "v0.20.5",
"Rev": "9aa348ff5e191fcf3eccd59e5a434022aca77b87"
},
{
"ImportPath": "github.com/google/cadvisor/healthz",
"Comment": "v0.20.4",
"Rev": "59488ce2c4197f501283739c6a4dd3169999f317"
"Comment": "v0.20.5",
"Rev": "9aa348ff5e191fcf3eccd59e5a434022aca77b87"
},
{
"ImportPath": "github.com/google/cadvisor/http",
"Comment": "v0.20.4",
"Rev": "59488ce2c4197f501283739c6a4dd3169999f317"
"Comment": "v0.20.5",
"Rev": "9aa348ff5e191fcf3eccd59e5a434022aca77b87"
},
{
"ImportPath": "github.com/google/cadvisor/info/v1",
"Comment": "v0.20.4",
"Rev": "59488ce2c4197f501283739c6a4dd3169999f317"
"Comment": "v0.20.5",
"Rev": "9aa348ff5e191fcf3eccd59e5a434022aca77b87"
},
{
"ImportPath": "github.com/google/cadvisor/info/v2",
"Comment": "v0.20.4",
"Rev": "59488ce2c4197f501283739c6a4dd3169999f317"
"Comment": "v0.20.5",
"Rev": "9aa348ff5e191fcf3eccd59e5a434022aca77b87"
},
{
"ImportPath": "github.com/google/cadvisor/manager",
"Comment": "v0.20.4",
"Rev": "59488ce2c4197f501283739c6a4dd3169999f317"
"Comment": "v0.20.5",
"Rev": "9aa348ff5e191fcf3eccd59e5a434022aca77b87"
},
{
"ImportPath": "github.com/google/cadvisor/metrics",
"Comment": "v0.20.4",
"Rev": "59488ce2c4197f501283739c6a4dd3169999f317"
"Comment": "v0.20.5",
"Rev": "9aa348ff5e191fcf3eccd59e5a434022aca77b87"
},
{
"ImportPath": "github.com/google/cadvisor/pages",
"Comment": "v0.20.4",
"Rev": "59488ce2c4197f501283739c6a4dd3169999f317"
"Comment": "v0.20.5",
"Rev": "9aa348ff5e191fcf3eccd59e5a434022aca77b87"
},
{
"ImportPath": "github.com/google/cadvisor/storage",
"Comment": "v0.20.4",
"Rev": "59488ce2c4197f501283739c6a4dd3169999f317"
"Comment": "v0.20.5",
"Rev": "9aa348ff5e191fcf3eccd59e5a434022aca77b87"
},
{
"ImportPath": "github.com/google/cadvisor/summary",
"Comment": "v0.20.4",
"Rev": "59488ce2c4197f501283739c6a4dd3169999f317"
"Comment": "v0.20.5",
"Rev": "9aa348ff5e191fcf3eccd59e5a434022aca77b87"
},
{
"ImportPath": "github.com/google/cadvisor/utils",
"Comment": "v0.20.4",
"Rev": "59488ce2c4197f501283739c6a4dd3169999f317"
"Comment": "v0.20.5",
"Rev": "9aa348ff5e191fcf3eccd59e5a434022aca77b87"
},
{
"ImportPath": "github.com/google/cadvisor/validate",
"Comment": "v0.20.4",
"Rev": "59488ce2c4197f501283739c6a4dd3169999f317"
"Comment": "v0.20.5",
"Rev": "9aa348ff5e191fcf3eccd59e5a434022aca77b87"
},
{
"ImportPath": "github.com/google/cadvisor/version",
"Comment": "v0.20.4",
"Rev": "59488ce2c4197f501283739c6a4dd3169999f317"
"Comment": "v0.20.5",
"Rev": "9aa348ff5e191fcf3eccd59e5a434022aca77b87"
},
{
"ImportPath": "github.com/google/gofuzz",
@ -793,8 +793,8 @@
},
{
"ImportPath": "github.com/opencontainers/runc/libcontainer",
"Comment": "v0.0.5",
"Rev": "97bc9a7faf3dd660d9be90a2880b2e37f3cdbf38"
"Comment": "v0.0.7",
"Rev": "7ca2aa4873aea7cb4265b1726acb24b90d8726c6"
},
{
"ImportPath": "github.com/pborman/uuid",

View File

@ -139,6 +139,7 @@ func newDockerContainerHandler(
rootFs: rootFs,
rootfsStorageDir: rootfsStorageDir,
fsHandler: newFsHandler(time.Minute, rootfsStorageDir, otherStorageDir, fsInfo),
envs: make(map[string]string),
}
// We assume that if Inspect fails then the container is not known to docker.
@ -206,36 +207,31 @@ func libcontainerConfigToContainerSpec(config *libcontainerconfigs.Config, mi *i
spec.HasMemory = true
spec.Memory.Limit = math.MaxUint64
spec.Memory.SwapLimit = math.MaxUint64
if config.Cgroups.Memory > 0 {
spec.Memory.Limit = uint64(config.Cgroups.Memory)
}
if config.Cgroups.MemorySwap > 0 {
spec.Memory.SwapLimit = uint64(config.Cgroups.MemorySwap)
}
// Get CPU info
spec.HasCpu = true
spec.Cpu.Limit = 1024
if config.Cgroups.CpuShares != 0 {
spec.Cpu.Limit = uint64(config.Cgroups.CpuShares)
if config.Cgroups.Resources != nil {
if config.Cgroups.Resources.Memory > 0 {
spec.Memory.Limit = uint64(config.Cgroups.Resources.Memory)
}
if config.Cgroups.Resources.MemorySwap > 0 {
spec.Memory.SwapLimit = uint64(config.Cgroups.Resources.MemorySwap)
}
// Get CPU info
spec.HasCpu = true
spec.Cpu.Limit = 1024
if config.Cgroups.Resources.CpuShares != 0 {
spec.Cpu.Limit = uint64(config.Cgroups.Resources.CpuShares)
}
spec.Cpu.Mask = utils.FixCpuMask(config.Cgroups.Resources.CpusetCpus, mi.NumCores)
}
spec.Cpu.Mask = utils.FixCpuMask(config.Cgroups.CpusetCpus, mi.NumCores)
spec.HasDiskIo = true
return spec
}
var (
hasNetworkModes = map[string]bool{
"host": true,
"bridge": true,
"default": true,
}
)
func hasNet(networkMode string) bool {
return hasNetworkModes[networkMode]
return !strings.HasPrefix(networkMode, "container:")
}
func (self *dockerContainerHandler) GetSpec() (info.ContainerSpec, error) {

View File

@ -292,31 +292,32 @@ func convertOldConfigToNew(config v1Config) *configs.Config {
result.Routes = config.Config.Routes
var newCgroup = &configs.Cgroup{
Name: old.Name,
Parent: old.Parent,
AllowAllDevices: old.AllowAllDevices,
AllowedDevices: old.AllowedDevices,
DeniedDevices: old.DeniedDevices,
Memory: old.Memory,
MemoryReservation: old.MemoryReservation,
MemorySwap: old.MemorySwap,
KernelMemory: old.KernelMemory,
CpuShares: old.CpuShares,
CpuQuota: old.CpuQuota,
CpuPeriod: old.CpuPeriod,
CpuRtRuntime: old.CpuRtRuntime,
CpuRtPeriod: old.CpuRtPeriod,
CpusetCpus: old.CpusetCpus,
CpusetMems: old.CpusetMems,
BlkioWeight: old.BlkioWeight,
BlkioLeafWeight: old.BlkioLeafWeight,
Freezer: old.Freezer,
HugetlbLimit: old.HugetlbLimit,
Slice: old.Slice,
OomKillDisable: old.OomKillDisable,
MemorySwappiness: old.MemorySwappiness,
NetPrioIfpriomap: old.NetPrioIfpriomap,
NetClsClassid: old.NetClsClassid,
Name: old.Name,
Parent: old.Parent,
Resources: &configs.Resources{
AllowAllDevices: old.Resources.AllowAllDevices,
AllowedDevices: old.Resources.AllowedDevices,
DeniedDevices: old.Resources.DeniedDevices,
Memory: old.Resources.Memory,
MemoryReservation: old.Resources.MemoryReservation,
MemorySwap: old.Resources.MemorySwap,
KernelMemory: old.Resources.KernelMemory,
CpuShares: old.Resources.CpuShares,
CpuQuota: old.Resources.CpuQuota,
CpuPeriod: old.Resources.CpuPeriod,
CpuRtRuntime: old.Resources.CpuRtRuntime,
CpuRtPeriod: old.Resources.CpuRtPeriod,
CpusetCpus: old.Resources.CpusetCpus,
CpusetMems: old.Resources.CpusetMems,
BlkioWeight: old.Resources.BlkioWeight,
BlkioLeafWeight: old.Resources.BlkioLeafWeight,
Freezer: old.Resources.Freezer,
HugetlbLimit: old.Resources.HugetlbLimit,
OomKillDisable: old.Resources.OomKillDisable,
MemorySwappiness: old.Resources.MemorySwappiness,
NetPrioIfpriomap: old.Resources.NetPrioIfpriomap,
NetClsClassid: old.Resources.NetClsClassid,
},
}
result.Cgroups = newCgroup

View File

@ -54,6 +54,8 @@ type RealFsInfo struct {
// Map from label to block device path.
// Labels are intent-specific tags that are auto-detected.
labels map[string]string
dmsetup dmsetupClient
}
type Context struct {
@ -67,9 +69,11 @@ func NewFsInfo(context Context) (FsInfo, error) {
if err != nil {
return nil, err
}
partitions := make(map[string]partition, 0)
fsInfo := &RealFsInfo{}
fsInfo.labels = make(map[string]string, 0)
fsInfo := &RealFsInfo{
partitions: make(map[string]partition, 0),
labels: make(map[string]string, 0),
dmsetup: &defaultDmsetupClient{},
}
supportedFsType := map[string]bool{
// all ext systems are checked through prefix.
"btrfs": true,
@ -82,49 +86,87 @@ func NewFsInfo(context Context) (FsInfo, error) {
continue
}
// Avoid bind mounts.
if _, ok := partitions[mount.Source]; ok {
if _, ok := fsInfo.partitions[mount.Source]; ok {
continue
}
if mount.Fstype == "zfs" {
Fstype = mount.Fstype
}
partitions[mount.Source] = partition{
fsInfo.partitions[mount.Source] = partition{
fsType: Fstype,
mountpoint: mount.Mountpoint,
major: uint(mount.Major),
minor: uint(mount.Minor),
}
}
if storageDriver, ok := context.DockerInfo["Driver"]; ok && storageDriver == "devicemapper" {
dev, major, minor, blockSize, err := dockerDMDevice(context.DockerInfo["DriverStatus"])
if err != nil {
glog.Warningf("Could not get Docker devicemapper device: %v", err)
} else {
partitions[dev] = partition{
fsType: "devicemapper",
major: major,
minor: minor,
blockSize: blockSize,
}
fsInfo.labels[LabelDockerImages] = dev
}
}
glog.Infof("Filesystem partitions: %+v", partitions)
fsInfo.partitions = partitions
fsInfo.addLabels(context)
// need to call this before the log line below printing out the partitions, as this function may
// add a "partition" for devicemapper to fsInfo.partitions
fsInfo.addDockerImagesLabel(context)
glog.Infof("Filesystem partitions: %+v", fsInfo.partitions)
fsInfo.addSystemRootLabel()
return fsInfo, nil
}
func (self *RealFsInfo) addLabels(context Context) {
dockerPaths := getDockerImagePaths(context)
// getDockerDeviceMapperInfo returns information about the devicemapper device and "partition" if
// docker is using devicemapper for its storage driver. If a loopback device is being used, don't
// return any information or error, as we want to report based on the actual partition where the
// loopback file resides, inside of the loopback file itself.
func (self *RealFsInfo) getDockerDeviceMapperInfo(dockerInfo map[string]string) (string, *partition, error) {
if storageDriver, ok := dockerInfo["Driver"]; ok && storageDriver != "devicemapper" {
return "", nil, nil
}
var driverStatus [][]string
if err := json.Unmarshal([]byte(dockerInfo["DriverStatus"]), &driverStatus); err != nil {
return "", nil, err
}
dataLoopFile := dockerStatusValue(driverStatus, "Data loop file")
if len(dataLoopFile) > 0 {
return "", nil, nil
}
dev, major, minor, blockSize, err := dockerDMDevice(driverStatus, self.dmsetup)
if err != nil {
return "", nil, err
}
return dev, &partition{
fsType: "devicemapper",
major: major,
minor: minor,
blockSize: blockSize,
}, nil
}
// addSystemRootLabel attempts to determine which device contains the mount for /.
func (self *RealFsInfo) addSystemRootLabel() {
for src, p := range self.partitions {
if p.mountpoint == "/" {
if _, ok := self.labels[LabelSystemRoot]; !ok {
self.labels[LabelSystemRoot] = src
}
}
self.updateDockerImagesPath(src, p.mountpoint, dockerPaths)
// TODO(rjnagal): Add label for docker devicemapper pool.
}
}
// addDockerImagesLabel attempts to determine which device contains the mount for docker images.
func (self *RealFsInfo) addDockerImagesLabel(context Context) {
dockerDev, dockerPartition, err := self.getDockerDeviceMapperInfo(context.DockerInfo)
if err != nil {
glog.Warningf("Could not get Docker devicemapper device: %v", err)
}
if len(dockerDev) > 0 && dockerPartition != nil {
self.partitions[dockerDev] = *dockerPartition
self.labels[LabelDockerImages] = dockerDev
} else {
dockerPaths := getDockerImagePaths(context)
for src, p := range self.partitions {
self.updateDockerImagesPath(src, p.mountpoint, dockerPaths)
}
}
}
@ -345,20 +387,30 @@ func dockerStatusValue(status [][]string, target string) string {
return ""
}
// dmsetupClient knows to to interact with dmsetup to retrieve information about devicemapper.
type dmsetupClient interface {
table(poolName string) ([]byte, error)
//TODO add status(poolName string) ([]byte, error) and use it in getDMStats so we can unit test
}
// defaultDmsetupClient implements the standard behavior for interacting with dmsetup.
type defaultDmsetupClient struct{}
var _ dmsetupClient = &defaultDmsetupClient{}
func (*defaultDmsetupClient) table(poolName string) ([]byte, error) {
return exec.Command("dmsetup", "table", poolName).Output()
}
// Devicemapper thin provisioning is detailed at
// https://www.kernel.org/doc/Documentation/device-mapper/thin-provisioning.txt
func dockerDMDevice(driverStatus string) (string, uint, uint, uint, error) {
var config [][]string
err := json.Unmarshal([]byte(driverStatus), &config)
if err != nil {
return "", 0, 0, 0, err
}
poolName := dockerStatusValue(config, "Pool Name")
func dockerDMDevice(driverStatus [][]string, dmsetup dmsetupClient) (string, uint, uint, uint, error) {
poolName := dockerStatusValue(driverStatus, "Pool Name")
if len(poolName) == 0 {
return "", 0, 0, 0, fmt.Errorf("Could not get dm pool name")
}
out, err := exec.Command("dmsetup", "table", poolName).Output()
out, err := dmsetup.table(poolName)
if err != nil {
return "", 0, 0, 0, err
}

View File

@ -136,7 +136,7 @@ type MachineInfo struct {
CpuFrequency uint64 `json:"cpu_frequency_khz"`
// The amount of memory (in bytes) in this machine
MemoryCapacity int64 `json:"memory_capacity"`
MemoryCapacity uint64 `json:"memory_capacity"`
// The machine id
MachineID string `json:"machine_id"`

View File

@ -41,7 +41,7 @@ type Attributes struct {
CpuFrequency uint64 `json:"cpu_frequency_khz"`
// The amount of memory (in bytes) in this machine
MemoryCapacity int64 `json:"memory_capacity"`
MemoryCapacity uint64 `json:"memory_capacity"`
// The machine id
MachineID string `json:"machine_id"`

View File

@ -82,8 +82,8 @@ func GetClockSpeed(procInfo []byte) (uint64, error) {
}
// GetMachineMemoryCapacity returns the machine's total memory from /proc/meminfo.
// Returns the total memory capacity as an int64 (number of bytes).
func GetMachineMemoryCapacity() (int64, error) {
// Returns the total memory capacity as an uint64 (number of bytes).
func GetMachineMemoryCapacity() (uint64, error) {
out, err := ioutil.ReadFile("/proc/meminfo")
if err != nil {
return 0, err
@ -97,8 +97,8 @@ func GetMachineMemoryCapacity() (int64, error) {
}
// GetMachineSwapCapacity returns the machine's total swap from /proc/meminfo.
// Returns the total swap capacity as an int64 (number of bytes).
func GetMachineSwapCapacity() (int64, error) {
// Returns the total swap capacity as an uint64 (number of bytes).
func GetMachineSwapCapacity() (uint64, error) {
out, err := ioutil.ReadFile("/proc/meminfo")
if err != nil {
return 0, err
@ -113,14 +113,14 @@ func GetMachineSwapCapacity() (int64, error) {
// parseCapacity matches a Regexp in a []byte, returning the resulting value in bytes.
// Assumes that the value matched by the Regexp is in KB.
func parseCapacity(b []byte, r *regexp.Regexp) (int64, error) {
func parseCapacity(b []byte, r *regexp.Regexp) (uint64, error) {
matches := r.FindSubmatch(b)
if len(matches) != 2 {
return -1, fmt.Errorf("failed to match regexp in output: %q", string(b))
return 0, fmt.Errorf("failed to match regexp in output: %q", string(b))
}
m, err := strconv.ParseInt(string(matches[1]), 10, 64)
m, err := strconv.ParseUint(string(matches[1]), 10, 64)
if err != nil {
return -1, err
return 0, err
}
// Convert to bytes.

View File

@ -1 +1 @@
0.20.4
0.20.5

View File

@ -10,80 +10,165 @@ host system and which is (optionally) isolated from other containers in the syst
#### Using libcontainer
To create a container you first have to initialize an instance of a factory
that will handle the creation and initialization for a container.
Because containers are spawned in a two step process you will need to provide
arguments to a binary that will be executed as the init process for the container.
To use the current binary that is spawning the containers and acting as the parent
you can use `os.Args[0]` and we have a command called `init` setup.
Because containers are spawned in a two step process you will need a binary that
will be executed as the init process for the container. In libcontainer, we use
the current binary (/proc/self/exe) to be executed as the init process, and use
arg "init", we call the first step process "bootstrap", so you always need a "init"
function as the entry of "bootstrap".
```go
root, err := libcontainer.New("/var/lib/container", libcontainer.InitArgs(os.Args[0], "init"))
func init() {
if len(os.Args) > 1 && os.Args[1] == "init" {
runtime.GOMAXPROCS(1)
runtime.LockOSThread()
factory, _ := libcontainer.New("")
if err := factory.StartInitialization(); err != nil {
logrus.Fatal(err)
}
panic("--this line should have never been executed, congratulations--")
}
}
```
Then to create a container you first have to initialize an instance of a factory
that will handle the creation and initialization for a container.
```go
factory, err := libcontainer.New("/var/lib/container", libcontainer.Cgroupfs, libcontainer.InitArgs(os.Args[0], "init"))
if err != nil {
log.Fatal(err)
logrus.Fatal(err)
return
}
```
Once you have an instance of the factory created we can create a configuration
struct describing how the container is to be created. A sample would look similar to this:
struct describing how the container is to be created. A sample would look similar to this:
```go
defaultMountFlags := syscall.MS_NOEXEC | syscall.MS_NOSUID | syscall.MS_NODEV
config := &configs.Config{
Rootfs: rootfs,
Capabilities: []string{
"CAP_CHOWN",
"CAP_DAC_OVERRIDE",
"CAP_FSETID",
"CAP_FOWNER",
"CAP_MKNOD",
"CAP_NET_RAW",
"CAP_SETGID",
"CAP_SETUID",
"CAP_SETFCAP",
"CAP_SETPCAP",
"CAP_NET_BIND_SERVICE",
"CAP_SYS_CHROOT",
"CAP_KILL",
"CAP_AUDIT_WRITE",
},
Namespaces: configs.Namespaces([]configs.Namespace{
{Type: configs.NEWNS},
{Type: configs.NEWUTS},
{Type: configs.NEWIPC},
{Type: configs.NEWPID},
{Type: configs.NEWNET},
}),
Cgroups: &configs.Cgroup{
Name: "test-container",
Parent: "system",
AllowAllDevices: false,
AllowedDevices: configs.DefaultAllowedDevices,
},
Devices: configs.DefaultAutoCreatedDevices,
Hostname: "testing",
Networks: []*configs.Network{
{
Type: "loopback",
Address: "127.0.0.1/0",
Gateway: "localhost",
},
},
Rlimits: []configs.Rlimit{
{
Type: syscall.RLIMIT_NOFILE,
Hard: uint64(1024),
Soft: uint64(1024),
},
},
Rootfs: "/your/path/to/rootfs",
Capabilities: []string{
"CAP_CHOWN",
"CAP_DAC_OVERRIDE",
"CAP_FSETID",
"CAP_FOWNER",
"CAP_MKNOD",
"CAP_NET_RAW",
"CAP_SETGID",
"CAP_SETUID",
"CAP_SETFCAP",
"CAP_SETPCAP",
"CAP_NET_BIND_SERVICE",
"CAP_SYS_CHROOT",
"CAP_KILL",
"CAP_AUDIT_WRITE",
},
Namespaces: configs.Namespaces([]configs.Namespace{
{Type: configs.NEWNS},
{Type: configs.NEWUTS},
{Type: configs.NEWIPC},
{Type: configs.NEWPID},
{Type: configs.NEWUSER},
{Type: configs.NEWNET},
}),
Cgroups: &configs.Cgroup{
Name: "test-container",
Parent: "system",
Resources: &configs.Resources{
MemorySwappiness: -1,
AllowAllDevices: false,
AllowedDevices: configs.DefaultAllowedDevices,
},
},
MaskPaths: []string{
"/proc/kcore",
},
ReadonlyPaths: []string{
"/proc/sys", "/proc/sysrq-trigger", "/proc/irq", "/proc/bus",
},
Devices: configs.DefaultAutoCreatedDevices,
Hostname: "testing",
Mounts: []*configs.Mount{
{
Source: "proc",
Destination: "/proc",
Device: "proc",
Flags: defaultMountFlags,
},
{
Source: "tmpfs",
Destination: "/dev",
Device: "tmpfs",
Flags: syscall.MS_NOSUID | syscall.MS_STRICTATIME,
Data: "mode=755",
},
{
Source: "devpts",
Destination: "/dev/pts",
Device: "devpts",
Flags: syscall.MS_NOSUID | syscall.MS_NOEXEC,
Data: "newinstance,ptmxmode=0666,mode=0620,gid=5",
},
{
Device: "tmpfs",
Source: "shm",
Destination: "/dev/shm",
Data: "mode=1777,size=65536k",
Flags: defaultMountFlags,
},
{
Source: "mqueue",
Destination: "/dev/mqueue",
Device: "mqueue",
Flags: defaultMountFlags,
},
{
Source: "sysfs",
Destination: "/sys",
Device: "sysfs",
Flags: defaultMountFlags | syscall.MS_RDONLY,
},
},
UidMappings: []configs.IDMap{
{
ContainerID: 0,
Host: 1000,
size: 65536,
},
},
GidMappings: []configs.IDMap{
{
ContainerID: 0,
Host: 1000,
size: 65536,
},
},
Networks: []*configs.Network{
{
Type: "loopback",
Address: "127.0.0.1/0",
Gateway: "localhost",
},
},
Rlimits: []configs.Rlimit{
{
Type: syscall.RLIMIT_NOFILE,
Hard: uint64(1025),
Soft: uint64(1025),
},
},
}
```
Once you have the configuration populated you can create a container:
```go
container, err := root.Create("container-id", config)
container, err := factory.Create("container-id", config)
if err != nil {
logrus.Fatal(err)
return
}
```
To spawn bash as the initial process inside the container and have the
@ -91,23 +176,25 @@ processes pid returned in order to wait, signal, or kill the process:
```go
process := &libcontainer.Process{
Args: []string{"/bin/bash"},
Env: []string{"PATH=/bin"},
User: "daemon",
Stdin: os.Stdin,
Stdout: os.Stdout,
Stderr: os.Stderr,
Args: []string{"/bin/bash"},
Env: []string{"PATH=/bin"},
User: "daemon",
Stdin: os.Stdin,
Stdout: os.Stdout,
Stderr: os.Stderr,
}
err := container.Start(process)
if err != nil {
log.Fatal(err)
logrus.Fatal(err)
container.Destroy()
return
}
// wait for the process to finish.
status, err := process.Wait()
_, err := process.Wait()
if err != nil {
log.Fatal(err)
logrus.Fatal(err)
}
// destroy the container.
@ -124,7 +211,6 @@ processes, err := container.Processes()
// it's processes.
stats, err := container.Stats()
// pause all processes inside the container.
container.Pause()

View File

@ -60,7 +60,7 @@ are required to be mounted within the rootfs that the runtime will setup.
After a container's filesystems are mounted within the newly created
mount namespace `/dev` will need to be populated with a set of device nodes.
It is expected that a rootfs does not need to have any device nodes specified
for `/dev` witin the rootfs as the container will setup the correct devices
for `/dev` within the rootfs as the container will setup the correct devices
that are required for executing a container's process.
| Path | Mode | Access |

View File

@ -2,10 +2,19 @@
package apparmor
import (
"errors"
)
var ErrApparmorNotEnabled = errors.New("apparmor: config provided but apparmor not supported")
func IsEnabled() bool {
return false
}
func ApplyProfile(name string) error {
if name != "" {
return ErrApparmorNotEnabled
}
return nil
}

View File

@ -15,6 +15,9 @@ type Manager interface {
// Returns the PIDs inside the cgroup set
GetPids() ([]int, error)
// Returns the PIDs inside the cgroup set & all sub-cgroups
GetAllPids() ([]int, error)
// Returns statistics for the cgroup set
GetStats() (*Stats, error)

View File

@ -23,6 +23,7 @@ var (
&MemoryGroup{},
&CpuGroup{},
&CpuacctGroup{},
&PidsGroup{},
&BlkioGroup{},
&HugetlbGroup{},
&NetClsGroup{},
@ -112,6 +113,22 @@ func (m *Manager) Apply(pid int) (err error) {
return err
}
if c.Paths != nil {
paths := make(map[string]string)
for name, path := range c.Paths {
_, err := d.path(name)
if err != nil {
if cgroups.IsNotFound(err) {
continue
}
return err
}
paths[name] = path
}
m.Paths = paths
return cgroups.EnterPid(m.Paths, pid)
}
paths := make(map[string]string)
defer func() {
if err != nil {
@ -135,17 +152,13 @@ func (m *Manager) Apply(pid int) (err error) {
paths[sys.Name()] = p
}
m.Paths = paths
if paths["cpu"] != "" {
if err := CheckCpushares(paths["cpu"], c.CpuShares); err != nil {
return err
}
}
return nil
}
func (m *Manager) Destroy() error {
if m.Cgroups.Paths != nil {
return nil
}
m.mu.Lock()
defer m.mu.Unlock()
if err := cgroups.RemovePaths(m.Paths); err != nil {
@ -179,15 +192,28 @@ func (m *Manager) GetStats() (*cgroups.Stats, error) {
}
func (m *Manager) Set(container *configs.Config) error {
for name, path := range m.Paths {
sys, err := subsystems.Get(name)
if err == errSubsystemDoesNotExist || !cgroups.PathExists(path) {
continue
for _, sys := range subsystems {
// Generate fake cgroup data.
d, err := getCgroupData(container.Cgroups, -1)
if err != nil {
return err
}
// Get the path, but don't error out if the cgroup wasn't found.
path, err := d.path(sys.Name())
if err != nil && !cgroups.IsNotFound(err) {
return err
}
if err := sys.Set(path, container.Cgroups); err != nil {
return err
}
}
if m.Paths["cpu"] != "" {
if err := CheckCpushares(m.Paths["cpu"], container.Cgroups.Resources.CpuShares); err != nil {
return err
}
}
return nil
}
@ -202,40 +228,78 @@ func (m *Manager) Freeze(state configs.FreezerState) error {
if err != nil {
return err
}
prevState := m.Cgroups.Freezer
m.Cgroups.Freezer = state
prevState := m.Cgroups.Resources.Freezer
m.Cgroups.Resources.Freezer = state
freezer, err := subsystems.Get("freezer")
if err != nil {
return err
}
err = freezer.Set(dir, m.Cgroups)
if err != nil {
m.Cgroups.Freezer = prevState
m.Cgroups.Resources.Freezer = prevState
return err
}
return nil
}
func (m *Manager) GetPids() ([]int, error) {
d, err := getCgroupData(m.Cgroups, 0)
dir, err := getCgroupPath(m.Cgroups)
if err != nil {
return nil, err
}
dir, err := d.path("devices")
if err != nil {
return nil, err
}
return cgroups.GetPids(dir)
}
func (m *Manager) GetAllPids() ([]int, error) {
dir, err := getCgroupPath(m.Cgroups)
if err != nil {
return nil, err
}
return cgroups.GetAllPids(dir)
}
func getCgroupPath(c *configs.Cgroup) (string, error) {
d, err := getCgroupData(c, 0)
if err != nil {
return "", err
}
return d.path("devices")
}
// pathClean makes a path safe for use with filepath.Join. This is done by not
// only cleaning the path, but also (if the path is relative) adding a leading
// '/' and cleaning it (then removing the leading '/'). This ensures that a
// path resulting from prepending another path will always resolve to lexically
// be a subdirectory of the prefixed path. This is all done lexically, so paths
// that include symlinks won't be safe as a result of using pathClean.
func pathClean(path string) string {
// Ensure that all paths are cleaned (especially problematic ones like
// "/../../../../../" which can cause lots of issues).
path = filepath.Clean(path)
// If the path isn't absolute, we need to do more processing to fix paths
// such as "../../../../<etc>/some/path". We also shouldn't convert absolute
// paths to relative ones.
if !filepath.IsAbs(path) {
path = filepath.Clean(string(os.PathSeparator) + path)
// This can't fail, as (by definition) all paths are relative to root.
path, _ = filepath.Rel(string(os.PathSeparator), path)
}
// Clean the path again for good measure.
return filepath.Clean(path)
}
func getCgroupData(c *configs.Cgroup, pid int) (*cgroupData, error) {
root, err := getCgroupRoot()
if err != nil {
return nil, err
}
// Clean the parent slice path.
c.Parent = pathClean(c.Parent)
return &cgroupData{
root: root,
parent: c.Parent,

View File

@ -22,31 +22,26 @@ func (s *BlkioGroup) Name() string {
}
func (s *BlkioGroup) Apply(d *cgroupData) error {
dir, err := d.join("blkio")
_, err := d.join("blkio")
if err != nil && !cgroups.IsNotFound(err) {
return err
}
if err := s.Set(dir, d.config); err != nil {
return err
}
return nil
}
func (s *BlkioGroup) Set(path string, cgroup *configs.Cgroup) error {
if cgroup.BlkioWeight != 0 {
if err := writeFile(path, "blkio.weight", strconv.FormatUint(uint64(cgroup.BlkioWeight), 10)); err != nil {
if cgroup.Resources.BlkioWeight != 0 {
if err := writeFile(path, "blkio.weight", strconv.FormatUint(uint64(cgroup.Resources.BlkioWeight), 10)); err != nil {
return err
}
}
if cgroup.BlkioLeafWeight != 0 {
if err := writeFile(path, "blkio.leaf_weight", strconv.FormatUint(uint64(cgroup.BlkioLeafWeight), 10)); err != nil {
if cgroup.Resources.BlkioLeafWeight != 0 {
if err := writeFile(path, "blkio.leaf_weight", strconv.FormatUint(uint64(cgroup.Resources.BlkioLeafWeight), 10)); err != nil {
return err
}
}
for _, wd := range cgroup.BlkioWeightDevice {
for _, wd := range cgroup.Resources.BlkioWeightDevice {
if err := writeFile(path, "blkio.weight_device", wd.WeightString()); err != nil {
return err
}
@ -54,22 +49,22 @@ func (s *BlkioGroup) Set(path string, cgroup *configs.Cgroup) error {
return err
}
}
for _, td := range cgroup.BlkioThrottleReadBpsDevice {
for _, td := range cgroup.Resources.BlkioThrottleReadBpsDevice {
if err := writeFile(path, "blkio.throttle.read_bps_device", td.String()); err != nil {
return err
}
}
for _, td := range cgroup.BlkioThrottleWriteBpsDevice {
for _, td := range cgroup.Resources.BlkioThrottleWriteBpsDevice {
if err := writeFile(path, "blkio.throttle.write_bps_device", td.String()); err != nil {
return err
}
}
for _, td := range cgroup.BlkioThrottleReadIOPSDevice {
for _, td := range cgroup.Resources.BlkioThrottleReadIOPSDevice {
if err := writeFile(path, "blkio.throttle.read_iops_device", td.String()); err != nil {
return err
}
}
for _, td := range cgroup.BlkioThrottleWriteIOPSDevice {
for _, td := range cgroup.Resources.BlkioThrottleWriteIOPSDevice {
if err := writeFile(path, "blkio.throttle.write_iops_device", td.String()); err != nil {
return err
}

View File

@ -22,41 +22,36 @@ func (s *CpuGroup) Name() string {
func (s *CpuGroup) Apply(d *cgroupData) error {
// We always want to join the cpu group, to allow fair cpu scheduling
// on a container basis
dir, err := d.join("cpu")
_, err := d.join("cpu")
if err != nil && !cgroups.IsNotFound(err) {
return err
}
if err := s.Set(dir, d.config); err != nil {
return err
}
return nil
}
func (s *CpuGroup) Set(path string, cgroup *configs.Cgroup) error {
if cgroup.CpuShares != 0 {
if err := writeFile(path, "cpu.shares", strconv.FormatInt(cgroup.CpuShares, 10)); err != nil {
if cgroup.Resources.CpuShares != 0 {
if err := writeFile(path, "cpu.shares", strconv.FormatInt(cgroup.Resources.CpuShares, 10)); err != nil {
return err
}
}
if cgroup.CpuPeriod != 0 {
if err := writeFile(path, "cpu.cfs_period_us", strconv.FormatInt(cgroup.CpuPeriod, 10)); err != nil {
if cgroup.Resources.CpuPeriod != 0 {
if err := writeFile(path, "cpu.cfs_period_us", strconv.FormatInt(cgroup.Resources.CpuPeriod, 10)); err != nil {
return err
}
}
if cgroup.CpuQuota != 0 {
if err := writeFile(path, "cpu.cfs_quota_us", strconv.FormatInt(cgroup.CpuQuota, 10)); err != nil {
if cgroup.Resources.CpuQuota != 0 {
if err := writeFile(path, "cpu.cfs_quota_us", strconv.FormatInt(cgroup.Resources.CpuQuota, 10)); err != nil {
return err
}
}
if cgroup.CpuRtPeriod != 0 {
if err := writeFile(path, "cpu.rt_period_us", strconv.FormatInt(cgroup.CpuRtPeriod, 10)); err != nil {
if cgroup.Resources.CpuRtPeriod != 0 {
if err := writeFile(path, "cpu.rt_period_us", strconv.FormatInt(cgroup.Resources.CpuRtPeriod, 10)); err != nil {
return err
}
}
if cgroup.CpuRtRuntime != 0 {
if err := writeFile(path, "cpu.rt_runtime_us", strconv.FormatInt(cgroup.CpuRtRuntime, 10)); err != nil {
if cgroup.Resources.CpuRtRuntime != 0 {
if err := writeFile(path, "cpu.rt_runtime_us", strconv.FormatInt(cgroup.Resources.CpuRtRuntime, 10)); err != nil {
return err
}
}

View File

@ -4,6 +4,7 @@ package fs
import (
"bytes"
"fmt"
"io/ioutil"
"os"
"path/filepath"
@ -29,13 +30,13 @@ func (s *CpusetGroup) Apply(d *cgroupData) error {
}
func (s *CpusetGroup) Set(path string, cgroup *configs.Cgroup) error {
if cgroup.CpusetCpus != "" {
if err := writeFile(path, "cpuset.cpus", cgroup.CpusetCpus); err != nil {
if cgroup.Resources.CpusetCpus != "" {
if err := writeFile(path, "cpuset.cpus", cgroup.Resources.CpusetCpus); err != nil {
return err
}
}
if cgroup.CpusetMems != "" {
if err := writeFile(path, "cpuset.mems", cgroup.CpusetMems); err != nil {
if cgroup.Resources.CpusetMems != "" {
if err := writeFile(path, "cpuset.mems", cgroup.Resources.CpusetMems); err != nil {
return err
}
}
@ -63,11 +64,6 @@ func (s *CpusetGroup) ApplyDir(dir string, cgroup *configs.Cgroup, pid int) erro
if err := s.ensureParent(dir, root); err != nil {
return err
}
// the default values inherit from parent cgroup are already set in
// s.ensureParent, cover these if we have our own
if err := s.Set(dir, cgroup); err != nil {
return err
}
// because we are not using d.join we need to place the pid into the procs file
// unlike the other subsystems
if err := writeFile(dir, "cgroup.procs", strconv.Itoa(pid)); err != nil {
@ -95,6 +91,10 @@ func (s *CpusetGroup) ensureParent(current, root string) error {
if filepath.Clean(parent) == root {
return nil
}
// Avoid infinite recursion.
if parent == current {
return fmt.Errorf("cpuset: cgroup parent path outside cgroup root")
}
if err := s.ensureParent(parent, root); err != nil {
return err
}

View File

@ -15,27 +15,22 @@ func (s *DevicesGroup) Name() string {
}
func (s *DevicesGroup) Apply(d *cgroupData) error {
dir, err := d.join("devices")
_, err := d.join("devices")
if err != nil {
// We will return error even it's `not found` error, devices
// cgroup is hard requirement for container's security.
return err
}
if err := s.Set(dir, d.config); err != nil {
return err
}
return nil
}
func (s *DevicesGroup) Set(path string, cgroup *configs.Cgroup) error {
if !cgroup.AllowAllDevices {
if !cgroup.Resources.AllowAllDevices {
if err := writeFile(path, "devices.deny", "a"); err != nil {
return err
}
for _, dev := range cgroup.AllowedDevices {
for _, dev := range cgroup.Resources.AllowedDevices {
if err := writeFile(path, "devices.allow", dev.CgroupString()); err != nil {
return err
}
@ -47,7 +42,7 @@ func (s *DevicesGroup) Set(path string, cgroup *configs.Cgroup) error {
return err
}
for _, dev := range cgroup.DeniedDevices {
for _, dev := range cgroup.Resources.DeniedDevices {
if err := writeFile(path, "devices.deny", dev.CgroupString()); err != nil {
return err
}

View File

@ -19,22 +19,17 @@ func (s *FreezerGroup) Name() string {
}
func (s *FreezerGroup) Apply(d *cgroupData) error {
dir, err := d.join("freezer")
_, err := d.join("freezer")
if err != nil && !cgroups.IsNotFound(err) {
return err
}
if err := s.Set(dir, d.config); err != nil {
return err
}
return nil
}
func (s *FreezerGroup) Set(path string, cgroup *configs.Cgroup) error {
switch cgroup.Freezer {
switch cgroup.Resources.Freezer {
case configs.Frozen, configs.Thawed:
if err := writeFile(path, "freezer.state", string(cgroup.Freezer)); err != nil {
if err := writeFile(path, "freezer.state", string(cgroup.Resources.Freezer)); err != nil {
return err
}
@ -43,7 +38,7 @@ func (s *FreezerGroup) Set(path string, cgroup *configs.Cgroup) error {
if err != nil {
return err
}
if strings.TrimSpace(state) == string(cgroup.Freezer) {
if strings.TrimSpace(state) == string(cgroup.Resources.Freezer) {
break
}
time.Sleep(1 * time.Millisecond)
@ -51,7 +46,7 @@ func (s *FreezerGroup) Set(path string, cgroup *configs.Cgroup) error {
case configs.Undefined:
return nil
default:
return fmt.Errorf("Invalid argument '%s' to freezer.state", string(cgroup.Freezer))
return fmt.Errorf("Invalid argument '%s' to freezer.state", string(cgroup.Resources.Freezer))
}
return nil

View File

@ -19,20 +19,15 @@ func (s *HugetlbGroup) Name() string {
}
func (s *HugetlbGroup) Apply(d *cgroupData) error {
dir, err := d.join("hugetlb")
_, err := d.join("hugetlb")
if err != nil && !cgroups.IsNotFound(err) {
return err
}
if err := s.Set(dir, d.config); err != nil {
return err
}
return nil
}
func (s *HugetlbGroup) Set(path string, cgroup *configs.Cgroup) error {
for _, hugetlb := range cgroup.HugetlbLimit {
for _, hugetlb := range cgroup.Resources.HugetlbLimit {
if err := writeFile(path, strings.Join([]string{"hugetlb", hugetlb.Pagesize, "limit_in_bytes"}, "."), strconv.FormatUint(hugetlb.Limit, 10)); err != nil {
return err
}

View File

@ -32,8 +32,9 @@ func (s *MemoryGroup) Apply(d *cgroupData) (err error) {
return err
}
}
if err := s.Set(path, d.config); err != nil {
// We have to set kernel memory here, as we can't change it once
// processes have been attached.
if err := s.SetKernelMemory(path, d.config); err != nil {
return err
}
}
@ -50,45 +51,49 @@ func (s *MemoryGroup) Apply(d *cgroupData) (err error) {
if err != nil && !cgroups.IsNotFound(err) {
return err
}
return nil
}
func (s *MemoryGroup) SetKernelMemory(path string, cgroup *configs.Cgroup) error {
// This has to be done separately because it has special constraints (it
// can't be done after there are processes attached to the cgroup).
if cgroup.Resources.KernelMemory > 0 {
if err := writeFile(path, "memory.kmem.limit_in_bytes", strconv.FormatInt(cgroup.Resources.KernelMemory, 10)); err != nil {
return err
}
}
return nil
}
func (s *MemoryGroup) Set(path string, cgroup *configs.Cgroup) error {
if cgroup.Memory != 0 {
if err := writeFile(path, "memory.limit_in_bytes", strconv.FormatInt(cgroup.Memory, 10)); err != nil {
if cgroup.Resources.Memory != 0 {
if err := writeFile(path, "memory.limit_in_bytes", strconv.FormatInt(cgroup.Resources.Memory, 10)); err != nil {
return err
}
}
if cgroup.MemoryReservation != 0 {
if err := writeFile(path, "memory.soft_limit_in_bytes", strconv.FormatInt(cgroup.MemoryReservation, 10)); err != nil {
if cgroup.Resources.MemoryReservation != 0 {
if err := writeFile(path, "memory.soft_limit_in_bytes", strconv.FormatInt(cgroup.Resources.MemoryReservation, 10)); err != nil {
return err
}
}
if cgroup.MemorySwap > 0 {
if err := writeFile(path, "memory.memsw.limit_in_bytes", strconv.FormatInt(cgroup.MemorySwap, 10)); err != nil {
if cgroup.Resources.MemorySwap > 0 {
if err := writeFile(path, "memory.memsw.limit_in_bytes", strconv.FormatInt(cgroup.Resources.MemorySwap, 10)); err != nil {
return err
}
}
if cgroup.KernelMemory > 0 {
if err := writeFile(path, "memory.kmem.limit_in_bytes", strconv.FormatInt(cgroup.KernelMemory, 10)); err != nil {
return err
}
}
if cgroup.OomKillDisable {
if cgroup.Resources.OomKillDisable {
if err := writeFile(path, "memory.oom_control", "1"); err != nil {
return err
}
}
if cgroup.MemorySwappiness >= 0 && cgroup.MemorySwappiness <= 100 {
if err := writeFile(path, "memory.swappiness", strconv.FormatInt(cgroup.MemorySwappiness, 10)); err != nil {
if cgroup.Resources.MemorySwappiness >= 0 && cgroup.Resources.MemorySwappiness <= 100 {
if err := writeFile(path, "memory.swappiness", strconv.FormatInt(cgroup.Resources.MemorySwappiness, 10)); err != nil {
return err
}
} else if cgroup.MemorySwappiness == -1 {
} else if cgroup.Resources.MemorySwappiness == -1 {
return nil
} else {
return fmt.Errorf("invalid value:%d. valid memory swappiness range is 0-100", cgroup.MemorySwappiness)
return fmt.Errorf("invalid value:%d. valid memory swappiness range is 0-100", cgroup.Resources.MemorySwappiness)
}
return nil
@ -139,12 +144,12 @@ func (s *MemoryGroup) GetStats(path string, stats *cgroups.Stats) error {
}
func memoryAssigned(cgroup *configs.Cgroup) bool {
return cgroup.Memory != 0 ||
cgroup.MemoryReservation != 0 ||
cgroup.MemorySwap > 0 ||
cgroup.KernelMemory > 0 ||
cgroup.OomKillDisable ||
cgroup.MemorySwappiness != -1
return cgroup.Resources.Memory != 0 ||
cgroup.Resources.MemoryReservation != 0 ||
cgroup.Resources.MemorySwap > 0 ||
cgroup.Resources.KernelMemory > 0 ||
cgroup.Resources.OomKillDisable ||
cgroup.Resources.MemorySwappiness != -1
}
func getMemoryData(path, name string) (cgroups.MemoryData, error) {

View File

@ -15,21 +15,16 @@ func (s *NetClsGroup) Name() string {
}
func (s *NetClsGroup) Apply(d *cgroupData) error {
dir, err := d.join("net_cls")
_, err := d.join("net_cls")
if err != nil && !cgroups.IsNotFound(err) {
return err
}
if err := s.Set(dir, d.config); err != nil {
return err
}
return nil
}
func (s *NetClsGroup) Set(path string, cgroup *configs.Cgroup) error {
if cgroup.NetClsClassid != "" {
if err := writeFile(path, "net_cls.classid", cgroup.NetClsClassid); err != nil {
if cgroup.Resources.NetClsClassid != "" {
if err := writeFile(path, "net_cls.classid", cgroup.Resources.NetClsClassid); err != nil {
return err
}
}

View File

@ -15,20 +15,15 @@ func (s *NetPrioGroup) Name() string {
}
func (s *NetPrioGroup) Apply(d *cgroupData) error {
dir, err := d.join("net_prio")
_, err := d.join("net_prio")
if err != nil && !cgroups.IsNotFound(err) {
return err
}
if err := s.Set(dir, d.config); err != nil {
return err
}
return nil
}
func (s *NetPrioGroup) Set(path string, cgroup *configs.Cgroup) error {
for _, prioMap := range cgroup.NetPrioIfpriomap {
for _, prioMap := range cgroup.Resources.NetPrioIfpriomap {
if err := writeFile(path, "net_prio.ifpriomap", prioMap.CgroupString()); err != nil {
return err
}

View File

@ -0,0 +1,57 @@
// +build linux
package fs
import (
"fmt"
"strconv"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
)
type PidsGroup struct {
}
func (s *PidsGroup) Name() string {
return "pids"
}
func (s *PidsGroup) Apply(d *cgroupData) error {
_, err := d.join("pids")
if err != nil && !cgroups.IsNotFound(err) {
return err
}
return nil
}
func (s *PidsGroup) Set(path string, cgroup *configs.Cgroup) error {
if cgroup.Resources.PidsLimit != 0 {
// "max" is the fallback value.
limit := "max"
if cgroup.Resources.PidsLimit > 0 {
limit = strconv.FormatInt(cgroup.Resources.PidsLimit, 10)
}
if err := writeFile(path, "pids.max", limit); err != nil {
return err
}
}
return nil
}
func (s *PidsGroup) Remove(d *cgroupData) error {
return removePath(d.path("pids"))
}
func (s *PidsGroup) GetStats(path string, stats *cgroups.Stats) error {
value, err := getCgroupParamUint(path, "pids.current")
if err != nil {
return fmt.Errorf("failed to parse pids.current - %s", err)
}
stats.PidsStats.Current = value
return nil
}

View File

@ -49,6 +49,11 @@ type MemoryStats struct {
Stats map[string]uint64 `json:"stats,omitempty"`
}
type PidsStats struct {
// number of pids in the cgroup
Current uint64 `json:"current,omitempty"`
}
type BlkioStatEntry struct {
Major uint64 `json:"major,omitempty"`
Minor uint64 `json:"minor,omitempty"`
@ -80,6 +85,7 @@ type HugetlbStats struct {
type Stats struct {
CpuStats CpuStats `json:"cpu_stats,omitempty"`
MemoryStats MemoryStats `json:"memory_stats,omitempty"`
PidsStats PidsStats `json:"pids_stats,omitempty"`
BlkioStats BlkioStats `json:"blkio_stats,omitempty"`
// the map is in the format "size of hugepage: stats of the hugepage"
HugetlbStats map[string]HugetlbStats `json:"hugetlb_stats,omitempty"`

View File

@ -26,6 +26,10 @@ func (m *Manager) GetPids() ([]int, error) {
return nil, fmt.Errorf("Systemd not supported")
}
func (m *Manager) GetAllPids() ([]int, error) {
return nil, fmt.Errorf("Systemd not supported")
}
func (m *Manager) Destroy() error {
return fmt.Errorf("Systemd not supported")
}

View File

@ -55,6 +55,7 @@ var subsystems = subsystemSet{
&fs.MemoryGroup{},
&fs.CpuGroup{},
&fs.CpuacctGroup{},
&fs.PidsGroup{},
&fs.BlkioGroup{},
&fs.HugetlbGroup{},
&fs.PerfEventGroup{},
@ -167,8 +168,25 @@ func (m *Manager) Apply(pid int) error {
properties []systemdDbus.Property
)
if c.Slice != "" {
slice = c.Slice
if c.Paths != nil {
paths := make(map[string]string)
for name, path := range c.Paths {
_, err := getSubsystemPath(m.Cgroups, name)
if err != nil {
// Don't fail if a cgroup hierarchy was not found, just skip this subsystem
if cgroups.IsNotFound(err) {
continue
}
return err
}
paths[name] = path
}
m.Paths = paths
return cgroups.EnterPid(m.Paths, pid)
}
if c.Parent != "" {
slice = c.Parent
}
properties = append(properties,
@ -189,26 +207,26 @@ func (m *Manager) Apply(pid int) error {
newProp("DefaultDependencies", false))
}
if c.Memory != 0 {
if c.Resources.Memory != 0 {
properties = append(properties,
newProp("MemoryLimit", uint64(c.Memory)))
newProp("MemoryLimit", uint64(c.Resources.Memory)))
}
if c.CpuShares != 0 {
if c.Resources.CpuShares != 0 {
properties = append(properties,
newProp("CPUShares", uint64(c.CpuShares)))
newProp("CPUShares", uint64(c.Resources.CpuShares)))
}
if c.BlkioWeight != 0 {
if c.Resources.BlkioWeight != 0 {
properties = append(properties,
newProp("BlockIOWeight", uint64(c.BlkioWeight)))
newProp("BlockIOWeight", uint64(c.Resources.BlkioWeight)))
}
// We need to set kernel memory before processes join cgroup because
// kmem.limit_in_bytes can only be set when the cgroup is empty.
// And swap memory limit needs to be set after memory limit, only
// memory limit is handled by systemd, so it's kind of ugly here.
if c.KernelMemory > 0 {
if c.Resources.KernelMemory > 0 {
if err := setKernelMemory(c); err != nil {
return err
}
@ -233,7 +251,7 @@ func (m *Manager) Apply(pid int) error {
return err
}
// we need to manually join the freezer, net_cls, net_prio and cpuset cgroup in systemd
// we need to manually join the freezer, net_cls, net_prio, pids and cpuset cgroup in systemd
// because it does not currently support it via the dbus api.
if err := joinFreezer(c, pid); err != nil {
return err
@ -246,6 +264,10 @@ func (m *Manager) Apply(pid int) error {
return err
}
if err := joinPids(c, pid); err != nil {
return err
}
if err := joinCpuset(c, pid); err != nil {
return err
}
@ -277,17 +299,13 @@ func (m *Manager) Apply(pid int) error {
paths[s.Name()] = subsystemPath
}
m.Paths = paths
if paths["cpu"] != "" {
if err := fs.CheckCpushares(paths["cpu"], c.CpuShares); err != nil {
return err
}
}
return nil
}
func (m *Manager) Destroy() error {
if m.Cgroups.Paths != nil {
return nil
}
m.mu.Lock()
defer m.mu.Unlock()
theConn.StopUnit(getUnitName(m.Cgroups), "replace", nil)
@ -330,68 +348,65 @@ func join(c *configs.Cgroup, subsystem string, pid int) (string, error) {
}
func joinCpu(c *configs.Cgroup, pid int) error {
path, err := getSubsystemPath(c, "cpu")
_, err := join(c, "cpu", pid)
if err != nil && !cgroups.IsNotFound(err) {
return err
}
if c.CpuQuota != 0 {
if err = writeFile(path, "cpu.cfs_quota_us", strconv.FormatInt(c.CpuQuota, 10)); err != nil {
return err
}
}
if c.CpuPeriod != 0 {
if err = writeFile(path, "cpu.cfs_period_us", strconv.FormatInt(c.CpuPeriod, 10)); err != nil {
return err
}
}
if c.CpuRtPeriod != 0 {
if err = writeFile(path, "cpu.rt_period_us", strconv.FormatInt(c.CpuRtPeriod, 10)); err != nil {
return err
}
}
if c.CpuRtRuntime != 0 {
if err = writeFile(path, "cpu.rt_runtime_us", strconv.FormatInt(c.CpuRtRuntime, 10)); err != nil {
return err
}
}
return nil
}
func joinFreezer(c *configs.Cgroup, pid int) error {
path, err := join(c, "freezer", pid)
_, err := join(c, "freezer", pid)
if err != nil && !cgroups.IsNotFound(err) {
return err
}
freezer, err := subsystems.Get("freezer")
if err != nil {
return err
}
return freezer.Set(path, c)
return nil
}
func joinNetPrio(c *configs.Cgroup, pid int) error {
path, err := join(c, "net_prio", pid)
_, err := join(c, "net_prio", pid)
if err != nil && !cgroups.IsNotFound(err) {
return err
}
netPrio, err := subsystems.Get("net_prio")
if err != nil {
return err
}
return netPrio.Set(path, c)
return nil
}
func joinNetCls(c *configs.Cgroup, pid int) error {
path, err := join(c, "net_cls", pid)
_, err := join(c, "net_cls", pid)
if err != nil && !cgroups.IsNotFound(err) {
return err
}
netcls, err := subsystems.Get("net_cls")
if err != nil {
return nil
}
func joinPids(c *configs.Cgroup, pid int) error {
_, err := join(c, "pids", pid)
if err != nil && !cgroups.IsNotFound(err) {
return err
}
return netcls.Set(path, c)
return nil
}
// systemd represents slice heirarchy using `-`, so we need to follow suit when
// generating the path of slice. Essentially, test-a-b.slice becomes
// test.slice/test-a.slice/test-a-b.slice.
func expandSlice(slice string) (string, error) {
suffix := ".slice"
sliceName := strings.TrimSuffix(slice, suffix)
var path, prefix string
for _, component := range strings.Split(sliceName, "-") {
// test--a.slice isn't permitted, nor is -test.slice.
if component == "" {
return "", fmt.Errorf("invalid slice name: %s", slice)
}
// Append the component to the path and to the prefix.
path += prefix + component + suffix + "/"
prefix += component + "-"
}
return path, nil
}
func getSubsystemPath(c *configs.Cgroup, subsystem string) (string, error) {
@ -406,8 +421,13 @@ func getSubsystemPath(c *configs.Cgroup, subsystem string) (string, error) {
}
slice := "system.slice"
if c.Slice != "" {
slice = c.Slice
if c.Parent != "" {
slice = c.Parent
}
slice, err = expandSlice(slice)
if err != nil {
return "", err
}
return filepath.Join(mountpoint, initPath, slice, getUnitName(c)), nil
@ -418,15 +438,15 @@ func (m *Manager) Freeze(state configs.FreezerState) error {
if err != nil {
return err
}
prevState := m.Cgroups.Freezer
m.Cgroups.Freezer = state
prevState := m.Cgroups.Resources.Freezer
m.Cgroups.Resources.Freezer = state
freezer, err := subsystems.Get("freezer")
if err != nil {
return err
}
err = freezer.Set(path, m.Cgroups)
if err != nil {
m.Cgroups.Freezer = prevState
m.Cgroups.Resources.Freezer = prevState
return err
}
return nil
@ -440,6 +460,14 @@ func (m *Manager) GetPids() ([]int, error) {
return cgroups.GetPids(path)
}
func (m *Manager) GetAllPids() ([]int, error) {
path, err := getSubsystemPath(m.Cgroups, "devices")
if err != nil {
return nil, err
}
return cgroups.GetAllPids(path)
}
func (m *Manager) GetStats() (*cgroups.Stats, error) {
m.mu.Lock()
defer m.mu.Unlock()
@ -458,21 +486,28 @@ func (m *Manager) GetStats() (*cgroups.Stats, error) {
}
func (m *Manager) Set(container *configs.Config) error {
for name, path := range m.Paths {
sys, err := subsystems.Get(name)
if err == errSubsystemDoesNotExist || !cgroups.PathExists(path) {
continue
for _, sys := range subsystems {
// Get the subsystem path, but don't error out for not found cgroups.
path, err := getSubsystemPath(container.Cgroups, sys.Name())
if err != nil && !cgroups.IsNotFound(err) {
return err
}
if err := sys.Set(path, container.Cgroups); err != nil {
return err
}
}
if m.Paths["cpu"] != "" {
if err := fs.CheckCpushares(m.Paths["cpu"], container.Cgroups.Resources.CpuShares); err != nil {
return err
}
}
return nil
}
func getUnitName(c *configs.Cgroup) string {
return fmt.Sprintf("%s-%s.scope", c.Parent, c.Name)
return fmt.Sprintf("%s-%s.scope", c.ScopePrefix, c.Name)
}
// Atm we can't use the systemd device support because of two missing things:
@ -487,17 +522,13 @@ func getUnitName(c *configs.Cgroup) string {
// because systemd will re-write the device settings if it needs to re-apply the cgroup context.
// This happens at least for v208 when any sibling unit is started.
func joinDevices(c *configs.Cgroup, pid int) error {
path, err := join(c, "devices", pid)
_, err := join(c, "devices", pid)
// Even if it's `not found` error, we'll return err because devices cgroup
// is hard requirement for container security.
if err != nil {
return err
}
devices, err := subsystems.Get("devices")
if err != nil {
return err
}
return devices.Set(path, c)
return nil
}
func setKernelMemory(c *configs.Cgroup) error {
@ -510,52 +541,16 @@ func setKernelMemory(c *configs.Cgroup) error {
return err
}
if c.KernelMemory > 0 {
err = writeFile(path, "memory.kmem.limit_in_bytes", strconv.FormatInt(c.KernelMemory, 10))
if err != nil {
return err
}
}
return nil
// This doesn't get called by manager.Set, so we need to do it here.
s := &fs.MemoryGroup{}
return s.SetKernelMemory(path, c)
}
func joinMemory(c *configs.Cgroup, pid int) error {
path, err := getSubsystemPath(c, "memory")
_, err := join(c, "memory", pid)
if err != nil && !cgroups.IsNotFound(err) {
return err
}
// -1 disables memoryswap
if c.MemorySwap > 0 {
err = writeFile(path, "memory.memsw.limit_in_bytes", strconv.FormatInt(c.MemorySwap, 10))
if err != nil {
return err
}
}
if c.MemoryReservation > 0 {
err = writeFile(path, "memory.soft_limit_in_bytes", strconv.FormatInt(c.MemoryReservation, 10))
if err != nil {
return err
}
}
if c.OomKillDisable {
if err := writeFile(path, "memory.oom_control", "1"); err != nil {
return err
}
}
if c.MemorySwappiness >= 0 && c.MemorySwappiness <= 100 {
err = writeFile(path, "memory.swappiness", strconv.FormatInt(c.MemorySwappiness, 10))
if err != nil {
return err
}
} else if c.MemorySwappiness == -1 {
return nil
} else {
return fmt.Errorf("invalid value:%d. valid memory swappiness range is 0-100", c.MemorySwappiness)
}
return nil
}
@ -577,68 +572,25 @@ func joinCpuset(c *configs.Cgroup, pid int) error {
// expects device path instead of major minor numbers, which is also confusing
// for users. So we use fs work around for now.
func joinBlkio(c *configs.Cgroup, pid int) error {
path, err := getSubsystemPath(c, "blkio")
_, err := join(c, "blkio", pid)
if err != nil {
return err
}
// systemd doesn't directly support this in the dbus properties
if c.BlkioLeafWeight != 0 {
if err := writeFile(path, "blkio.leaf_weight", strconv.FormatUint(uint64(c.BlkioLeafWeight), 10)); err != nil {
return err
}
}
for _, wd := range c.BlkioWeightDevice {
if err := writeFile(path, "blkio.weight_device", wd.WeightString()); err != nil {
return err
}
if err := writeFile(path, "blkio.leaf_weight_device", wd.LeafWeightString()); err != nil {
return err
}
}
for _, td := range c.BlkioThrottleReadBpsDevice {
if err := writeFile(path, "blkio.throttle.read_bps_device", td.String()); err != nil {
return err
}
}
for _, td := range c.BlkioThrottleWriteBpsDevice {
if err := writeFile(path, "blkio.throttle.write_bps_device", td.String()); err != nil {
return err
}
}
for _, td := range c.BlkioThrottleReadIOPSDevice {
if err := writeFile(path, "blkio.throttle.read_iops_device", td.String()); err != nil {
return err
}
}
for _, td := range c.BlkioThrottleWriteIOPSDevice {
if err := writeFile(path, "blkio.throttle.write_iops_device", td.String()); err != nil {
return err
}
}
return nil
}
func joinHugetlb(c *configs.Cgroup, pid int) error {
path, err := join(c, "hugetlb", pid)
_, err := join(c, "hugetlb", pid)
if err != nil && !cgroups.IsNotFound(err) {
return err
}
hugetlb, err := subsystems.Get("hugetlb")
if err != nil {
return err
}
return hugetlb.Set(path, c)
return nil
}
func joinPerfEvent(c *configs.Cgroup, pid int) error {
path, err := join(c, "perf_event", pid)
_, err := join(c, "perf_event", pid)
if err != nil && !cgroups.IsNotFound(err) {
return err
}
perfEvent, err := subsystems.Get("perf_event")
if err != nil {
return err
}
return perfEvent.Set(path, c)
return nil
}

View File

@ -13,7 +13,7 @@ import (
"time"
"github.com/docker/docker/pkg/mount"
"github.com/docker/docker/pkg/units"
"github.com/docker/go-units"
)
const cgroupNamePrefix = "name="
@ -84,10 +84,19 @@ func FindCgroupMountpointDir() (string, error) {
// Safe as mountinfo encodes mountpoints with spaces as \040.
index := strings.Index(text, " - ")
postSeparatorFields := strings.Fields(text[index+3:])
if len(postSeparatorFields) < 3 {
return "", fmt.Errorf("Error found less than 3 fields post '-' in %q", text)
numPostFields := len(postSeparatorFields)
// This is an error as we can't detect if the mount is for "cgroup"
if numPostFields == 0 {
return "", fmt.Errorf("Found no fields post '-' in %q", text)
}
if postSeparatorFields[0] == "cgroup" {
// Check that the mount is properly formated.
if numPostFields < 3 {
return "", fmt.Errorf("Error found less than 3 fields post '-' in %q", text)
}
return filepath.Dir(fields[4]), nil
}
}
@ -323,9 +332,14 @@ func GetHugePageSize() ([]string, error) {
return pageSizes, nil
}
// GetPids returns all pids, that were added to cgroup at path and to all its
// subcgroups.
// GetPids returns all pids, that were added to cgroup at path.
func GetPids(path string) ([]int, error) {
return readProcsFile(path)
}
// GetAllPids returns all pids, that were added to cgroup at path and to all its
// subcgroups.
func GetAllPids(path string) ([]int, error) {
var pids []int
// collect pids from all sub-cgroups
err := filepath.Walk(path, func(p string, info os.FileInfo, iErr error) error {

View File

@ -16,6 +16,17 @@ type Cgroup struct {
// name of parent cgroup or slice
Parent string `json:"parent"`
// ScopePrefix decribes prefix for the scope name
ScopePrefix string `json:"scope_prefix"`
// Paths represent the cgroups paths to join
Paths map[string]string
// Resources contains various cgroups settings to apply
*Resources
}
type Resources struct {
// If this is true allow access to any kind of device within the container. If false, allow access only to devices explicitly listed in the allowed_devices list.
AllowAllDevices bool `json:"allow_all_devices"`
@ -29,7 +40,7 @@ type Cgroup struct {
// Memory reservation or soft_limit (in bytes)
MemoryReservation int64 `json:"memory_reservation"`
// Total memory usage (memory + swap); set `-1' to disable swap
// Total memory usage (memory + swap); set `-1` to enable unlimited swap
MemorySwap int64 `json:"memory_swap"`
// Kernel memory limit (in bytes)
@ -56,6 +67,9 @@ type Cgroup struct {
// MEM to use
CpusetMems string `json:"cpuset_mems"`
// Process limit; set <= `0' to disable limit.
PidsLimit int64 `json:"pids_limit"`
// Specifies per cgroup weight, range is from 10 to 1000.
BlkioWeight uint16 `json:"blkio_weight"`
@ -83,9 +97,6 @@ type Cgroup struct {
// Hugetlb limit (in bytes)
HugetlbLimit []*HugepageLimit `json:"hugetlb_limit"`
// Parent slice to use for systemd TODO: remove in favor or parent
Slice string `json:"slice"`
// Whether to disable OOM Killer
OomKillDisable bool `json:"oom_kill_disable"`

View File

@ -0,0 +1,6 @@
// +build !windows,!linux,!freebsd
package configs
type Cgroup struct {
}

View File

@ -82,20 +82,6 @@ var (
Minor: 1,
Permissions: "rwm",
},
{
Path: "/dev/tty0",
Type: 'c',
Major: 4,
Minor: 0,
Permissions: "rwm",
},
{
Path: "/dev/tty1",
Type: 'c',
Major: 4,
Minor: 1,
Permissions: "rwm",
},
// /dev/pts/ - pts namespaces are "coming soon"
{
Path: "",

View File

@ -6,8 +6,8 @@ import (
"errors"
)
// newConsole returns an initalized console that can be used within a container by copying bytes
// NewConsole returns an initalized console that can be used within a container by copying bytes
// from the master side to the slave that is attached as the tty for the container's init process.
func newConsole(uid, gid int) (Console, error) {
func NewConsole(uid, gid int) (Console, error) {
return nil, errors.New("libcontainer console is not supported on FreeBSD")
}

View File

@ -10,9 +10,9 @@ import (
"github.com/opencontainers/runc/libcontainer/label"
)
// newConsole returns an initalized console that can be used within a container by copying bytes
// NewConsole returns an initalized console that can be used within a container by copying bytes
// from the master side to the slave that is attached as the tty for the container's init process.
func newConsole(uid, gid int) (Console, error) {
func NewConsole(uid, gid int) (Console, error) {
master, err := os.OpenFile("/dev/ptmx", syscall.O_RDWR|syscall.O_NOCTTY|syscall.O_CLOEXEC, 0)
if err != nil {
return nil, err

View File

@ -1,7 +1,7 @@
package libcontainer
// newConsole returns an initalized console that can be used within a container
func newConsole(uid, gid int) (Console, error) {
// NewConsole returns an initalized console that can be used within a container
func NewConsole(uid, gid int) (Console, error) {
return &windowsConsole{}, nil
}

View File

@ -14,8 +14,11 @@ import (
type Status int
const (
// The container exists but has not been run yet
Created Status = iota
// The container exists and is running.
Running Status = iota + 1
Running
// The container exists, it is in the process of being paused.
Pausing
@ -30,6 +33,25 @@ const (
Destroyed
)
func (s Status) String() string {
switch s {
case Created:
return "created"
case Running:
return "running"
case Pausing:
return "pausing"
case Paused:
return "paused"
case Checkpointed:
return "checkpointed"
case Destroyed:
return "destroyed"
default:
return "unknown"
}
}
// BaseState represents the platform agnostic pieces relating to a
// running container's state
type BaseState struct {

View File

@ -3,8 +3,10 @@
package libcontainer
import (
"bytes"
"encoding/json"
"fmt"
"io"
"io/ioutil"
"os"
"os/exec"
@ -19,6 +21,8 @@ import (
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/criurpc"
"github.com/opencontainers/runc/libcontainer/utils"
"github.com/vishvananda/netlink/nl"
)
const stdioFdCount = 3
@ -34,6 +38,7 @@ type linuxContainer struct {
criuPath string
m sync.Mutex
criuVersion int
state containerState
}
// State represents a running container's state
@ -100,6 +105,12 @@ type Container interface {
// errors:
// Systemerror - System error.
NotifyOOM() (<-chan struct{}, error)
// NotifyMemoryPressure returns a read-only channel signaling when the container reaches a given pressure level
//
// errors:
// Systemerror - System error.
NotifyMemoryPressure(level PressureLevel) (<-chan struct{}, error)
}
// ID returns the container's unique ID
@ -125,7 +136,7 @@ func (c *linuxContainer) State() (*State, error) {
}
func (c *linuxContainer) Processes() ([]int, error) {
pids, err := c.cgroupManager.GetPids()
pids, err := c.cgroupManager.GetAllPids()
if err != nil {
return nil, newSystemError(err)
}
@ -179,22 +190,27 @@ func (c *linuxContainer) Start(process *Process) error {
}
return newSystemError(err)
}
if doInit {
c.updateState(parent)
c.state = &runningState{
c: c,
}
if c.config.Hooks != nil {
s := configs.HookState{
Version: c.config.Version,
ID: c.id,
Pid: parent.pid(),
Root: c.config.Rootfs,
if doInit {
if err := c.updateState(parent); err != nil {
return err
}
for _, hook := range c.config.Hooks.Poststart {
if err := hook.Run(s); err != nil {
if err := parent.terminate(); err != nil {
logrus.Warn(err)
if c.config.Hooks != nil {
s := configs.HookState{
Version: c.config.Version,
ID: c.id,
Pid: parent.pid(),
Root: c.config.Rootfs,
}
for _, hook := range c.config.Hooks.Poststart {
if err := hook.Run(s); err != nil {
if err := parent.terminate(); err != nil {
logrus.Warn(err)
}
return newSystemError(err)
}
return newSystemError(err)
}
}
}
@ -218,7 +234,7 @@ func (c *linuxContainer) newParentProcess(p *Process, doInit bool) (parentProces
return nil, newSystemError(err)
}
if !doInit {
return c.newSetnsProcess(p, cmd, parentPipe, childPipe), nil
return c.newSetnsProcess(p, cmd, parentPipe, childPipe)
}
return c.newInitProcess(p, cmd, parentPipe, childPipe)
}
@ -247,7 +263,7 @@ func (c *linuxContainer) commandTemplate(p *Process, childPipe *os.File) (*exec.
}
func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe *os.File) (*initProcess, error) {
t := "_LIBCONTAINER_INITTYPE=standard"
t := "_LIBCONTAINER_INITTYPE=" + string(initStandard)
cloneFlags := c.config.Namespaces.CloneFlags()
if cloneFlags&syscall.CLONE_NEWUSER != 0 {
if err := c.addUidGidMappings(cmd.SysProcAttr); err != nil {
@ -273,23 +289,24 @@ func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, c
}, nil
}
func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe *os.File) *setnsProcess {
cmd.Env = append(cmd.Env,
fmt.Sprintf("_LIBCONTAINER_INITPID=%d", c.initProcess.pid()),
"_LIBCONTAINER_INITTYPE=setns",
)
if p.consolePath != "" {
cmd.Env = append(cmd.Env, "_LIBCONTAINER_CONSOLE_PATH="+p.consolePath)
func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, parentPipe, childPipe *os.File) (*setnsProcess, error) {
cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initSetns))
// for setns process, we dont have to set cloneflags as the process namespaces
// will only be set via setns syscall
data, err := c.bootstrapData(0, c.initProcess.pid(), p.consolePath)
if err != nil {
return nil, err
}
// TODO: set on container for process management
return &setnsProcess{
cmd: cmd,
cgroupPaths: c.cgroupManager.GetPaths(),
childPipe: childPipe,
parentPipe: parentPipe,
config: c.newInitConfig(p),
process: p,
}
cmd: cmd,
cgroupPaths: c.cgroupManager.GetPaths(),
childPipe: childPipe,
parentPipe: parentPipe,
config: c.newInitConfig(p),
process: p,
bootstrapData: data,
}, nil
}
func (c *linuxContainer) newInitConfig(process *Process) *initConfig {
@ -316,54 +333,53 @@ func newPipe() (parent *os.File, child *os.File, err error) {
func (c *linuxContainer) Destroy() error {
c.m.Lock()
defer c.m.Unlock()
status, err := c.currentStatus()
if err != nil {
return err
}
if status != Destroyed {
return newGenericError(fmt.Errorf("container is not destroyed"), ContainerNotStopped)
}
if !c.config.Namespaces.Contains(configs.NEWPID) {
if err := killCgroupProcesses(c.cgroupManager); err != nil {
logrus.Warn(err)
}
}
err = c.cgroupManager.Destroy()
if rerr := os.RemoveAll(c.root); err == nil {
err = rerr
}
c.initProcess = nil
if c.config.Hooks != nil {
s := configs.HookState{
Version: c.config.Version,
ID: c.id,
Root: c.config.Rootfs,
}
for _, hook := range c.config.Hooks.Poststop {
if err := hook.Run(s); err != nil {
return err
}
}
}
return err
return c.state.destroy()
}
func (c *linuxContainer) Pause() error {
c.m.Lock()
defer c.m.Unlock()
return c.cgroupManager.Freeze(configs.Frozen)
status, err := c.currentStatus()
if err != nil {
return err
}
if status != Running {
return newGenericError(fmt.Errorf("container not running"), ContainerNotRunning)
}
if err := c.cgroupManager.Freeze(configs.Frozen); err != nil {
return err
}
return c.state.transition(&pausedState{
c: c,
})
}
func (c *linuxContainer) Resume() error {
c.m.Lock()
defer c.m.Unlock()
return c.cgroupManager.Freeze(configs.Thawed)
status, err := c.currentStatus()
if err != nil {
return err
}
if status != Paused {
return newGenericError(fmt.Errorf("container not paused"), ContainerNotPaused)
}
if err := c.cgroupManager.Freeze(configs.Thawed); err != nil {
return err
}
return c.state.transition(&runningState{
c: c,
})
}
func (c *linuxContainer) NotifyOOM() (<-chan struct{}, error) {
return notifyOnOOM(c.cgroupManager.GetPaths())
}
func (c *linuxContainer) NotifyMemoryPressure(level PressureLevel) (<-chan struct{}, error) {
return notifyMemoryPressure(c.cgroupManager.GetPaths(), level)
}
// XXX debug support, remove when debugging done.
func addArgsFromEnv(evar string, args *[]string) {
if e := os.Getenv(evar); e != "" {
@ -455,7 +471,7 @@ func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error {
}
if criuOpts.ImagesDirectory == "" {
criuOpts.ImagesDirectory = filepath.Join(c.root, "criu.image")
return fmt.Errorf("invalid directory to save checkpoint")
}
// Since a container can be C/R'ed multiple times,
@ -574,11 +590,9 @@ func (c *linuxContainer) addCriuRestoreMount(req *criurpc.CriuReq, m *configs.Mo
func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
c.m.Lock()
defer c.m.Unlock()
if err := c.checkCriuVersion("1.5.2"); err != nil {
return err
}
if criuOpts.WorkDirectory == "" {
criuOpts.WorkDirectory = filepath.Join(c.root, "criu.work")
}
@ -587,22 +601,19 @@ func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
if err := os.Mkdir(criuOpts.WorkDirectory, 0655); err != nil && !os.IsExist(err) {
return err
}
workDir, err := os.Open(criuOpts.WorkDirectory)
if err != nil {
return err
}
defer workDir.Close()
if criuOpts.ImagesDirectory == "" {
criuOpts.ImagesDirectory = filepath.Join(c.root, "criu.image")
return fmt.Errorf("invalid directory to restore checkpoint")
}
imageDir, err := os.Open(criuOpts.ImagesDirectory)
if err != nil {
return err
}
defer imageDir.Close()
// CRIU has a few requirements for a root directory:
// * it must be a mount point
// * its parent must not be overmounted
@ -613,18 +624,15 @@ func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
return err
}
defer os.Remove(root)
root, err = filepath.EvalSymlinks(root)
if err != nil {
return err
}
err = syscall.Mount(c.config.Rootfs, root, "", syscall.MS_BIND|syscall.MS_REC, "")
if err != nil {
return err
}
defer syscall.Unmount(root, syscall.MNT_DETACH)
t := criurpc.CriuReqType_RESTORE
req := &criurpc.CriuReq{
Type: &t,
@ -692,15 +700,13 @@ func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
fds []string
fdJSON []byte
)
if fdJSON, err = ioutil.ReadFile(filepath.Join(criuOpts.ImagesDirectory, descriptorsFilename)); err != nil {
return err
}
if err = json.Unmarshal(fdJSON, &fds); err != nil {
if err := json.Unmarshal(fdJSON, &fds); err != nil {
return err
}
for i := range fds {
if s := fds[i]; strings.Contains(s, "pipe:") {
inheritFd := new(criurpc.InheritFd)
@ -709,12 +715,7 @@ func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
req.Opts.InheritFd = append(req.Opts.InheritFd, inheritFd)
}
}
err = c.criuSwrk(process, req, criuOpts, true)
if err != nil {
return err
}
return nil
return c.criuSwrk(process, req, criuOpts, true)
}
func (c *linuxContainer) criuApplyCgroups(pid int, req *criurpc.CriuReq) error {
@ -909,46 +910,43 @@ func (c *linuxContainer) criuNotifications(resp *criurpc.CriuResp, process *Proc
if notify == nil {
return fmt.Errorf("invalid response: %s", resp.String())
}
switch {
case notify.GetScript() == "post-dump":
if !opts.LeaveRunning {
f, err := os.Create(filepath.Join(c.root, "checkpoint"))
if err != nil {
return err
}
f.Close()
f, err := os.Create(filepath.Join(c.root, "checkpoint"))
if err != nil {
return err
}
break
f.Close()
case notify.GetScript() == "network-unlock":
if err := unlockNetwork(c.config); err != nil {
return err
}
break
case notify.GetScript() == "network-lock":
if err := lockNetwork(c.config); err != nil {
return err
}
break
case notify.GetScript() == "post-restore":
pid := notify.GetPid()
r, err := newRestoredProcess(int(pid), fds)
if err != nil {
return err
}
// TODO: crosbymichael restore previous process information by saving the init process information in
// the container's state file or separate process state files.
process.ops = r
if err := c.state.transition(&restoredState{
imageDir: opts.ImagesDirectory,
c: c,
}); err != nil {
return err
}
if err := c.updateState(r); err != nil {
return err
}
process.ops = r
break
if err := os.Remove(filepath.Join(c.root, "checkpoint")); err != nil {
if !os.IsNotExist(err) {
logrus.Error(err)
}
}
}
return nil
}
@ -958,66 +956,130 @@ func (c *linuxContainer) updateState(process parentProcess) error {
if err != nil {
return err
}
return c.saveState(state)
}
func (c *linuxContainer) saveState(s *State) error {
f, err := os.Create(filepath.Join(c.root, stateFilename))
if err != nil {
return err
}
defer f.Close()
os.Remove(filepath.Join(c.root, "checkpoint"))
return json.NewEncoder(f).Encode(state)
return utils.WriteJSON(f, s)
}
func (c *linuxContainer) deleteState() error {
return os.Remove(filepath.Join(c.root, stateFilename))
}
func (c *linuxContainer) currentStatus() (Status, error) {
if _, err := os.Stat(filepath.Join(c.root, "checkpoint")); err == nil {
return Checkpointed, nil
if err := c.refreshState(); err != nil {
return -1, err
}
return c.state.status(), nil
}
// refreshState needs to be called to verify that the current state on the
// container is what is true. Because consumers of libcontainer can use it
// out of process we need to verify the container's status based on runtime
// information and not rely on our in process info.
func (c *linuxContainer) refreshState() error {
paused, err := c.isPaused()
if err != nil {
return err
}
if paused {
return c.state.transition(&pausedState{c: c})
}
running, err := c.isRunning()
if err != nil {
return err
}
if running {
return c.state.transition(&runningState{c: c})
}
return c.state.transition(&stoppedState{c: c})
}
func (c *linuxContainer) isRunning() (bool, error) {
if c.initProcess == nil {
return Destroyed, nil
return false, nil
}
// return Running if the init process is alive
if err := syscall.Kill(c.initProcess.pid(), 0); err != nil {
if err == syscall.ESRCH {
return Destroyed, nil
return false, nil
}
return 0, newSystemError(err)
return false, newSystemError(err)
}
if c.config.Cgroups != nil && c.config.Cgroups.Freezer == configs.Frozen {
return Paused, nil
return true, nil
}
func (c *linuxContainer) isPaused() (bool, error) {
data, err := ioutil.ReadFile(filepath.Join(c.cgroupManager.GetPaths()["freezer"], "freezer.state"))
if err != nil {
if os.IsNotExist(err) {
return false, nil
}
return false, newSystemError(err)
}
return Running, nil
return bytes.Equal(bytes.TrimSpace(data), []byte("FROZEN")), nil
}
func (c *linuxContainer) currentState() (*State, error) {
status, err := c.currentStatus()
if err != nil {
return nil, err
}
if status == Destroyed {
return nil, newGenericError(fmt.Errorf("container destroyed"), ContainerNotExists)
}
startTime, err := c.initProcess.startTime()
if err != nil {
return nil, newSystemError(err)
var (
startTime string
externalDescriptors []string
pid = -1
)
if c.initProcess != nil {
pid = c.initProcess.pid()
startTime, _ = c.initProcess.startTime()
externalDescriptors = c.initProcess.externalDescriptors()
}
state := &State{
BaseState: BaseState{
ID: c.ID(),
Config: *c.config,
InitProcessPid: c.initProcess.pid(),
InitProcessPid: pid,
InitProcessStartTime: startTime,
},
CgroupPaths: c.cgroupManager.GetPaths(),
NamespacePaths: make(map[configs.NamespaceType]string),
ExternalDescriptors: c.initProcess.externalDescriptors(),
ExternalDescriptors: externalDescriptors,
}
for _, ns := range c.config.Namespaces {
state.NamespacePaths[ns.Type] = ns.GetPath(c.initProcess.pid())
}
for _, nsType := range configs.NamespaceTypes() {
if _, ok := state.NamespacePaths[nsType]; !ok {
ns := configs.Namespace{Type: nsType}
state.NamespacePaths[ns.Type] = ns.GetPath(c.initProcess.pid())
if pid > 0 {
for _, ns := range c.config.Namespaces {
state.NamespacePaths[ns.Type] = ns.GetPath(pid)
}
for _, nsType := range configs.NamespaceTypes() {
if _, ok := state.NamespacePaths[nsType]; !ok {
ns := configs.Namespace{Type: nsType}
state.NamespacePaths[ns.Type] = ns.GetPath(pid)
}
}
}
return state, nil
}
// bootstrapData encodes the necessary data in netlink binary format as a io.Reader.
// Consumer can write the data to a bootstrap program such as one that uses
// nsenter package to bootstrap the container's init process correctly, i.e. with
// correct namespaces, uid/gid mapping etc.
func (c *linuxContainer) bootstrapData(cloneFlags uintptr, pid int, consolePath string) (io.Reader, error) {
// create the netlink message
r := nl.NewNetlinkRequest(int(InitMsg), 0)
// write pid
r.AddData(&Int32msg{
Type: PidAttr,
Value: uint32(pid),
})
// write console path
if consolePath != "" {
r.AddData(&Bytemsg{
Type: ConsolePathAttr,
Value: []byte(consolePath),
})
}
return bytes.NewReader(r.Serialize()), nil
}

View File

@ -16,12 +16,14 @@ const (
ContainerPaused
ContainerNotStopped
ContainerNotRunning
ContainerNotPaused
// Process errors
ProcessNotExecuted
// Common errors
ConfigInvalid
ConsoleExists
SystemError
)
@ -43,6 +45,10 @@ func (c ErrorCode) String() string {
return "Container is not stopped"
case ContainerNotRunning:
return "Container is not running"
case ConsoleExists:
return "Console exists for process"
case ContainerNotPaused:
return "Container is not paused"
default:
return "Unknown error"
}

View File

@ -5,7 +5,6 @@ package libcontainer
import (
"encoding/json"
"fmt"
"io/ioutil"
"os"
"os/exec"
"path/filepath"
@ -19,6 +18,7 @@ import (
"github.com/opencontainers/runc/libcontainer/cgroups/systemd"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/configs/validate"
"github.com/opencontainers/runc/libcontainer/utils"
)
const (
@ -166,7 +166,7 @@ func (l *LinuxFactory) Create(id string, config *configs.Config) (Container, err
if err := os.MkdirAll(containerRoot, 0700); err != nil {
return nil, newGenericError(err, SystemError)
}
return &linuxContainer{
c := &linuxContainer{
id: id,
root: containerRoot,
config: config,
@ -174,7 +174,9 @@ func (l *LinuxFactory) Create(id string, config *configs.Config) (Container, err
initArgs: l.InitArgs,
criuPath: l.CriuPath,
cgroupManager: l.NewCgroupsManager(config.Cgroups, nil),
}, nil
}
c.state = &stoppedState{c: c}
return c, nil
}
func (l *LinuxFactory) Load(id string) (Container, error) {
@ -191,7 +193,7 @@ func (l *LinuxFactory) Load(id string) (Container, error) {
processStartTime: state.InitProcessStartTime,
fds: state.ExternalDescriptors,
}
return &linuxContainer{
c := &linuxContainer{
initProcess: r,
id: id,
config: &state.Config,
@ -200,7 +202,12 @@ func (l *LinuxFactory) Load(id string) (Container, error) {
criuPath: l.CriuPath,
cgroupManager: l.NewCgroupsManager(state.Config.Cgroups, state.CgroupPaths),
root: containerRoot,
}, nil
}
c.state = &createdState{c: c, s: Created}
if err := c.refreshState(); err != nil {
return nil, err
}
return c, nil
}
func (l *LinuxFactory) Type() string {
@ -222,21 +229,29 @@ func (l *LinuxFactory) StartInitialization() (err error) {
// clear the current process's environment to clean any libcontainer
// specific env vars.
os.Clearenv()
var i initer
defer func() {
// if we have an error during the initialization of the container's init then send it back to the
// parent process in the form of an initError.
if err != nil {
// ensure that any data sent from the parent is consumed so it doesn't
// receive ECONNRESET when the child writes to the pipe.
ioutil.ReadAll(pipe)
if err := json.NewEncoder(pipe).Encode(newSystemError(err)); err != nil {
if _, ok := i.(*linuxStandardInit); ok {
// Synchronisation only necessary for standard init.
if err := utils.WriteJSON(pipe, syncT{procError}); err != nil {
panic(err)
}
}
if err := utils.WriteJSON(pipe, newSystemError(err)); err != nil {
panic(err)
}
} else {
if err := utils.WriteJSON(pipe, syncT{procStart}); err != nil {
panic(err)
}
}
// ensure that this pipe is always closed
pipe.Close()
}()
i, err := newContainerInit(it, pipe)
i, err = newContainerInit(it, pipe)
if err != nil {
return err
}

View File

@ -9,6 +9,19 @@ import (
"github.com/opencontainers/runc/libcontainer/stacktrace"
)
type syncType uint8
const (
procReady syncType = iota
procError
procStart
procRun
)
type syncT struct {
Type syncType `json:"type"`
}
var errorTemplate = template.Must(template.New("error").Parse(`Timestamp: {{.Timestamp}}
Code: {{.ECode}}
{{if .Message }}

View File

@ -1,16 +0,0 @@
#!/usr/bin/env bash
set -e
# This script runs all validations
validate() {
export MAKEDIR=/go/src/github.com/docker/docker/hack/make
sed -i 's!docker/docker!opencontainers/runc/libcontainer!' /go/src/github.com/docker/docker/hack/make/.validate
bash /go/src/github.com/docker/docker/hack/make/validate-dco
bash /go/src/github.com/docker/docker/hack/make/validate-gofmt
go get golang.org/x/tools/cmd/vet
bash /go/src/github.com/docker/docker/hack/make/validate-vet
}
# run validations
validate

View File

@ -5,6 +5,7 @@ package libcontainer
import (
"encoding/json"
"fmt"
"io"
"io/ioutil"
"net"
"os"
@ -73,6 +74,7 @@ func newContainerInit(t initType, pipe *os.File) (initer, error) {
}, nil
case initStandard:
return &linuxStandardInit{
pipe: pipe,
parentPid: syscall.Getppid(),
config: config,
}, nil
@ -140,6 +142,27 @@ func finalizeNamespace(config *initConfig) error {
return nil
}
// syncParentReady sends to the given pipe a JSON payload which indicates that
// the init is ready to Exec the child process. It then waits for the parent to
// indicate that it is cleared to Exec.
func syncParentReady(pipe io.ReadWriter) error {
// Tell parent.
if err := utils.WriteJSON(pipe, syncT{procReady}); err != nil {
return err
}
// Wait for parent to give the all-clear.
var procSync syncT
if err := json.NewDecoder(pipe).Decode(&procSync); err != nil {
if err == io.EOF {
return fmt.Errorf("parent closed synchronisation channel")
}
if procSync.Type != procRun {
return fmt.Errorf("invalid synchronisation flag from parent")
}
}
return nil
}
// joinExistingNamespaces gets all the namespace paths specified for the container and
// does a setns on the namespace fd so that the current process joins the namespace.
func joinExistingNamespaces(namespaces []configs.Namespace) error {
@ -309,7 +332,7 @@ func killCgroupProcesses(m cgroups.Manager) error {
if err := m.Freeze(configs.Frozen); err != nil {
logrus.Warn(err)
}
pids, err := m.GetPids()
pids, err := m.GetAllPids()
if err != nil {
m.Freeze(configs.Thawed)
return err

View File

@ -0,0 +1,62 @@
// +build linux
package libcontainer
import (
"syscall"
"github.com/vishvananda/netlink/nl"
)
// list of known message types we want to send to bootstrap program
// The number is randomly chosen to not conflict with known netlink types
const (
InitMsg uint16 = 62000
PidAttr uint16 = 27281
ConsolePathAttr uint16 = 27282
// When syscall.NLA_HDRLEN is in gccgo, take this out.
syscall_NLA_HDRLEN = (syscall.SizeofNlAttr + syscall.NLA_ALIGNTO - 1) & ^(syscall.NLA_ALIGNTO - 1)
)
type Int32msg struct {
Type uint16
Value uint32
}
// int32msg has the following representation
// | nlattr len | nlattr type |
// | uint32 value |
func (msg *Int32msg) Serialize() []byte {
buf := make([]byte, msg.Len())
native := nl.NativeEndian()
native.PutUint16(buf[0:2], uint16(msg.Len()))
native.PutUint16(buf[2:4], msg.Type)
native.PutUint32(buf[4:8], msg.Value)
return buf
}
func (msg *Int32msg) Len() int {
return syscall_NLA_HDRLEN + 4
}
// bytemsg has the following representation
// | nlattr len | nlattr type |
// | value | pad |
type Bytemsg struct {
Type uint16
Value []byte
}
func (msg *Bytemsg) Serialize() []byte {
l := msg.Len()
buf := make([]byte, (l+syscall.NLA_ALIGNTO-1) & ^(syscall.NLA_ALIGNTO-1))
native := nl.NativeEndian()
native.PutUint16(buf[0:2], uint16(l))
native.PutUint16(buf[2:4], msg.Type)
copy(buf[4:], msg.Value)
return buf
}
func (msg *Bytemsg) Len() int {
return syscall_NLA_HDRLEN + len(msg.Value) + 1 // null-terminated
}

View File

@ -93,7 +93,7 @@ func (l *loopback) create(n *network, nspid int) error {
}
func (l *loopback) initialize(config *network) error {
return netlink.LinkSetUp(&netlink.Device{netlink.LinkAttrs{Name: "lo"}})
return netlink.LinkSetUp(&netlink.Device{LinkAttrs: netlink.LinkAttrs{Name: "lo"}})
}
func (l *loopback) attach(n *configs.Network) (err error) {
@ -111,7 +111,7 @@ type veth struct {
}
func (v *veth) detach(n *configs.Network) (err error) {
return netlink.LinkSetMaster(&netlink.Device{netlink.LinkAttrs{Name: n.HostInterfaceName}}, nil)
return netlink.LinkSetMaster(&netlink.Device{LinkAttrs: netlink.LinkAttrs{Name: n.HostInterfaceName}}, nil)
}
// attach a container network interface to an external network

View File

@ -12,31 +12,32 @@ import (
const oomCgroupName = "memory"
// notifyOnOOM returns channel on which you can expect event about OOM,
// if process died without OOM this channel will be closed.
// s is current *libcontainer.State for container.
func notifyOnOOM(paths map[string]string) (<-chan struct{}, error) {
dir := paths[oomCgroupName]
if dir == "" {
return nil, fmt.Errorf("There is no path for %q in state", oomCgroupName)
}
oomControl, err := os.Open(filepath.Join(dir, "memory.oom_control"))
type PressureLevel uint
const (
LowPressure PressureLevel = iota
MediumPressure
CriticalPressure
)
func registerMemoryEvent(cgDir string, evName string, arg string) (<-chan struct{}, error) {
evFile, err := os.Open(filepath.Join(cgDir, evName))
if err != nil {
return nil, err
}
fd, _, syserr := syscall.RawSyscall(syscall.SYS_EVENTFD2, 0, syscall.FD_CLOEXEC, 0)
if syserr != 0 {
oomControl.Close()
evFile.Close()
return nil, syserr
}
eventfd := os.NewFile(fd, "eventfd")
eventControlPath := filepath.Join(dir, "cgroup.event_control")
data := fmt.Sprintf("%d %d", eventfd.Fd(), oomControl.Fd())
eventControlPath := filepath.Join(cgDir, "cgroup.event_control")
data := fmt.Sprintf("%d %d %s", eventfd.Fd(), evFile.Fd(), arg)
if err := ioutil.WriteFile(eventControlPath, []byte(data), 0700); err != nil {
eventfd.Close()
oomControl.Close()
evFile.Close()
return nil, err
}
ch := make(chan struct{})
@ -44,7 +45,7 @@ func notifyOnOOM(paths map[string]string) (<-chan struct{}, error) {
defer func() {
close(ch)
eventfd.Close()
oomControl.Close()
evFile.Close()
}()
buf := make([]byte, 8)
for {
@ -61,3 +62,28 @@ func notifyOnOOM(paths map[string]string) (<-chan struct{}, error) {
}()
return ch, nil
}
// notifyOnOOM returns channel on which you can expect event about OOM,
// if process died without OOM this channel will be closed.
func notifyOnOOM(paths map[string]string) (<-chan struct{}, error) {
dir := paths[oomCgroupName]
if dir == "" {
return nil, fmt.Errorf("path %q missing", oomCgroupName)
}
return registerMemoryEvent(dir, "memory.oom_control", "")
}
func notifyMemoryPressure(paths map[string]string, level PressureLevel) (<-chan struct{}, error) {
dir := paths[oomCgroupName]
if dir == "" {
return nil, fmt.Errorf("path %q missing", oomCgroupName)
}
if level > CriticalPressure {
return nil, fmt.Errorf("invalid pressure level %d", level)
}
levelStr := []string{"low", "medium", "critical"}[level]
return registerMemoryEvent(dir, "memory.pressure_level", levelStr)
}

View File

@ -17,6 +17,11 @@
#include <sched.h>
#include <signal.h>
#include <linux/netlink.h>
#include <linux/types.h>
#include <stdint.h>
#include <sys/socket.h>
/* All arguments should be above stack, because it grows down */
struct clone_arg {
/*
@ -63,24 +68,33 @@ static int clone_parent(jmp_buf * env)
return child;
}
static uint32_t readint32(char *buf)
{
return *(uint32_t *) buf;
}
// list of known message types we want to send to bootstrap program
// These are defined in libcontainer/message_linux.go
#define INIT_MSG 62000
#define PID_ATTR 27281
#define CONSOLE_PATH_ATTR 27282
void nsexec()
{
char *namespaces[] = { "ipc", "uts", "net", "pid", "mnt", "user" };
const int num = sizeof(namespaces) / sizeof(char *);
jmp_buf env;
char buf[PATH_MAX], *val;
int i, tfd, self_tfd, child, len, pipenum, consolefd = -1;
pid_t pid;
char *console;
int i, tfd, self_tfd, child, n, len, pipenum, consolefd = -1;
pid_t pid = 0;
val = getenv("_LIBCONTAINER_INITPID");
if (val == NULL)
// if we dont have INITTYPE or this is the init process, skip the bootstrap process
val = getenv("_LIBCONTAINER_INITTYPE");
if (val == NULL || strcmp(val, "standard") == 0) {
return;
pid = atoi(val);
snprintf(buf, sizeof(buf), "%d", pid);
if (strcmp(val, buf)) {
pr_perror("Unable to parse _LIBCONTAINER_INITPID");
}
if (strcmp(val, "setns") != 0) {
pr_perror("Invalid inittype %s", val);
exit(1);
}
@ -89,7 +103,6 @@ void nsexec()
pr_perror("Child pipe not found");
exit(1);
}
pipenum = atoi(val);
snprintf(buf, sizeof(buf), "%d", pipenum);
if (strcmp(val, buf)) {
@ -97,13 +110,56 @@ void nsexec()
exit(1);
}
console = getenv("_LIBCONTAINER_CONSOLE_PATH");
if (console != NULL) {
consolefd = open(console, O_RDWR);
if (consolefd < 0) {
pr_perror("Failed to open console %s", console);
exit(1);
char nlbuf[NLMSG_HDRLEN];
struct nlmsghdr *nh;
if ((n = read(pipenum, nlbuf, NLMSG_HDRLEN)) != NLMSG_HDRLEN) {
pr_perror("Failed to read netlink header, got %d", n);
exit(1);
}
nh = (struct nlmsghdr *)nlbuf;
if (nh->nlmsg_type == NLMSG_ERROR) {
pr_perror("Invalid netlink header message");
exit(1);
}
if (nh->nlmsg_type != INIT_MSG) {
pr_perror("Unexpected netlink message type %d", nh->nlmsg_type);
exit(1);
}
// read the netlink payload
len = NLMSG_PAYLOAD(nh, 0);
char data[len];
if ((n = read(pipenum, data, len)) != len) {
pr_perror("Failed to read netlink payload, got %d", n);
exit(1);
}
int start = 0;
struct nlattr *attr;
while (start < len) {
int payload_len;
attr = (struct nlattr *)((void *)data + start);
start += NLA_HDRLEN;
payload_len = attr->nla_len - NLA_HDRLEN;
switch (attr->nla_type) {
case PID_ATTR:
pid = (pid_t) readint32(data + start);
break;
case CONSOLE_PATH_ATTR:
consolefd = open((char *)data + start, O_RDWR);
if (consolefd < 0) {
pr_perror("Failed to open console %s", (char *)data + start);
exit(1);
}
break;
}
start += NLA_ALIGN(payload_len);
}
// required pid to be passed
if (pid == 0) {
pr_perror("missing pid");
exit(1);
}
/* Check that the specified process exists */
@ -133,15 +189,13 @@ void nsexec()
}
/* Skip namespaces we're already part of */
if (fstatat(self_tfd, namespaces[i], &self_st, 0) != -1 &&
st.st_ino == self_st.st_ino) {
if (fstatat(self_tfd, namespaces[i], &self_st, 0) != -1 && st.st_ino == self_st.st_ino) {
continue;
}
fd = openat(tfd, namespaces[i], O_RDONLY);
if (fd == -1) {
pr_perror("Failed to open ns file %s for ns %s", buf,
namespaces[i]);
pr_perror("Failed to open ns file %s for ns %s", buf, namespaces[i]);
exit(1);
}
// Set the namespace.

View File

@ -78,12 +78,28 @@ func (p Process) Signal(sig os.Signal) error {
return p.ops.signal(sig)
}
// IO holds the process's STDIO
type IO struct {
Stdin io.WriteCloser
Stdout io.ReadCloser
Stderr io.ReadCloser
}
// NewConsole creates new console for process and returns it
func (p *Process) NewConsole(rootuid int) (Console, error) {
console, err := newConsole(rootuid, rootuid)
console, err := NewConsole(rootuid, rootuid)
if err != nil {
return nil, err
}
p.consolePath = console.Path()
return console, nil
}
// ConsoleFromPath sets the process's console with the path provided
func (p *Process) ConsoleFromPath(path string) error {
if p.consolePath != "" {
return newGenericError(fmt.Errorf("console path already exists for process"), ConsoleExists)
}
p.consolePath = path
return nil
}

View File

@ -5,6 +5,7 @@ package libcontainer
import (
"encoding/json"
"errors"
"fmt"
"io"
"os"
"os/exec"
@ -15,6 +16,7 @@ import (
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/system"
"github.com/opencontainers/runc/libcontainer/utils"
)
type parentProcess interface {
@ -41,13 +43,14 @@ type parentProcess interface {
}
type setnsProcess struct {
cmd *exec.Cmd
parentPipe *os.File
childPipe *os.File
cgroupPaths map[string]string
config *initConfig
fds []string
process *Process
cmd *exec.Cmd
parentPipe *os.File
childPipe *os.File
cgroupPaths map[string]string
config *initConfig
fds []string
process *Process
bootstrapData io.Reader
}
func (p *setnsProcess) startTime() (string, error) {
@ -64,6 +67,16 @@ func (p *setnsProcess) signal(sig os.Signal) error {
func (p *setnsProcess) start() (err error) {
defer p.parentPipe.Close()
err = p.cmd.Start()
p.childPipe.Close()
if err != nil {
return newSystemError(err)
}
if p.bootstrapData != nil {
if _, err := io.Copy(p.parentPipe, p.bootstrapData); err != nil {
return newSystemError(err)
}
}
if err = p.execSetns(); err != nil {
return newSystemError(err)
}
@ -72,9 +85,10 @@ func (p *setnsProcess) start() (err error) {
return newSystemError(err)
}
}
if err := json.NewEncoder(p.parentPipe).Encode(p.config); err != nil {
if err := utils.WriteJSON(p.parentPipe, p.config); err != nil {
return newSystemError(err)
}
if err := syscall.Shutdown(int(p.parentPipe.Fd()), syscall.SHUT_WR); err != nil {
return newSystemError(err)
}
@ -84,6 +98,7 @@ func (p *setnsProcess) start() (err error) {
if err := json.NewDecoder(p.parentPipe).Decode(&ierr); err != nil && err != io.EOF {
return newSystemError(err)
}
// Must be done after Shutdown so the child will exit and we can wait for it.
if ierr != nil {
p.wait()
return newSystemError(ierr)
@ -96,11 +111,6 @@ func (p *setnsProcess) start() (err error) {
// before the go runtime boots, we wait on the process to die and receive the child's pid
// over the provided pipe.
func (p *setnsProcess) execSetns() error {
err := p.cmd.Start()
p.childPipe.Close()
if err != nil {
return newSystemError(err)
}
status, err := p.cmd.Process.Wait()
if err != nil {
p.cmd.Wait()
@ -192,7 +202,6 @@ func (p *initProcess) start() (err error) {
return newSystemError(err)
}
p.setExternalDescriptors(fds)
// Do this before syncing with child so that no children
// can escape the cgroup
if err := p.manager.Apply(p.pid()); err != nil {
@ -223,13 +232,56 @@ func (p *initProcess) start() (err error) {
if err := p.sendConfig(); err != nil {
return newSystemError(err)
}
// wait for the child process to fully complete and receive an error message
// if one was encoutered
var ierr *genericError
if err := json.NewDecoder(p.parentPipe).Decode(&ierr); err != nil && err != io.EOF {
var (
procSync syncT
sentRun bool
ierr *genericError
)
loop:
for {
if err := json.NewDecoder(p.parentPipe).Decode(&procSync); err != nil {
if err == io.EOF {
break loop
}
return newSystemError(err)
}
switch procSync.Type {
case procStart:
break loop
case procReady:
if err := p.manager.Set(p.config.Config); err != nil {
return newSystemError(err)
}
// Sync with child.
if err := utils.WriteJSON(p.parentPipe, syncT{procRun}); err != nil {
return newSystemError(err)
}
sentRun = true
case procError:
// wait for the child process to fully complete and receive an error message
// if one was encoutered
if err := json.NewDecoder(p.parentPipe).Decode(&ierr); err != nil && err != io.EOF {
return newSystemError(err)
}
if ierr != nil {
break loop
}
// Programmer error.
panic("No error following JSON procError payload.")
default:
return newSystemError(fmt.Errorf("invalid JSON synchronisation payload from child"))
}
}
if !sentRun {
return newSystemError(fmt.Errorf("could not synchronise with container process"))
}
if err := syscall.Shutdown(int(p.parentPipe.Fd()), syscall.SHUT_WR); err != nil {
return newSystemError(err)
}
// Must be done after Shutdown so the child will exit and we can wait for it.
if ierr != nil {
p.wait()
return newSystemError(ierr)
}
return nil
@ -264,11 +316,7 @@ func (p *initProcess) startTime() (string, error) {
func (p *initProcess) sendConfig() error {
// send the state to the container's init process then shutdown writes for the parent
if err := json.NewEncoder(p.parentPipe).Encode(p.config); err != nil {
return err
}
// shutdown writes for the parent side of the pipe
return syscall.Shutdown(int(p.parentPipe.Fd()), syscall.SHUT_WR)
return utils.WriteJSON(p.parentPipe, p.config)
}
func (p *initProcess) createNetworkInterfaces() error {
@ -314,3 +362,44 @@ func getPipeFds(pid int) ([]string, error) {
}
return fds, nil
}
// InitializeIO creates pipes for use with the process's STDIO
// and returns the opposite side for each
func (p *Process) InitializeIO(rootuid int) (i *IO, err error) {
var fds []uintptr
i = &IO{}
// cleanup in case of an error
defer func() {
if err != nil {
for _, fd := range fds {
syscall.Close(int(fd))
}
}
}()
// STDIN
r, w, err := os.Pipe()
if err != nil {
return nil, err
}
fds = append(fds, r.Fd(), w.Fd())
p.Stdin, i.Stdin = r, w
// STDOUT
if r, w, err = os.Pipe(); err != nil {
return nil, err
}
fds = append(fds, r.Fd(), w.Fd())
p.Stdout, i.Stdout = w, r
// STDERR
if r, w, err = os.Pipe(); err != nil {
return nil, err
}
fds = append(fds, r.Fd(), w.Fd())
p.Stderr, i.Stderr = w, r
// change ownership of the pipes incase we are in a user namespace
for _, fd := range fds {
if err := syscall.Fchown(int(fd), rootuid, rootuid); err != nil {
return nil, err
}
}
return i, nil
}

View File

@ -18,6 +18,7 @@ import (
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/label"
"github.com/opencontainers/runc/libcontainer/system"
)
const defaultMountFlags = syscall.MS_NOEXEC | syscall.MS_NOSUID | syscall.MS_NODEV
@ -299,6 +300,24 @@ func checkMountDestination(rootfs, dest string) error {
invalidDestinations := []string{
"/proc",
}
// White list, it should be sub directories of invalid destinations
validDestinations := []string{
// These entries can be bind mounted by files emulated by fuse,
// so commands like top, free displays stats in container.
"/proc/cpuinfo",
"/proc/diskstats",
"/proc/meminfo",
"/proc/stats",
}
for _, valid := range validDestinations {
path, err := filepath.Rel(filepath.Join(rootfs, valid), dest)
if err != nil {
return err
}
if path == "." {
return nil
}
}
for _, invalid := range invalidDestinations {
path, err := filepath.Rel(filepath.Join(rootfs, invalid), dest)
if err != nil {
@ -365,11 +384,12 @@ func reOpenDevNull() error {
// Create the device nodes in the container.
func createDevices(config *configs.Config) error {
useBindMount := system.RunningInUserNS() || config.Namespaces.Contains(configs.NEWUSER)
oldMask := syscall.Umask(0000)
for _, node := range config.Devices {
// containers running in a user namespace are not allowed to mknod
// devices so we can just bind mount it from the host.
if err := createDeviceNode(config.Rootfs, node, config.Namespaces.Contains(configs.NEWUSER)); err != nil {
if err := createDeviceNode(config.Rootfs, node, useBindMount); err != nil {
syscall.Umask(oldMask)
return err
}

View File

@ -0,0 +1,47 @@
Name: cat
State: R (running)
Tgid: 19383
Ngid: 0
Pid: 19383
PPid: 19275
TracerPid: 0
Uid: 1000 1000 1000 1000
Gid: 1000 1000 1000 1000
FDSize: 256
Groups: 24 25 27 29 30 44 46 102 104 108 111 1000 1001
NStgid: 19383
NSpid: 19383
NSpgid: 19383
NSsid: 19275
VmPeak: 5944 kB
VmSize: 5944 kB
VmLck: 0 kB
VmPin: 0 kB
VmHWM: 744 kB
VmRSS: 744 kB
VmData: 324 kB
VmStk: 136 kB
VmExe: 48 kB
VmLib: 1776 kB
VmPTE: 32 kB
VmPMD: 12 kB
VmSwap: 0 kB
Threads: 1
SigQ: 0/30067
SigPnd: 0000000000000000
ShdPnd: 0000000000000000
SigBlk: 0000000000000000
SigIgn: 0000000000000080
SigCgt: 0000000000000000
CapInh: 0000000000000000
CapPrm: 0000000000000000
CapEff: 0000000000000000
CapBnd: 0000003fffffffff
CapAmb: 0000000000000000
Seccomp: 0
Cpus_allowed: f
Cpus_allowed_list: 0-3
Mems_allowed: 00000000,00000001
Mems_allowed_list: 0
voluntary_ctxt_switches: 0
nonvoluntary_ctxt_switches: 1

View File

@ -3,8 +3,11 @@
package seccomp
import (
"bufio"
"fmt"
"log"
"os"
"strings"
"syscall"
"github.com/opencontainers/runc/libcontainer/configs"
@ -17,6 +20,9 @@ var (
actKill = libseccomp.ActKill
actTrace = libseccomp.ActTrace.SetReturnCode(int16(syscall.EPERM))
actErrno = libseccomp.ActErrno.SetReturnCode(int16(syscall.EPERM))
// SeccompModeFilter refers to the syscall argument SECCOMP_MODE_FILTER.
SeccompModeFilter = uintptr(2)
)
// Filters given syscalls in a container, preventing them from being used
@ -73,6 +79,24 @@ func InitSeccomp(config *configs.Seccomp) error {
return nil
}
// IsEnabled returns if the kernel has been configured to support seccomp.
func IsEnabled() bool {
// Try to read from /proc/self/status for kernels > 3.8
s, err := parseStatusFile("/proc/self/status")
if err != nil {
// Check if Seccomp is supported, via CONFIG_SECCOMP.
if _, _, err := syscall.RawSyscall(syscall.SYS_PRCTL, syscall.PR_GET_SECCOMP, 0, 0); err != syscall.EINVAL {
// Make sure the kernel has CONFIG_SECCOMP_FILTER.
if _, _, err := syscall.RawSyscall(syscall.SYS_PRCTL, syscall.PR_SET_SECCOMP, SeccompModeFilter, 0); err != syscall.EINVAL {
return true
}
}
return false
}
_, ok := s["Seccomp"]
return ok
}
// Convert Libcontainer Action to Libseccomp ScmpAction
func getAction(act configs.Action) (libseccomp.ScmpAction, error) {
switch act {
@ -178,3 +202,30 @@ func matchCall(filter *libseccomp.ScmpFilter, call *configs.Syscall) error {
return nil
}
func parseStatusFile(path string) (map[string]string, error) {
f, err := os.Open(path)
if err != nil {
return nil, err
}
defer f.Close()
s := bufio.NewScanner(f)
status := make(map[string]string)
for s.Scan() {
if err := s.Err(); err != nil {
return nil, err
}
text := s.Text()
parts := strings.Split(text, ":")
if len(parts) <= 1 {
continue
}
status[parts[0]] = parts[1]
}
return status, nil
}

View File

@ -17,3 +17,8 @@ func InitSeccomp(config *configs.Seccomp) error {
}
return nil
}
// IsEnabled returns false, because it is not supported.
func IsEnabled() bool {
return false
}

View File

@ -231,10 +231,14 @@ func ReserveLabel(scon string) {
}
}
func selinuxEnforcePath() string {
return fmt.Sprintf("%s/enforce", selinuxPath)
}
func SelinuxGetEnforce() int {
var enforce int
enforceS, err := readCon(fmt.Sprintf("%s/enforce", selinuxPath))
enforceS, err := readCon(selinuxEnforcePath())
if err != nil {
return -1
}
@ -246,6 +250,10 @@ func SelinuxGetEnforce() int {
return enforce
}
func SelinuxSetEnforce(mode int) error {
return writeCon(selinuxEnforcePath(), fmt.Sprintf("%d", mode))
}
func SelinuxGetEnforceMode() int {
switch readConfig(selinuxTag) {
case "enforcing":

View File

@ -3,6 +3,7 @@
package libcontainer
import (
"io"
"os"
"syscall"
@ -14,6 +15,7 @@ import (
)
type linuxStandardInit struct {
pipe io.ReadWriter
parentPid int
config *initConfig
}
@ -50,7 +52,6 @@ func (l *linuxStandardInit) Init() error {
if err := setOomScoreAdj(l.config.Config.OomScoreAdj); err != nil {
return err
}
label.Init()
// InitializeMountNamespace() can be executed only for a new mount namespace
if l.config.Config.Namespaces.Contains(configs.NEWNS) {
@ -75,7 +76,6 @@ func (l *linuxStandardInit) Init() error {
return err
}
}
for _, path := range l.config.Config.ReadonlyPaths {
if err := remountReadonly(path); err != nil {
return err
@ -90,6 +90,12 @@ func (l *linuxStandardInit) Init() error {
if err != nil {
return err
}
// Tell our parent that we're ready to Execv. This must be done before the
// Seccomp rules have been applied, because we need to be able to read and
// write to a socket.
if err := syncParentReady(l.pipe); err != nil {
return err
}
if l.config.Config.Seccomp != nil {
if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
return err

View File

@ -0,0 +1,223 @@
// +build linux
package libcontainer
import (
"fmt"
"os"
"path/filepath"
"github.com/Sirupsen/logrus"
"github.com/opencontainers/runc/libcontainer/configs"
)
func newStateTransitionError(from, to containerState) error {
return &stateTransitionError{
From: from.status().String(),
To: to.status().String(),
}
}
// stateTransitionError is returned when an invalid state transition happens from one
// state to another.
type stateTransitionError struct {
From string
To string
}
func (s *stateTransitionError) Error() string {
return fmt.Sprintf("invalid state transition from %s to %s", s.From, s.To)
}
type containerState interface {
transition(containerState) error
destroy() error
status() Status
}
func destroy(c *linuxContainer) error {
if !c.config.Namespaces.Contains(configs.NEWPID) {
if err := killCgroupProcesses(c.cgroupManager); err != nil {
logrus.Warn(err)
}
}
err := c.cgroupManager.Destroy()
if rerr := os.RemoveAll(c.root); err == nil {
err = rerr
}
c.initProcess = nil
if herr := runPoststopHooks(c); err == nil {
err = herr
}
c.state = &stoppedState{c: c}
return err
}
func runPoststopHooks(c *linuxContainer) error {
if c.config.Hooks != nil {
s := configs.HookState{
Version: c.config.Version,
ID: c.id,
Root: c.config.Rootfs,
}
for _, hook := range c.config.Hooks.Poststop {
if err := hook.Run(s); err != nil {
return err
}
}
}
return nil
}
// stoppedState represents a container is a stopped/destroyed state.
type stoppedState struct {
c *linuxContainer
}
func (b *stoppedState) status() Status {
return Destroyed
}
func (b *stoppedState) transition(s containerState) error {
switch s.(type) {
case *runningState:
b.c.state = s
return nil
case *restoredState:
b.c.state = s
return nil
case *stoppedState:
return nil
}
return newStateTransitionError(b, s)
}
func (b *stoppedState) destroy() error {
return destroy(b.c)
}
// runningState represents a container that is currently running.
type runningState struct {
c *linuxContainer
}
func (r *runningState) status() Status {
return Running
}
func (r *runningState) transition(s containerState) error {
switch s.(type) {
case *stoppedState:
running, err := r.c.isRunning()
if err != nil {
return err
}
if running {
return newGenericError(fmt.Errorf("container still running"), ContainerNotStopped)
}
r.c.state = s
return nil
case *pausedState:
r.c.state = s
return nil
case *runningState:
return nil
}
return newStateTransitionError(r, s)
}
func (r *runningState) destroy() error {
running, err := r.c.isRunning()
if err != nil {
return err
}
if running {
return newGenericError(fmt.Errorf("container is not destroyed"), ContainerNotStopped)
}
return destroy(r.c)
}
// pausedState represents a container that is currently pause. It cannot be destroyed in a
// paused state and must transition back to running first.
type pausedState struct {
c *linuxContainer
}
func (p *pausedState) status() Status {
return Paused
}
func (p *pausedState) transition(s containerState) error {
switch s.(type) {
case *runningState, *stoppedState:
p.c.state = s
return nil
case *pausedState:
return nil
}
return newStateTransitionError(p, s)
}
func (p *pausedState) destroy() error {
isRunning, err := p.c.isRunning()
if err != nil {
return err
}
if !isRunning {
if err := p.c.cgroupManager.Freeze(configs.Thawed); err != nil {
return err
}
return destroy(p.c)
}
return newGenericError(fmt.Errorf("container is paused"), ContainerPaused)
}
// restoredState is the same as the running state but also has accociated checkpoint
// information that maybe need destroyed when the container is stopped and destory is called.
type restoredState struct {
imageDir string
c *linuxContainer
}
func (r *restoredState) status() Status {
return Running
}
func (r *restoredState) transition(s containerState) error {
switch s.(type) {
case *stoppedState:
return nil
case *runningState:
return nil
}
return newStateTransitionError(r, s)
}
func (r *restoredState) destroy() error {
if _, err := os.Stat(filepath.Join(r.c.root, "checkpoint")); err != nil {
if !os.IsNotExist(err) {
return err
}
}
return destroy(r.c)
}
// createdState is used whenever a container is restored, loaded, or setting additional
// processes inside and it should not be destroyed when it is exiting.
type createdState struct {
c *linuxContainer
s Status
}
func (n *createdState) status() Status {
return n.s
}
func (n *createdState) transition(s containerState) error {
n.c.state = s
return nil
}
func (n *createdState) destroy() error {
return nil
}

View File

@ -3,6 +3,9 @@
package system
import (
"bufio"
"fmt"
"os"
"os/exec"
"syscall"
"unsafe"
@ -75,3 +78,37 @@ func Setctty() error {
}
return nil
}
/*
* Detect whether we are currently running in a user namespace.
* Copied from github.com/lxc/lxd/shared/util.go
*/
func RunningInUserNS() bool {
file, err := os.Open("/proc/self/uid_map")
if err != nil {
/*
* This kernel-provided file only exists if user namespaces are
* supported
*/
return false
}
defer file.Close()
buf := bufio.NewReader(file)
l, _, err := buf.ReadLine()
if err != nil {
return false
}
line := string(l)
var a, b, c int64
fmt.Sscanf(line, "%d %d %d", &a, &b, &c)
/*
* We assume we are in the initial user namespace if we have a full
* range - 4294967295 uids starting at uid 0.
*/
if a == 0 && b == 0 && c == 4294967295 {
return false
}
return true
}

View File

@ -3,6 +3,7 @@ package utils
import (
"crypto/rand"
"encoding/hex"
"encoding/json"
"io"
"path/filepath"
"syscall"
@ -36,10 +37,20 @@ func ResolveRootfs(uncleanRootfs string) (string, error) {
}
// ExitStatus returns the correct exit status for a process based on if it
// was signaled or exited cleanly.
// was signaled or exited cleanly
func ExitStatus(status syscall.WaitStatus) int {
if status.Signaled() {
return exitSignalOffset + int(status.Signal())
}
return status.ExitStatus()
}
// WriteJSON writes the provided struct v to w using standard json marshaling
func WriteJSON(w io.Writer, v interface{}) error {
data, err := json.Marshal(v)
if err != nil {
return err
}
_, err = w.Write(data)
return err
}

View File

@ -22,7 +22,7 @@ import (
type NodeInfo struct {
Cores int
Mem int64 // in bytes
Mem uint64 // in bytes
}
func nodeInfo(si *mesos.SlaveInfo, ei *mesos.ExecutorInfo) NodeInfo {
@ -57,13 +57,13 @@ func nodeInfo(si *mesos.SlaveInfo, ei *mesos.ExecutorInfo) NodeInfo {
// TODO(sttts): switch to float64 when "Machine Allocables" are implemented
ni.Cores += int(r.GetScalar().GetValue())
case "mem":
ni.Mem += int64(r.GetScalar().GetValue()) * 1024 * 1024
ni.Mem += uint64(r.GetScalar().GetValue()) * 1024 * 1024
}
}
// TODO(sttts): subtract executorCPU/Mem from static pod resources before subtracting them from the capacity
ni.Cores -= int(executorCPU)
ni.Mem -= int64(executorMem) * 1024 * 1024
ni.Mem -= uint64(executorMem) * 1024 * 1024
return ni
}

View File

@ -25,10 +25,10 @@ import (
type MesosCadvisor struct {
cadvisor.Interface
cores int
mem int64
mem uint64
}
func NewMesosCadvisor(cores int, mem int64, port uint) (*MesosCadvisor, error) {
func NewMesosCadvisor(cores int, mem uint64, port uint) (*MesosCadvisor, error) {
c, err := cadvisor.New(port)
if err != nil {
return nil, err

View File

@ -28,7 +28,7 @@ func CapacityFromMachineInfo(info *cadvisorApi.MachineInfo) api.ResourceList {
int64(info.NumCores*1000),
resource.DecimalSI),
api.ResourceMemory: *resource.NewQuantity(
info.MemoryCapacity,
int64(info.MemoryCapacity),
resource.BinarySI),
}
return c

View File

@ -126,8 +126,11 @@ func NewContainerManager(mountUtil mount.Interface, cadvisorInterface cadvisor.I
func createManager(containerName string) *fs.Manager {
return &fs.Manager{
Cgroups: &configs.Cgroup{
Name: containerName,
AllowAllDevices: true,
Parent: "/",
Name: containerName,
Resources: &configs.Resources{
AllowAllDevices: true,
},
},
}
}
@ -208,10 +211,13 @@ func (cm *containerManagerImpl) setupNode() error {
dockerContainer := &fs.Manager{
Cgroups: &configs.Cgroup{
Name: cm.DockerDaemonContainerName,
Memory: memoryLimit,
MemorySwap: -1,
AllowAllDevices: true,
Parent: "/",
Name: cm.DockerDaemonContainerName,
Resources: &configs.Resources{
Memory: memoryLimit,
MemorySwap: -1,
AllowAllDevices: true,
},
},
}
cont.ensureStateFunc = func(manager *fs.Manager) error {
@ -227,7 +233,8 @@ func (cm *containerManagerImpl) setupNode() error {
rootContainer := &fs.Manager{
Cgroups: &configs.Cgroup{
Name: "/",
Parent: "/",
Name: "/",
},
}
manager := createManager(cm.SystemContainerName)
@ -377,40 +384,23 @@ func ensureSystemContainer(rootContainer *fs.Manager, manager *fs.Manager) error
continue
}
// Get PIDs already in target group so we can remove them from the list of
// PIDs to move.
systemCgroupPIDs, err := manager.GetPids()
if err != nil {
errs = append(errs, fmt.Errorf("failed to list PIDs for %s: %v", manager.Cgroups.Name, err))
continue
}
systemCgroupPIDMap := make(map[int]struct{}, len(systemCgroupPIDs))
for _, pid := range systemCgroupPIDs {
systemCgroupPIDMap[pid] = struct{}{}
}
// Remove kernel pids and process 1
// Remove kernel pids and other protected PIDs (pid 1, PIDs already in system & kubelet containers)
pids := make([]int, 0, len(allPids))
for _, pid := range allPids {
if isKernelPid(pid) {
continue
}
if _, ok := systemCgroupPIDMap[pid]; ok {
continue
}
pids = append(pids, pid)
}
glog.Infof("Found %d PIDs in root, %d of them are kernel related", len(allPids), len(allPids)-len(pids))
glog.Infof("Found %d PIDs in root, %d of them are not to be moved", len(allPids), len(allPids)-len(pids))
// Check if we moved all the non-kernel PIDs.
// Check if we have moved all the non-kernel PIDs.
if len(pids) == 0 {
break
}
glog.Infof("Moving non-kernel threads: %v", pids)
glog.Infof("Moving non-kernel processes: %v", pids)
for _, pid := range pids {
err := manager.Apply(pid)
if err != nil {

View File

@ -1572,7 +1572,7 @@ func (dm *DockerManager) runContainerInPod(pod *api.Pod, container *api.Containe
if container.Name == PodInfraContainerName {
oomScoreAdj = qos.PodInfraOOMAdj
} else {
oomScoreAdj = qos.GetContainerOOMScoreAdjust(container, dm.machineInfo.MemoryCapacity)
oomScoreAdj = qos.GetContainerOOMScoreAdjust(container, int64(dm.machineInfo.MemoryCapacity))
}
cgroupName, err := dm.procFs.GetFullContainerName(containerInfo.State.Pid)
if err != nil {

View File

@ -41,7 +41,8 @@ func NewOOMAdjuster() *OOMAdjuster {
func getPids(cgroupName string) ([]int, error) {
fsManager := fs.Manager{
Cgroups: &configs.Cgroup{
Name: cgroupName,
Parent: "/",
Name: cgroupName,
},
}
return fsManager.GetPids()

View File

@ -33,8 +33,11 @@ import (
func RunInResourceContainer(containerName string) error {
manager := fs.Manager{
Cgroups: &configs.Cgroup{
Name: containerName,
AllowAllDevices: true,
Parent: "/",
Name: containerName,
Resources: &configs.Resources{
AllowAllDevices: true,
},
},
}