diff --git a/go.sum b/go.sum index 679382b57..460ed8554 100644 --- a/go.sum +++ b/go.sum @@ -887,6 +887,7 @@ github.com/sclevine/spec v1.2.0/go.mod h1:W4J29eT/Kzv7/b9IWLB055Z+qvVC9vt0Arko24 github.com/sean-/seed v0.0.0-20170313163322-e2103e2c3529/go.mod h1:DxrIzT+xaE7yg65j358z/aeFdxmN0P9QXhEzd20vsDc= github.com/seccomp/libseccomp-golang v0.9.1/go.mod h1:GbW5+tmTXfcxTToHLXlScSlAvWlF4P2Ca7zGrPiEpWo= github.com/seccomp/libseccomp-golang v0.9.2-0.20210429002308-3879420cc921/go.mod h1:JA8cRccbGaA1s33RQf7Y1+q9gHmZX1yB/z9WDN1C6fg= +github.com/seccomp/libseccomp-golang v0.9.2-0.20220502022130-f33da4d89646 h1:RpforrEYXWkmGwJHIGnLZ3tTWStkjVVstwzNGqxX2Ds= github.com/shurcooL/sanitized_anchor_name v1.0.0/go.mod h1:1NzhyTcUVG4SuEtjjoZeVRXNmyL/1OwPU0+IJeTBvfc= github.com/sirupsen/logrus v1.0.4-0.20170822132746-89742aefa4b2/go.mod h1:pMByvHTf9Beacp5x1UXfOR9xyW/9antXMhjMPG0dEzc= github.com/sirupsen/logrus v1.0.6/go.mod h1:pMByvHTf9Beacp5x1UXfOR9xyW/9antXMhjMPG0dEzc= diff --git a/pkg/cri/instrument/instrumented_service.go b/pkg/cri/instrument/instrumented_service.go index 4fd0de443..f8a77d420 100644 --- a/pkg/cri/instrument/instrumented_service.go +++ b/pkg/cri/instrument/instrumented_service.go @@ -621,3 +621,19 @@ func (in *instrumentedService) ListPodSandboxMetrics(ctx context.Context, r *run res, err = in.c.ListPodSandboxMetrics(ctx, r) return res, errdefs.ToGRPC(err) } + +func (in *instrumentedService) RuntimeConfig(ctx context.Context, r *runtime.RuntimeConfigRequest) (res *runtime.RuntimeConfigResponse, err error) { + if err := in.checkInitialized(); err != nil { + return nil, err + } + log.G(ctx).Tracef("RuntimeConfig") + defer func() { + if err != nil { + log.G(ctx).WithError(err).Error("RuntimeConfig failed") + } else { + log.G(ctx).Tracef("RuntimeConfig returns config %+v", res) + } + }() + res, err = in.c.RuntimeConfig(ctx, r) + return res, errdefs.ToGRPC(err) +} diff --git a/pkg/cri/sbserver/runtime_config.go b/pkg/cri/sbserver/runtime_config.go new file mode 100644 index 000000000..37769f10d --- /dev/null +++ b/pkg/cri/sbserver/runtime_config.go @@ -0,0 +1,31 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package sbserver + +import ( + "context" + + runtime "k8s.io/cri-api/pkg/apis/runtime/v1" +) + +// RuntimeConfig returns configuration information of the runtime. +func (c *criService) RuntimeConfig(ctx context.Context, r *runtime.RuntimeConfigRequest) (*runtime.RuntimeConfigResponse, error) { + resp := &runtime.RuntimeConfigResponse{ + Linux: c.getLinuxRuntimeConfig(ctx), + } + return resp, nil +} diff --git a/pkg/cri/sbserver/runtime_config_linux.go b/pkg/cri/sbserver/runtime_config_linux.go new file mode 100644 index 000000000..0ee5345b9 --- /dev/null +++ b/pkg/cri/sbserver/runtime_config_linux.go @@ -0,0 +1,81 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package sbserver + +import ( + "context" + "sort" + + "github.com/containerd/containerd/log" + runcoptions "github.com/containerd/containerd/runtime/v2/runc/options" + "github.com/opencontainers/runc/libcontainer/cgroups/systemd" + runtime "k8s.io/cri-api/pkg/apis/runtime/v1" +) + +func (c *criService) getLinuxRuntimeConfig(ctx context.Context) *runtime.LinuxRuntimeConfiguration { + return &runtime.LinuxRuntimeConfiguration{CgroupDriver: c.getCgroupDriver(ctx)} +} + +func (c *criService) getCgroupDriver(ctx context.Context) runtime.CgroupDriver { + // Go through the runtime handlers in a predictable order, starting from the + // default handler, others sorted in alphabetical order + handlerNames := make([]string, 0, len(c.config.ContainerdConfig.Runtimes)) + for n := range c.config.ContainerdConfig.Runtimes { + handlerNames = append(handlerNames, n) + } + sort.Slice(handlerNames, func(i, j int) bool { + if handlerNames[i] == c.config.ContainerdConfig.DefaultRuntimeName { + return true + } + if handlerNames[j] == c.config.ContainerdConfig.DefaultRuntimeName { + return false + } + return handlerNames[i] < handlerNames[j] + }) + + for _, handler := range handlerNames { + opts, err := generateRuntimeOptions(c.config.ContainerdConfig.Runtimes[handler]) + if err != nil { + log.G(ctx).Debugf("failed to parse runtime handler options for %q", handler) + continue + } + if d, ok := getCgroupDriverFromRuntimeHandlerOpts(opts); ok { + return d + } + log.G(ctx).Debugf("runtime handler %q does not provide cgroup driver information", handler) + } + + // If no runtime handlers have a setting, detect if systemd is running + d := runtime.CgroupDriver_CGROUPFS + if systemd.IsRunningSystemd() { + d = runtime.CgroupDriver_SYSTEMD + } + log.G(ctx).Debugf("no runtime handler provided cgroup driver setting, using auto-detected %s", runtime.CgroupDriver_name[int32(d)]) + return d +} + +func getCgroupDriverFromRuntimeHandlerOpts(opts interface{}) (runtime.CgroupDriver, bool) { + switch v := opts.(type) { + case *runcoptions.Options: + systemdCgroup := v.SystemdCgroup + if systemdCgroup { + return runtime.CgroupDriver_SYSTEMD, true + } + return runtime.CgroupDriver_CGROUPFS, true + } + return runtime.CgroupDriver_SYSTEMD, false +} diff --git a/pkg/cri/sbserver/runtime_config_linux_test.go b/pkg/cri/sbserver/runtime_config_linux_test.go new file mode 100644 index 000000000..a7a201172 --- /dev/null +++ b/pkg/cri/sbserver/runtime_config_linux_test.go @@ -0,0 +1,105 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package sbserver + +import ( + "context" + "testing" + + criconfig "github.com/containerd/containerd/pkg/cri/config" + "github.com/containerd/containerd/plugin" + "github.com/opencontainers/runc/libcontainer/cgroups/systemd" + "github.com/stretchr/testify/assert" + runtime "k8s.io/cri-api/pkg/apis/runtime/v1" +) + +func newFakeRuntimeConfig(runcV2, systemdCgroup bool) criconfig.Runtime { + r := criconfig.Runtime{Type: "default", Options: map[string]interface{}{}} + if runcV2 { + r.Type = plugin.RuntimeRuncV2 + if systemdCgroup { + r.Options["SystemdCgroup"] = true + } + } + return r +} + +func TestRuntimeConfig(t *testing.T) { + autoDetected := runtime.CgroupDriver_CGROUPFS + if systemd.IsRunningSystemd() { + autoDetected = runtime.CgroupDriver_SYSTEMD + } + + for _, test := range []struct { + desc string + defaultRuntime string + runtimes map[string]criconfig.Runtime + expectedCgroupDriver runtime.CgroupDriver + }{ + { + desc: "no runtimes", + expectedCgroupDriver: autoDetected, + }, + { + desc: "non-runc runtime", + defaultRuntime: "non-runc", + runtimes: map[string]criconfig.Runtime{"non-runc": newFakeRuntimeConfig(false, false)}, + expectedCgroupDriver: autoDetected, + }, + { + desc: "no default, pick first in alphabetical order", + runtimes: map[string]criconfig.Runtime{ + "non-runc": newFakeRuntimeConfig(false, false), + "runc-2": newFakeRuntimeConfig(true, true), + "runc": newFakeRuntimeConfig(true, false), + "non-runc-2": newFakeRuntimeConfig(false, false), + }, + expectedCgroupDriver: runtime.CgroupDriver_CGROUPFS, + }, + { + desc: "pick default, cgroupfs", + defaultRuntime: "runc-2", + runtimes: map[string]criconfig.Runtime{ + "non-runc": newFakeRuntimeConfig(false, false), + "runc": newFakeRuntimeConfig(true, true), + "runc-2": newFakeRuntimeConfig(true, false), + }, + expectedCgroupDriver: runtime.CgroupDriver_CGROUPFS, + }, + { + desc: "pick default, systemd", + defaultRuntime: "runc-2", + runtimes: map[string]criconfig.Runtime{ + "non-runc": newFakeRuntimeConfig(false, false), + "runc": newFakeRuntimeConfig(true, false), + "runc-2": newFakeRuntimeConfig(true, true), + }, + expectedCgroupDriver: runtime.CgroupDriver_SYSTEMD, + }, + } { + test := test + t.Run(test.desc, func(t *testing.T) { + c := newTestCRIService() + c.config.PluginConfig.ContainerdConfig.DefaultRuntimeName = test.defaultRuntime + c.config.PluginConfig.ContainerdConfig.Runtimes = test.runtimes + + resp, err := c.RuntimeConfig(context.TODO(), &runtime.RuntimeConfigRequest{}) + assert.NoError(t, err) + assert.Equal(t, test.expectedCgroupDriver, resp.Linux.CgroupDriver, "got unexpected cgroup driver") + }) + } +} diff --git a/pkg/cri/sbserver/runtime_config_other.go b/pkg/cri/sbserver/runtime_config_other.go new file mode 100644 index 000000000..7559472ff --- /dev/null +++ b/pkg/cri/sbserver/runtime_config_other.go @@ -0,0 +1,29 @@ +//go:build !linux + +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package sbserver + +import ( + "context" + + runtime "k8s.io/cri-api/pkg/apis/runtime/v1" +) + +func (c *criService) getLinuxRuntimeConfig(ctx context.Context) *runtime.LinuxRuntimeConfiguration { + return nil +} diff --git a/pkg/cri/server/runtime_config.go b/pkg/cri/server/runtime_config.go new file mode 100644 index 000000000..149d433ef --- /dev/null +++ b/pkg/cri/server/runtime_config.go @@ -0,0 +1,31 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package server + +import ( + "context" + + runtime "k8s.io/cri-api/pkg/apis/runtime/v1" +) + +// RuntimeConfig returns configuration information of the runtime. +func (c *criService) RuntimeConfig(ctx context.Context, r *runtime.RuntimeConfigRequest) (*runtime.RuntimeConfigResponse, error) { + resp := &runtime.RuntimeConfigResponse{ + Linux: c.getLinuxRuntimeConfig(ctx), + } + return resp, nil +} diff --git a/pkg/cri/server/runtime_config_linux.go b/pkg/cri/server/runtime_config_linux.go new file mode 100644 index 000000000..f83ca3ba7 --- /dev/null +++ b/pkg/cri/server/runtime_config_linux.go @@ -0,0 +1,81 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package server + +import ( + "context" + "sort" + + "github.com/containerd/containerd/log" + runcoptions "github.com/containerd/containerd/runtime/v2/runc/options" + "github.com/opencontainers/runc/libcontainer/cgroups/systemd" + runtime "k8s.io/cri-api/pkg/apis/runtime/v1" +) + +func (c *criService) getLinuxRuntimeConfig(ctx context.Context) *runtime.LinuxRuntimeConfiguration { + return &runtime.LinuxRuntimeConfiguration{CgroupDriver: c.getCgroupDriver(ctx)} +} + +func (c *criService) getCgroupDriver(ctx context.Context) runtime.CgroupDriver { + // Go through the runtime handlers in a predictable order, starting from the + // default handler, others sorted in alphabetical order + handlerNames := make([]string, 0, len(c.config.ContainerdConfig.Runtimes)) + for n := range c.config.ContainerdConfig.Runtimes { + handlerNames = append(handlerNames, n) + } + sort.Slice(handlerNames, func(i, j int) bool { + if handlerNames[i] == c.config.ContainerdConfig.DefaultRuntimeName { + return true + } + if handlerNames[j] == c.config.ContainerdConfig.DefaultRuntimeName { + return false + } + return handlerNames[i] < handlerNames[j] + }) + + for _, handler := range handlerNames { + opts, err := generateRuntimeOptions(c.config.ContainerdConfig.Runtimes[handler]) + if err != nil { + log.G(ctx).Debugf("failed to parse runtime handler options for %q", handler) + continue + } + if d, ok := getCgroupDriverFromRuntimeHandlerOpts(opts); ok { + return d + } + log.G(ctx).Debugf("runtime handler %q does not provide cgroup driver information", handler) + } + + // If no runtime handlers have a setting, detect if systemd is running + d := runtime.CgroupDriver_CGROUPFS + if systemd.IsRunningSystemd() { + d = runtime.CgroupDriver_SYSTEMD + } + log.G(ctx).Debugf("no runtime handler provided cgroup driver setting, using auto-detected %s", runtime.CgroupDriver_name[int32(d)]) + return d +} + +func getCgroupDriverFromRuntimeHandlerOpts(opts interface{}) (runtime.CgroupDriver, bool) { + switch v := opts.(type) { + case *runcoptions.Options: + systemdCgroup := v.SystemdCgroup + if systemdCgroup { + return runtime.CgroupDriver_SYSTEMD, true + } + return runtime.CgroupDriver_CGROUPFS, true + } + return runtime.CgroupDriver_SYSTEMD, false +} diff --git a/pkg/cri/server/runtime_config_linux_test.go b/pkg/cri/server/runtime_config_linux_test.go new file mode 100644 index 000000000..143e86744 --- /dev/null +++ b/pkg/cri/server/runtime_config_linux_test.go @@ -0,0 +1,105 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package server + +import ( + "context" + "testing" + + criconfig "github.com/containerd/containerd/pkg/cri/config" + "github.com/containerd/containerd/plugin" + "github.com/opencontainers/runc/libcontainer/cgroups/systemd" + "github.com/stretchr/testify/assert" + runtime "k8s.io/cri-api/pkg/apis/runtime/v1" +) + +func newFakeRuntimeConfig(runcV2, systemdCgroup bool) criconfig.Runtime { + r := criconfig.Runtime{Type: "default", Options: map[string]interface{}{}} + if runcV2 { + r.Type = plugin.RuntimeRuncV2 + if systemdCgroup { + r.Options["SystemdCgroup"] = true + } + } + return r +} + +func TestRuntimeConfig(t *testing.T) { + autoDetected := runtime.CgroupDriver_CGROUPFS + if systemd.IsRunningSystemd() { + autoDetected = runtime.CgroupDriver_SYSTEMD + } + + for _, test := range []struct { + desc string + defaultRuntime string + runtimes map[string]criconfig.Runtime + expectedCgroupDriver runtime.CgroupDriver + }{ + { + desc: "no runtimes", + expectedCgroupDriver: autoDetected, + }, + { + desc: "non-runc runtime", + defaultRuntime: "non-runc", + runtimes: map[string]criconfig.Runtime{"non-runc": newFakeRuntimeConfig(false, false)}, + expectedCgroupDriver: autoDetected, + }, + { + desc: "no default, pick first in alphabetical order", + runtimes: map[string]criconfig.Runtime{ + "non-runc": newFakeRuntimeConfig(false, false), + "runc-2": newFakeRuntimeConfig(true, true), + "runc": newFakeRuntimeConfig(true, false), + "non-runc-2": newFakeRuntimeConfig(false, false), + }, + expectedCgroupDriver: runtime.CgroupDriver_CGROUPFS, + }, + { + desc: "pick default, cgroupfs", + defaultRuntime: "runc-2", + runtimes: map[string]criconfig.Runtime{ + "non-runc": newFakeRuntimeConfig(false, false), + "runc": newFakeRuntimeConfig(true, true), + "runc-2": newFakeRuntimeConfig(true, false), + }, + expectedCgroupDriver: runtime.CgroupDriver_CGROUPFS, + }, + { + desc: "pick default, systemd", + defaultRuntime: "runc-2", + runtimes: map[string]criconfig.Runtime{ + "non-runc": newFakeRuntimeConfig(false, false), + "runc": newFakeRuntimeConfig(true, false), + "runc-2": newFakeRuntimeConfig(true, true), + }, + expectedCgroupDriver: runtime.CgroupDriver_SYSTEMD, + }, + } { + test := test + t.Run(test.desc, func(t *testing.T) { + c := newTestCRIService() + c.config.PluginConfig.ContainerdConfig.DefaultRuntimeName = test.defaultRuntime + c.config.PluginConfig.ContainerdConfig.Runtimes = test.runtimes + + resp, err := c.RuntimeConfig(context.TODO(), &runtime.RuntimeConfigRequest{}) + assert.NoError(t, err) + assert.Equal(t, test.expectedCgroupDriver, resp.Linux.CgroupDriver, "got unexpected cgroup driver") + }) + } +} diff --git a/pkg/cri/server/runtime_config_other.go b/pkg/cri/server/runtime_config_other.go new file mode 100644 index 000000000..1039b2562 --- /dev/null +++ b/pkg/cri/server/runtime_config_other.go @@ -0,0 +1,29 @@ +//go:build !linux + +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package server + +import ( + "context" + + runtime "k8s.io/cri-api/pkg/apis/runtime/v1" +) + +func (c *criService) getLinuxRuntimeConfig(ctx context.Context) *runtime.LinuxRuntimeConfiguration { + return nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/cgroups.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/cgroups.go new file mode 100644 index 000000000..ba2b2266c --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/cgroups.go @@ -0,0 +1,59 @@ +package cgroups + +import ( + "github.com/opencontainers/runc/libcontainer/configs" +) + +type Manager interface { + // Apply creates a cgroup, if not yet created, and adds a process + // with the specified pid into that cgroup. A special value of -1 + // can be used to merely create a cgroup. + Apply(pid int) error + + // GetPids returns the PIDs of all processes inside the cgroup. + GetPids() ([]int, error) + + // GetAllPids returns the PIDs of all processes inside the cgroup + // any all its sub-cgroups. + GetAllPids() ([]int, error) + + // GetStats returns cgroups statistics. + GetStats() (*Stats, error) + + // Freeze sets the freezer cgroup to the specified state. + Freeze(state configs.FreezerState) error + + // Destroy removes cgroup. + Destroy() error + + // Path returns a cgroup path to the specified controller/subsystem. + // For cgroupv2, the argument is unused and can be empty. + Path(string) string + + // Set sets cgroup resources parameters/limits. If the argument is nil, + // the resources specified during Manager creation (or the previous call + // to Set) are used. + Set(r *configs.Resources) error + + // GetPaths returns cgroup path(s) to save in a state file in order to + // restore later. + // + // For cgroup v1, a key is cgroup subsystem name, and the value is the + // path to the cgroup for this subsystem. + // + // For cgroup v2 unified hierarchy, a key is "", and the value is the + // unified path. + GetPaths() map[string]string + + // GetCgroups returns the cgroup data as configured. + GetCgroups() (*configs.Cgroup, error) + + // GetFreezerState retrieves the current FreezerState of the cgroup. + GetFreezerState() (configs.FreezerState, error) + + // Exists returns whether the cgroup path exists or not. + Exists() bool + + // OOMKillCount reports OOM kill count for the cgroup. + OOMKillCount() (uint64, error) +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/devices/devices_emulator.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/devices/devices_emulator.go new file mode 100644 index 000000000..6c61ee4c0 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/devices/devices_emulator.go @@ -0,0 +1,386 @@ +// SPDX-License-Identifier: Apache-2.0 +/* + * Copyright (C) 2020 Aleksa Sarai + * Copyright (C) 2020 SUSE LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package devices + +import ( + "bufio" + "fmt" + "io" + "sort" + "strconv" + "strings" + + "github.com/opencontainers/runc/libcontainer/devices" +) + +// deviceMeta is a Rule without the Allow or Permissions fields, and no +// wildcard-type support. It's effectively the "match" portion of a metadata +// rule, for the purposes of our emulation. +type deviceMeta struct { + node devices.Type + major int64 + minor int64 +} + +// deviceRule is effectively the tuple (deviceMeta, Permissions). +type deviceRule struct { + meta deviceMeta + perms devices.Permissions +} + +// deviceRules is a mapping of device metadata rules to the associated +// permissions in the ruleset. +type deviceRules map[deviceMeta]devices.Permissions + +func (r deviceRules) orderedEntries() []deviceRule { + var rules []deviceRule + for meta, perms := range r { + rules = append(rules, deviceRule{meta: meta, perms: perms}) + } + sort.Slice(rules, func(i, j int) bool { + // Sort by (major, minor, type). + a, b := rules[i].meta, rules[j].meta + return a.major < b.major || + (a.major == b.major && a.minor < b.minor) || + (a.major == b.major && a.minor == b.minor && a.node < b.node) + }) + return rules +} + +type Emulator struct { + defaultAllow bool + rules deviceRules +} + +func (e *Emulator) IsBlacklist() bool { + return e.defaultAllow +} + +func (e *Emulator) IsAllowAll() bool { + return e.IsBlacklist() && len(e.rules) == 0 +} + +func parseLine(line string) (*deviceRule, error) { + // Input: node major:minor perms. + fields := strings.FieldsFunc(line, func(r rune) bool { + return r == ' ' || r == ':' + }) + if len(fields) != 4 { + return nil, fmt.Errorf("malformed devices.list rule %s", line) + } + + var ( + rule deviceRule + node = fields[0] + major = fields[1] + minor = fields[2] + perms = fields[3] + ) + + // Parse the node type. + switch node { + case "a": + // Super-special case -- "a" always means every device with every + // access mode. In fact, for devices.list this actually indicates that + // the cgroup is in black-list mode. + // TODO: Double-check that the entire file is "a *:* rwm". + return nil, nil + case "b": + rule.meta.node = devices.BlockDevice + case "c": + rule.meta.node = devices.CharDevice + default: + return nil, fmt.Errorf("unknown device type %q", node) + } + + // Parse the major number. + if major == "*" { + rule.meta.major = devices.Wildcard + } else { + val, err := strconv.ParseUint(major, 10, 32) + if err != nil { + return nil, fmt.Errorf("invalid major number: %w", err) + } + rule.meta.major = int64(val) + } + + // Parse the minor number. + if minor == "*" { + rule.meta.minor = devices.Wildcard + } else { + val, err := strconv.ParseUint(minor, 10, 32) + if err != nil { + return nil, fmt.Errorf("invalid minor number: %w", err) + } + rule.meta.minor = int64(val) + } + + // Parse the access permissions. + rule.perms = devices.Permissions(perms) + if !rule.perms.IsValid() || rule.perms.IsEmpty() { + return nil, fmt.Errorf("parse access mode: contained unknown modes or is empty: %q", perms) + } + return &rule, nil +} + +func (e *Emulator) addRule(rule deviceRule) error { //nolint:unparam + if e.rules == nil { + e.rules = make(map[deviceMeta]devices.Permissions) + } + + // Merge with any pre-existing permissions. + oldPerms := e.rules[rule.meta] + newPerms := rule.perms.Union(oldPerms) + e.rules[rule.meta] = newPerms + return nil +} + +func (e *Emulator) rmRule(rule deviceRule) error { + // Give an error if any of the permissions requested to be removed are + // present in a partially-matching wildcard rule, because such rules will + // be ignored by cgroupv1. + // + // This is a diversion from cgroupv1, but is necessary to avoid leading + // users into a false sense of security. cgroupv1 will silently(!) ignore + // requests to remove partial exceptions, but we really shouldn't do that. + // + // It may seem like we could just "split" wildcard rules which hit this + // issue, but unfortunately there are 2^32 possible major and minor + // numbers, which would exhaust kernel memory quickly if we did this. Not + // to mention it'd be really slow (the kernel side is implemented as a + // linked-list of exceptions). + for _, partialMeta := range []deviceMeta{ + {node: rule.meta.node, major: devices.Wildcard, minor: rule.meta.minor}, + {node: rule.meta.node, major: rule.meta.major, minor: devices.Wildcard}, + {node: rule.meta.node, major: devices.Wildcard, minor: devices.Wildcard}, + } { + // This wildcard rule is equivalent to the requested rule, so skip it. + if rule.meta == partialMeta { + continue + } + // Only give an error if the set of permissions overlap. + partialPerms := e.rules[partialMeta] + if !partialPerms.Intersection(rule.perms).IsEmpty() { + return fmt.Errorf("requested rule [%v %v] not supported by devices cgroupv1 (cannot punch hole in existing wildcard rule [%v %v])", rule.meta, rule.perms, partialMeta, partialPerms) + } + } + + // Subtract all of the permissions listed from the full match rule. If the + // rule didn't exist, all of this is a no-op. + newPerms := e.rules[rule.meta].Difference(rule.perms) + if newPerms.IsEmpty() { + delete(e.rules, rule.meta) + } else { + e.rules[rule.meta] = newPerms + } + // TODO: The actual cgroup code doesn't care if an exception didn't exist + // during removal, so not erroring out here is /accurate/ but quite + // worrying. Maybe we should do additional validation, but again we + // have to worry about backwards-compatibility. + return nil +} + +func (e *Emulator) allow(rule *deviceRule) error { + // This cgroup is configured as a black-list. Reset the entire emulator, + // and put is into black-list mode. + if rule == nil || rule.meta.node == devices.WildcardDevice { + *e = Emulator{ + defaultAllow: true, + rules: nil, + } + return nil + } + + var err error + if e.defaultAllow { + err = wrapErr(e.rmRule(*rule), "unable to remove 'deny' exception") + } else { + err = wrapErr(e.addRule(*rule), "unable to add 'allow' exception") + } + return err +} + +func (e *Emulator) deny(rule *deviceRule) error { + // This cgroup is configured as a white-list. Reset the entire emulator, + // and put is into white-list mode. + if rule == nil || rule.meta.node == devices.WildcardDevice { + *e = Emulator{ + defaultAllow: false, + rules: nil, + } + return nil + } + + var err error + if e.defaultAllow { + err = wrapErr(e.addRule(*rule), "unable to add 'deny' exception") + } else { + err = wrapErr(e.rmRule(*rule), "unable to remove 'allow' exception") + } + return err +} + +func (e *Emulator) Apply(rule devices.Rule) error { + if !rule.Type.CanCgroup() { + return fmt.Errorf("cannot add rule [%#v] with non-cgroup type %q", rule, rule.Type) + } + + innerRule := &deviceRule{ + meta: deviceMeta{ + node: rule.Type, + major: rule.Major, + minor: rule.Minor, + }, + perms: rule.Permissions, + } + if innerRule.meta.node == devices.WildcardDevice { + innerRule = nil + } + + if rule.Allow { + return e.allow(innerRule) + } + + return e.deny(innerRule) +} + +// EmulatorFromList takes a reader to a "devices.list"-like source, and returns +// a new Emulator that represents the state of the devices cgroup. Note that +// black-list devices cgroups cannot be fully reconstructed, due to limitations +// in the devices cgroup API. Instead, such cgroups are always treated as +// "allow all" cgroups. +func EmulatorFromList(list io.Reader) (*Emulator, error) { + // Normally cgroups are in black-list mode by default, but the way we + // figure out the current mode is whether or not devices.list has an + // allow-all rule. So we default to a white-list, and the existence of an + // "a *:* rwm" entry will tell us otherwise. + e := &Emulator{ + defaultAllow: false, + } + + // Parse the "devices.list". + s := bufio.NewScanner(list) + for s.Scan() { + line := s.Text() + deviceRule, err := parseLine(line) + if err != nil { + return nil, fmt.Errorf("error parsing line %q: %w", line, err) + } + // "devices.list" is an allow list. Note that this means that in + // black-list mode, we have no idea what rules are in play. As a + // result, we need to be very careful in Transition(). + if err := e.allow(deviceRule); err != nil { + return nil, fmt.Errorf("error adding devices.list rule: %w", err) + } + } + if err := s.Err(); err != nil { + return nil, fmt.Errorf("error reading devices.list lines: %w", err) + } + return e, nil +} + +// Transition calculates what is the minimally-disruptive set of rules need to +// be applied to a devices cgroup in order to transition to the given target. +// This means that any already-existing rules will not be applied, and +// disruptive rules (like denying all device access) will only be applied if +// necessary. +// +// This function is the sole reason for all of Emulator -- to allow us +// to figure out how to update a containers' cgroups without causing spurious +// device errors (if possible). +func (source *Emulator) Transition(target *Emulator) ([]*devices.Rule, error) { + var transitionRules []*devices.Rule + oldRules := source.rules + + // If the default policy doesn't match, we need to include a "disruptive" + // rule (either allow-all or deny-all) in order to switch the cgroup to the + // correct default policy. + // + // However, due to a limitation in "devices.list" we cannot be sure what + // deny rules are in place in a black-list cgroup. Thus if the source is a + // black-list we also have to include a disruptive rule. + if source.IsBlacklist() || source.defaultAllow != target.defaultAllow { + transitionRules = append(transitionRules, &devices.Rule{ + Type: 'a', + Major: -1, + Minor: -1, + Permissions: devices.Permissions("rwm"), + Allow: target.defaultAllow, + }) + // The old rules are only relevant if we aren't starting out with a + // disruptive rule. + oldRules = nil + } + + // NOTE: We traverse through the rules in a sorted order so we always write + // the same set of rules (this is to aid testing). + + // First, we create inverse rules for any old rules not in the new set. + // This includes partial-inverse rules for specific permissions. This is a + // no-op if we added a disruptive rule, since oldRules will be empty. + for _, rule := range oldRules.orderedEntries() { + meta, oldPerms := rule.meta, rule.perms + newPerms := target.rules[meta] + droppedPerms := oldPerms.Difference(newPerms) + if !droppedPerms.IsEmpty() { + transitionRules = append(transitionRules, &devices.Rule{ + Type: meta.node, + Major: meta.major, + Minor: meta.minor, + Permissions: droppedPerms, + Allow: target.defaultAllow, + }) + } + } + + // Add any additional rules which weren't in the old set. We happen to + // filter out rules which are present in both sets, though this isn't + // strictly necessary. + for _, rule := range target.rules.orderedEntries() { + meta, newPerms := rule.meta, rule.perms + oldPerms := oldRules[meta] + gainedPerms := newPerms.Difference(oldPerms) + if !gainedPerms.IsEmpty() { + transitionRules = append(transitionRules, &devices.Rule{ + Type: meta.node, + Major: meta.major, + Minor: meta.minor, + Permissions: gainedPerms, + Allow: !target.defaultAllow, + }) + } + } + return transitionRules, nil +} + +// Rules returns the minimum set of rules necessary to convert a *deny-all* +// cgroup to the emulated filter state (note that this is not the same as a +// default cgroupv1 cgroup -- which is allow-all). This is effectively just a +// wrapper around Transition() with the source emulator being an empty cgroup. +func (e *Emulator) Rules() ([]*devices.Rule, error) { + defaultCgroup := &Emulator{defaultAllow: false} + return defaultCgroup.Transition(e) +} + +func wrapErr(err error, text string) error { + if err == nil { + return nil + } + return fmt.Errorf(text+": %w", err) +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/ebpf/devicefilter/devicefilter.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/ebpf/devicefilter/devicefilter.go new file mode 100644 index 000000000..4e69b35bc --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/ebpf/devicefilter/devicefilter.go @@ -0,0 +1,208 @@ +// Package devicefilter contains eBPF device filter program +// +// The implementation is based on https://github.com/containers/crun/blob/0.10.2/src/libcrun/ebpf.c +// +// Although ebpf.c is originally licensed under LGPL-3.0-or-later, the author (Giuseppe Scrivano) +// agreed to relicense the file in Apache License 2.0: https://github.com/opencontainers/runc/issues/2144#issuecomment-543116397 +package devicefilter + +import ( + "errors" + "fmt" + "math" + "strconv" + + "github.com/cilium/ebpf/asm" + devicesemulator "github.com/opencontainers/runc/libcontainer/cgroups/devices" + "github.com/opencontainers/runc/libcontainer/devices" + "golang.org/x/sys/unix" +) + +const ( + // license string format is same as kernel MODULE_LICENSE macro + license = "Apache" +) + +// DeviceFilter returns eBPF device filter program and its license string +func DeviceFilter(rules []*devices.Rule) (asm.Instructions, string, error) { + // Generate the minimum ruleset for the device rules we are given. While we + // don't care about minimum transitions in cgroupv2, using the emulator + // gives us a guarantee that the behaviour of devices filtering is the same + // as cgroupv1, including security hardenings to avoid misconfiguration + // (such as punching holes in wildcard rules). + emu := new(devicesemulator.Emulator) + for _, rule := range rules { + if err := emu.Apply(*rule); err != nil { + return nil, "", err + } + } + cleanRules, err := emu.Rules() + if err != nil { + return nil, "", err + } + + p := &program{ + defaultAllow: emu.IsBlacklist(), + } + p.init() + + for idx, rule := range cleanRules { + if rule.Type == devices.WildcardDevice { + // We can safely skip over wildcard entries because there should + // only be one (at most) at the very start to instruct cgroupv1 to + // go into allow-list mode. However we do double-check this here. + if idx != 0 || rule.Allow != emu.IsBlacklist() { + return nil, "", fmt.Errorf("[internal error] emulated cgroupv2 devices ruleset had bad wildcard at idx %v (%s)", idx, rule.CgroupString()) + } + continue + } + if rule.Allow == p.defaultAllow { + // There should be no rules which have an action equal to the + // default action, the emulator removes those. + return nil, "", fmt.Errorf("[internal error] emulated cgroupv2 devices ruleset had no-op rule at idx %v (%s)", idx, rule.CgroupString()) + } + if err := p.appendRule(rule); err != nil { + return nil, "", err + } + } + return p.finalize(), license, nil +} + +type program struct { + insts asm.Instructions + defaultAllow bool + blockID int +} + +func (p *program) init() { + // struct bpf_cgroup_dev_ctx: https://elixir.bootlin.com/linux/v5.3.6/source/include/uapi/linux/bpf.h#L3423 + /* + u32 access_type + u32 major + u32 minor + */ + // R2 <- type (lower 16 bit of u32 access_type at R1[0]) + p.insts = append(p.insts, + asm.LoadMem(asm.R2, asm.R1, 0, asm.Word), + asm.And.Imm32(asm.R2, 0xFFFF)) + + // R3 <- access (upper 16 bit of u32 access_type at R1[0]) + p.insts = append(p.insts, + asm.LoadMem(asm.R3, asm.R1, 0, asm.Word), + // RSh: bitwise shift right + asm.RSh.Imm32(asm.R3, 16)) + + // R4 <- major (u32 major at R1[4]) + p.insts = append(p.insts, + asm.LoadMem(asm.R4, asm.R1, 4, asm.Word)) + + // R5 <- minor (u32 minor at R1[8]) + p.insts = append(p.insts, + asm.LoadMem(asm.R5, asm.R1, 8, asm.Word)) +} + +// appendRule rule converts an OCI rule to the relevant eBPF block and adds it +// to the in-progress filter program. In order to operate properly, it must be +// called with a "clean" rule list (generated by devices.Emulator.Rules() -- +// with any "a" rules removed). +func (p *program) appendRule(rule *devices.Rule) error { + if p.blockID < 0 { + return errors.New("the program is finalized") + } + + var bpfType int32 + switch rule.Type { + case devices.CharDevice: + bpfType = int32(unix.BPF_DEVCG_DEV_CHAR) + case devices.BlockDevice: + bpfType = int32(unix.BPF_DEVCG_DEV_BLOCK) + default: + // We do not permit 'a', nor any other types we don't know about. + return fmt.Errorf("invalid type %q", string(rule.Type)) + } + if rule.Major > math.MaxUint32 { + return fmt.Errorf("invalid major %d", rule.Major) + } + if rule.Minor > math.MaxUint32 { + return fmt.Errorf("invalid minor %d", rule.Major) + } + hasMajor := rule.Major >= 0 // if not specified in OCI json, major is set to -1 + hasMinor := rule.Minor >= 0 + bpfAccess := int32(0) + for _, r := range rule.Permissions { + switch r { + case 'r': + bpfAccess |= unix.BPF_DEVCG_ACC_READ + case 'w': + bpfAccess |= unix.BPF_DEVCG_ACC_WRITE + case 'm': + bpfAccess |= unix.BPF_DEVCG_ACC_MKNOD + default: + return fmt.Errorf("unknown device access %v", r) + } + } + // If the access is rwm, skip the check. + hasAccess := bpfAccess != (unix.BPF_DEVCG_ACC_READ | unix.BPF_DEVCG_ACC_WRITE | unix.BPF_DEVCG_ACC_MKNOD) + + var ( + blockSym = "block-" + strconv.Itoa(p.blockID) + nextBlockSym = "block-" + strconv.Itoa(p.blockID+1) + prevBlockLastIdx = len(p.insts) - 1 + ) + p.insts = append(p.insts, + // if (R2 != bpfType) goto next + asm.JNE.Imm(asm.R2, bpfType, nextBlockSym), + ) + if hasAccess { + p.insts = append(p.insts, + // if (R3 & bpfAccess != R3 /* use R1 as a temp var */) goto next + asm.Mov.Reg32(asm.R1, asm.R3), + asm.And.Imm32(asm.R1, bpfAccess), + asm.JNE.Reg(asm.R1, asm.R3, nextBlockSym), + ) + } + if hasMajor { + p.insts = append(p.insts, + // if (R4 != major) goto next + asm.JNE.Imm(asm.R4, int32(rule.Major), nextBlockSym), + ) + } + if hasMinor { + p.insts = append(p.insts, + // if (R5 != minor) goto next + asm.JNE.Imm(asm.R5, int32(rule.Minor), nextBlockSym), + ) + } + p.insts = append(p.insts, acceptBlock(rule.Allow)...) + // set blockSym to the first instruction we added in this iteration + p.insts[prevBlockLastIdx+1] = p.insts[prevBlockLastIdx+1].Sym(blockSym) + p.blockID++ + return nil +} + +func (p *program) finalize() asm.Instructions { + var v int32 + if p.defaultAllow { + v = 1 + } + blockSym := "block-" + strconv.Itoa(p.blockID) + p.insts = append(p.insts, + // R0 <- v + asm.Mov.Imm32(asm.R0, v).Sym(blockSym), + asm.Return(), + ) + p.blockID = -1 + return p.insts +} + +func acceptBlock(accept bool) asm.Instructions { + var v int32 + if accept { + v = 1 + } + return []asm.Instruction{ + // R0 <- v + asm.Mov.Imm32(asm.R0, v), + asm.Return(), + } +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/ebpf/ebpf_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/ebpf/ebpf_linux.go new file mode 100644 index 000000000..35b00aaf0 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/ebpf/ebpf_linux.go @@ -0,0 +1,253 @@ +package ebpf + +import ( + "errors" + "fmt" + "os" + "runtime" + "sync" + "unsafe" + + "github.com/cilium/ebpf" + "github.com/cilium/ebpf/asm" + "github.com/cilium/ebpf/link" + "github.com/sirupsen/logrus" + "golang.org/x/sys/unix" +) + +func nilCloser() error { + return nil +} + +func findAttachedCgroupDeviceFilters(dirFd int) ([]*ebpf.Program, error) { + type bpfAttrQuery struct { + TargetFd uint32 + AttachType uint32 + QueryType uint32 + AttachFlags uint32 + ProgIds uint64 // __aligned_u64 + ProgCnt uint32 + } + + // Currently you can only have 64 eBPF programs attached to a cgroup. + size := 64 + retries := 0 + for retries < 10 { + progIds := make([]uint32, size) + query := bpfAttrQuery{ + TargetFd: uint32(dirFd), + AttachType: uint32(unix.BPF_CGROUP_DEVICE), + ProgIds: uint64(uintptr(unsafe.Pointer(&progIds[0]))), + ProgCnt: uint32(len(progIds)), + } + + // Fetch the list of program ids. + _, _, errno := unix.Syscall(unix.SYS_BPF, + uintptr(unix.BPF_PROG_QUERY), + uintptr(unsafe.Pointer(&query)), + unsafe.Sizeof(query)) + size = int(query.ProgCnt) + runtime.KeepAlive(query) + if errno != 0 { + // On ENOSPC we get the correct number of programs. + if errno == unix.ENOSPC { + retries++ + continue + } + return nil, fmt.Errorf("bpf_prog_query(BPF_CGROUP_DEVICE) failed: %w", errno) + } + + // Convert the ids to program handles. + progIds = progIds[:size] + programs := make([]*ebpf.Program, 0, len(progIds)) + for _, progId := range progIds { + program, err := ebpf.NewProgramFromID(ebpf.ProgramID(progId)) + if err != nil { + // We skip over programs that give us -EACCES or -EPERM. This + // is necessary because there may be BPF programs that have + // been attached (such as with --systemd-cgroup) which have an + // LSM label that blocks us from interacting with the program. + // + // Because additional BPF_CGROUP_DEVICE programs only can add + // restrictions, there's no real issue with just ignoring these + // programs (and stops runc from breaking on distributions with + // very strict SELinux policies). + if errors.Is(err, os.ErrPermission) { + logrus.Debugf("ignoring existing CGROUP_DEVICE program (prog_id=%v) which cannot be accessed by runc -- likely due to LSM policy: %v", progId, err) + continue + } + return nil, fmt.Errorf("cannot fetch program from id: %w", err) + } + programs = append(programs, program) + } + runtime.KeepAlive(progIds) + return programs, nil + } + + return nil, errors.New("could not get complete list of CGROUP_DEVICE programs") +} + +var ( + haveBpfProgReplaceBool bool + haveBpfProgReplaceOnce sync.Once +) + +// Loosely based on the BPF_F_REPLACE support check in +// https://github.com/cilium/ebpf/blob/v0.6.0/link/syscalls.go. +// +// TODO: move this logic to cilium/ebpf +func haveBpfProgReplace() bool { + haveBpfProgReplaceOnce.Do(func() { + prog, err := ebpf.NewProgram(&ebpf.ProgramSpec{ + Type: ebpf.CGroupDevice, + License: "MIT", + Instructions: asm.Instructions{ + asm.Mov.Imm(asm.R0, 0), + asm.Return(), + }, + }) + if err != nil { + logrus.Debugf("checking for BPF_F_REPLACE support: ebpf.NewProgram failed: %v", err) + return + } + defer prog.Close() + + devnull, err := os.Open("/dev/null") + if err != nil { + logrus.Debugf("checking for BPF_F_REPLACE support: open dummy target fd: %v", err) + return + } + defer devnull.Close() + + // We know that we have BPF_PROG_ATTACH since we can load + // BPF_CGROUP_DEVICE programs. If passing BPF_F_REPLACE gives us EINVAL + // we know that the feature isn't present. + err = link.RawAttachProgram(link.RawAttachProgramOptions{ + // We rely on this fd being checked after attachFlags. + Target: int(devnull.Fd()), + // Attempt to "replace" bad fds with this program. + Program: prog, + Attach: ebpf.AttachCGroupDevice, + Flags: unix.BPF_F_ALLOW_MULTI | unix.BPF_F_REPLACE, + }) + if errors.Is(err, unix.EINVAL) { + // not supported + return + } + // attach_flags test succeeded. + if !errors.Is(err, unix.EBADF) { + logrus.Debugf("checking for BPF_F_REPLACE: got unexpected (not EBADF or EINVAL) error: %v", err) + } + haveBpfProgReplaceBool = true + }) + return haveBpfProgReplaceBool +} + +// LoadAttachCgroupDeviceFilter installs eBPF device filter program to /sys/fs/cgroup/ directory. +// +// Requires the system to be running in cgroup2 unified-mode with kernel >= 4.15 . +// +// https://github.com/torvalds/linux/commit/ebc614f687369f9df99828572b1d85a7c2de3d92 +func LoadAttachCgroupDeviceFilter(insts asm.Instructions, license string, dirFd int) (func() error, error) { + // Increase `ulimit -l` limit to avoid BPF_PROG_LOAD error (#2167). + // This limit is not inherited into the container. + memlockLimit := &unix.Rlimit{ + Cur: unix.RLIM_INFINITY, + Max: unix.RLIM_INFINITY, + } + _ = unix.Setrlimit(unix.RLIMIT_MEMLOCK, memlockLimit) + + // Get the list of existing programs. + oldProgs, err := findAttachedCgroupDeviceFilters(dirFd) + if err != nil { + return nilCloser, err + } + useReplaceProg := haveBpfProgReplace() && len(oldProgs) == 1 + + // Generate new program. + spec := &ebpf.ProgramSpec{ + Type: ebpf.CGroupDevice, + Instructions: insts, + License: license, + } + prog, err := ebpf.NewProgram(spec) + if err != nil { + return nilCloser, err + } + + // If there is only one old program, we can just replace it directly. + var ( + replaceProg *ebpf.Program + attachFlags uint32 = unix.BPF_F_ALLOW_MULTI + ) + if useReplaceProg { + replaceProg = oldProgs[0] + attachFlags |= unix.BPF_F_REPLACE + } + err = link.RawAttachProgram(link.RawAttachProgramOptions{ + Target: dirFd, + Program: prog, + Replace: replaceProg, + Attach: ebpf.AttachCGroupDevice, + Flags: attachFlags, + }) + if err != nil { + return nilCloser, fmt.Errorf("failed to call BPF_PROG_ATTACH (BPF_CGROUP_DEVICE, BPF_F_ALLOW_MULTI): %w", err) + } + closer := func() error { + err = link.RawDetachProgram(link.RawDetachProgramOptions{ + Target: dirFd, + Program: prog, + Attach: ebpf.AttachCGroupDevice, + }) + if err != nil { + return fmt.Errorf("failed to call BPF_PROG_DETACH (BPF_CGROUP_DEVICE): %w", err) + } + // TODO: Should we attach the old filters back in this case? Otherwise + // we fail-open on a security feature, which is a bit scary. + return nil + } + if !useReplaceProg { + logLevel := logrus.DebugLevel + // If there was more than one old program, give a warning (since this + // really shouldn't happen with runc-managed cgroups) and then detach + // all the old programs. + if len(oldProgs) > 1 { + // NOTE: Ideally this should be a warning but it turns out that + // systemd-managed cgroups trigger this warning (apparently + // systemd doesn't delete old non-systemd programs when + // setting properties). + logrus.Infof("found more than one filter (%d) attached to a cgroup -- removing extra filters!", len(oldProgs)) + logLevel = logrus.InfoLevel + } + for idx, oldProg := range oldProgs { + // Output some extra debug info. + if info, err := oldProg.Info(); err == nil { + fields := logrus.Fields{ + "type": info.Type.String(), + "tag": info.Tag, + "name": info.Name, + } + if id, ok := info.ID(); ok { + fields["id"] = id + } + if runCount, ok := info.RunCount(); ok { + fields["run_count"] = runCount + } + if runtime, ok := info.Runtime(); ok { + fields["runtime"] = runtime.String() + } + logrus.WithFields(fields).Logf(logLevel, "removing old filter %d from cgroup", idx) + } + err = link.RawDetachProgram(link.RawDetachProgramOptions{ + Target: dirFd, + Program: oldProg, + Attach: ebpf.AttachCGroupDevice, + }) + if err != nil { + return closer, fmt.Errorf("failed to call BPF_PROG_DETACH (BPF_CGROUP_DEVICE) on old filter program: %w", err) + } + } + } + return closer, nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/file.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/file.go new file mode 100644 index 000000000..0cdaf7478 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/file.go @@ -0,0 +1,190 @@ +package cgroups + +import ( + "bytes" + "errors" + "fmt" + "os" + "path" + "strconv" + "strings" + "sync" + + "github.com/sirupsen/logrus" + "golang.org/x/sys/unix" +) + +// OpenFile opens a cgroup file in a given dir with given flags. +// It is supposed to be used for cgroup files only, and returns +// an error if the file is not a cgroup file. +// +// Arguments dir and file are joined together to form an absolute path +// to a file being opened. +func OpenFile(dir, file string, flags int) (*os.File, error) { + if dir == "" { + return nil, fmt.Errorf("no directory specified for %s", file) + } + return openFile(dir, file, flags) +} + +// ReadFile reads data from a cgroup file in dir. +// It is supposed to be used for cgroup files only. +func ReadFile(dir, file string) (string, error) { + fd, err := OpenFile(dir, file, unix.O_RDONLY) + if err != nil { + return "", err + } + defer fd.Close() + var buf bytes.Buffer + + _, err = buf.ReadFrom(fd) + return buf.String(), err +} + +// WriteFile writes data to a cgroup file in dir. +// It is supposed to be used for cgroup files only. +func WriteFile(dir, file, data string) error { + fd, err := OpenFile(dir, file, unix.O_WRONLY) + if err != nil { + return err + } + defer fd.Close() + if err := retryingWriteFile(fd, data); err != nil { + // Having data in the error message helps in debugging. + return fmt.Errorf("failed to write %q: %w", data, err) + } + return nil +} + +func retryingWriteFile(fd *os.File, data string) error { + for { + _, err := fd.Write([]byte(data)) + if errors.Is(err, unix.EINTR) { + logrus.Infof("interrupted while writing %s to %s", data, fd.Name()) + continue + } + return err + } +} + +const ( + cgroupfsDir = "/sys/fs/cgroup" + cgroupfsPrefix = cgroupfsDir + "/" +) + +var ( + // TestMode is set to true by unit tests that need "fake" cgroupfs. + TestMode bool + + cgroupFd int = -1 + prepOnce sync.Once + prepErr error + resolveFlags uint64 +) + +func prepareOpenat2() error { + prepOnce.Do(func() { + fd, err := unix.Openat2(-1, cgroupfsDir, &unix.OpenHow{ + Flags: unix.O_DIRECTORY | unix.O_PATH, + }) + if err != nil { + prepErr = &os.PathError{Op: "openat2", Path: cgroupfsDir, Err: err} + if err != unix.ENOSYS { //nolint:errorlint // unix errors are bare + logrus.Warnf("falling back to securejoin: %s", prepErr) + } else { + logrus.Debug("openat2 not available, falling back to securejoin") + } + return + } + var st unix.Statfs_t + if err = unix.Fstatfs(fd, &st); err != nil { + prepErr = &os.PathError{Op: "statfs", Path: cgroupfsDir, Err: err} + logrus.Warnf("falling back to securejoin: %s", prepErr) + return + } + + cgroupFd = fd + + resolveFlags = unix.RESOLVE_BENEATH | unix.RESOLVE_NO_MAGICLINKS + if st.Type == unix.CGROUP2_SUPER_MAGIC { + // cgroupv2 has a single mountpoint and no "cpu,cpuacct" symlinks + resolveFlags |= unix.RESOLVE_NO_XDEV | unix.RESOLVE_NO_SYMLINKS + } + }) + + return prepErr +} + +func openFile(dir, file string, flags int) (*os.File, error) { + mode := os.FileMode(0) + if TestMode && flags&os.O_WRONLY != 0 { + // "emulate" cgroup fs for unit tests + flags |= os.O_TRUNC | os.O_CREATE + mode = 0o600 + } + path := path.Join(dir, file) + if prepareOpenat2() != nil { + return openFallback(path, flags, mode) + } + relPath := strings.TrimPrefix(path, cgroupfsPrefix) + if len(relPath) == len(path) { // non-standard path, old system? + return openFallback(path, flags, mode) + } + + fd, err := unix.Openat2(cgroupFd, relPath, + &unix.OpenHow{ + Resolve: resolveFlags, + Flags: uint64(flags) | unix.O_CLOEXEC, + Mode: uint64(mode), + }) + if err != nil { + err = &os.PathError{Op: "openat2", Path: path, Err: err} + // Check if cgroupFd is still opened to cgroupfsDir + // (happens when this package is incorrectly used + // across the chroot/pivot_root/mntns boundary, or + // when /sys/fs/cgroup is remounted). + // + // TODO: if such usage will ever be common, amend this + // to reopen cgroupFd and retry openat2. + fdStr := strconv.Itoa(cgroupFd) + fdDest, _ := os.Readlink("/proc/self/fd/" + fdStr) + if fdDest != cgroupfsDir { + // Wrap the error so it is clear that cgroupFd + // is opened to an unexpected/wrong directory. + err = fmt.Errorf("cgroupFd %s unexpectedly opened to %s != %s: %w", + fdStr, fdDest, cgroupfsDir, err) + } + return nil, err + } + + return os.NewFile(uintptr(fd), path), nil +} + +var errNotCgroupfs = errors.New("not a cgroup file") + +// Can be changed by unit tests. +var openFallback = openAndCheck + +// openAndCheck is used when openat2(2) is not available. It checks the opened +// file is on cgroupfs, returning an error otherwise. +func openAndCheck(path string, flags int, mode os.FileMode) (*os.File, error) { + fd, err := os.OpenFile(path, flags, mode) + if err != nil { + return nil, err + } + if TestMode { + return fd, nil + } + // Check this is a cgroupfs file. + var st unix.Statfs_t + if err := unix.Fstatfs(int(fd.Fd()), &st); err != nil { + _ = fd.Close() + return nil, &os.PathError{Op: "statfs", Path: path, Err: err} + } + if st.Type != unix.CGROUP_SUPER_MAGIC && st.Type != unix.CGROUP2_SUPER_MAGIC { + _ = fd.Close() + return nil, &os.PathError{Op: "open", Path: path, Err: errNotCgroupfs} + } + + return fd, nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/blkio.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/blkio.go new file mode 100644 index 000000000..c81b6562a --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/blkio.go @@ -0,0 +1,311 @@ +package fs + +import ( + "bufio" + "os" + "path/filepath" + "strconv" + "strings" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/configs" +) + +type BlkioGroup struct { + weightFilename string + weightDeviceFilename string +} + +func (s *BlkioGroup) Name() string { + return "blkio" +} + +func (s *BlkioGroup) Apply(path string, _ *configs.Resources, pid int) error { + return apply(path, pid) +} + +func (s *BlkioGroup) Set(path string, r *configs.Resources) error { + s.detectWeightFilenames(path) + if r.BlkioWeight != 0 { + if err := cgroups.WriteFile(path, s.weightFilename, strconv.FormatUint(uint64(r.BlkioWeight), 10)); err != nil { + return err + } + } + + if r.BlkioLeafWeight != 0 { + if err := cgroups.WriteFile(path, "blkio.leaf_weight", strconv.FormatUint(uint64(r.BlkioLeafWeight), 10)); err != nil { + return err + } + } + for _, wd := range r.BlkioWeightDevice { + if wd.Weight != 0 { + if err := cgroups.WriteFile(path, s.weightDeviceFilename, wd.WeightString()); err != nil { + return err + } + } + if wd.LeafWeight != 0 { + if err := cgroups.WriteFile(path, "blkio.leaf_weight_device", wd.LeafWeightString()); err != nil { + return err + } + } + } + for _, td := range r.BlkioThrottleReadBpsDevice { + if err := cgroups.WriteFile(path, "blkio.throttle.read_bps_device", td.String()); err != nil { + return err + } + } + for _, td := range r.BlkioThrottleWriteBpsDevice { + if err := cgroups.WriteFile(path, "blkio.throttle.write_bps_device", td.String()); err != nil { + return err + } + } + for _, td := range r.BlkioThrottleReadIOPSDevice { + if err := cgroups.WriteFile(path, "blkio.throttle.read_iops_device", td.String()); err != nil { + return err + } + } + for _, td := range r.BlkioThrottleWriteIOPSDevice { + if err := cgroups.WriteFile(path, "blkio.throttle.write_iops_device", td.String()); err != nil { + return err + } + } + + return nil +} + +/* +examples: + + blkio.sectors + 8:0 6792 + + blkio.io_service_bytes + 8:0 Read 1282048 + 8:0 Write 2195456 + 8:0 Sync 2195456 + 8:0 Async 1282048 + 8:0 Total 3477504 + Total 3477504 + + blkio.io_serviced + 8:0 Read 124 + 8:0 Write 104 + 8:0 Sync 104 + 8:0 Async 124 + 8:0 Total 228 + Total 228 + + blkio.io_queued + 8:0 Read 0 + 8:0 Write 0 + 8:0 Sync 0 + 8:0 Async 0 + 8:0 Total 0 + Total 0 +*/ + +func splitBlkioStatLine(r rune) bool { + return r == ' ' || r == ':' +} + +func getBlkioStat(dir, file string) ([]cgroups.BlkioStatEntry, error) { + var blkioStats []cgroups.BlkioStatEntry + f, err := cgroups.OpenFile(dir, file, os.O_RDONLY) + if err != nil { + if os.IsNotExist(err) { + return blkioStats, nil + } + return nil, err + } + defer f.Close() + + sc := bufio.NewScanner(f) + for sc.Scan() { + // format: dev type amount + fields := strings.FieldsFunc(sc.Text(), splitBlkioStatLine) + if len(fields) < 3 { + if len(fields) == 2 && fields[0] == "Total" { + // skip total line + continue + } else { + return nil, malformedLine(dir, file, sc.Text()) + } + } + + v, err := strconv.ParseUint(fields[0], 10, 64) + if err != nil { + return nil, &parseError{Path: dir, File: file, Err: err} + } + major := v + + v, err = strconv.ParseUint(fields[1], 10, 64) + if err != nil { + return nil, &parseError{Path: dir, File: file, Err: err} + } + minor := v + + op := "" + valueField := 2 + if len(fields) == 4 { + op = fields[2] + valueField = 3 + } + v, err = strconv.ParseUint(fields[valueField], 10, 64) + if err != nil { + return nil, &parseError{Path: dir, File: file, Err: err} + } + blkioStats = append(blkioStats, cgroups.BlkioStatEntry{Major: major, Minor: minor, Op: op, Value: v}) + } + if err := sc.Err(); err != nil { + return nil, &parseError{Path: dir, File: file, Err: err} + } + + return blkioStats, nil +} + +func (s *BlkioGroup) GetStats(path string, stats *cgroups.Stats) error { + type blkioStatInfo struct { + filename string + blkioStatEntriesPtr *[]cgroups.BlkioStatEntry + } + bfqDebugStats := []blkioStatInfo{ + { + filename: "blkio.bfq.sectors_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.SectorsRecursive, + }, + { + filename: "blkio.bfq.io_service_time_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoServiceTimeRecursive, + }, + { + filename: "blkio.bfq.io_wait_time_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoWaitTimeRecursive, + }, + { + filename: "blkio.bfq.io_merged_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoMergedRecursive, + }, + { + filename: "blkio.bfq.io_queued_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoQueuedRecursive, + }, + { + filename: "blkio.bfq.time_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoTimeRecursive, + }, + { + filename: "blkio.bfq.io_serviced_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoServicedRecursive, + }, + { + filename: "blkio.bfq.io_service_bytes_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoServiceBytesRecursive, + }, + } + bfqStats := []blkioStatInfo{ + { + filename: "blkio.bfq.io_serviced_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoServicedRecursive, + }, + { + filename: "blkio.bfq.io_service_bytes_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoServiceBytesRecursive, + }, + } + cfqStats := []blkioStatInfo{ + { + filename: "blkio.sectors_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.SectorsRecursive, + }, + { + filename: "blkio.io_service_time_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoServiceTimeRecursive, + }, + { + filename: "blkio.io_wait_time_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoWaitTimeRecursive, + }, + { + filename: "blkio.io_merged_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoMergedRecursive, + }, + { + filename: "blkio.io_queued_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoQueuedRecursive, + }, + { + filename: "blkio.time_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoTimeRecursive, + }, + { + filename: "blkio.io_serviced_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoServicedRecursive, + }, + { + filename: "blkio.io_service_bytes_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoServiceBytesRecursive, + }, + } + throttleRecursiveStats := []blkioStatInfo{ + { + filename: "blkio.throttle.io_serviced_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoServicedRecursive, + }, + { + filename: "blkio.throttle.io_service_bytes_recursive", + blkioStatEntriesPtr: &stats.BlkioStats.IoServiceBytesRecursive, + }, + } + baseStats := []blkioStatInfo{ + { + filename: "blkio.throttle.io_serviced", + blkioStatEntriesPtr: &stats.BlkioStats.IoServicedRecursive, + }, + { + filename: "blkio.throttle.io_service_bytes", + blkioStatEntriesPtr: &stats.BlkioStats.IoServiceBytesRecursive, + }, + } + orderedStats := [][]blkioStatInfo{ + bfqDebugStats, + bfqStats, + cfqStats, + throttleRecursiveStats, + baseStats, + } + + var blkioStats []cgroups.BlkioStatEntry + var err error + + for _, statGroup := range orderedStats { + for i, statInfo := range statGroup { + if blkioStats, err = getBlkioStat(path, statInfo.filename); err != nil || blkioStats == nil { + // if error occurs on first file, move to next group + if i == 0 { + break + } + return err + } + *statInfo.blkioStatEntriesPtr = blkioStats + // finish if all stats are gathered + if i == len(statGroup)-1 { + return nil + } + } + } + return nil +} + +func (s *BlkioGroup) detectWeightFilenames(path string) { + if s.weightFilename != "" { + // Already detected. + return + } + if cgroups.PathExists(filepath.Join(path, "blkio.weight")) { + s.weightFilename = "blkio.weight" + s.weightDeviceFilename = "blkio.weight_device" + } else { + s.weightFilename = "blkio.bfq.weight" + s.weightDeviceFilename = "blkio.bfq.weight_device" + } +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpu.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpu.go new file mode 100644 index 000000000..6c79f899b --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpu.go @@ -0,0 +1,129 @@ +package fs + +import ( + "bufio" + "errors" + "fmt" + "os" + "strconv" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" + "github.com/opencontainers/runc/libcontainer/configs" + "golang.org/x/sys/unix" +) + +type CpuGroup struct{} + +func (s *CpuGroup) Name() string { + return "cpu" +} + +func (s *CpuGroup) Apply(path string, r *configs.Resources, pid int) error { + if err := os.MkdirAll(path, 0o755); err != nil { + return err + } + // We should set the real-Time group scheduling settings before moving + // in the process because if the process is already in SCHED_RR mode + // and no RT bandwidth is set, adding it will fail. + if err := s.SetRtSched(path, r); err != nil { + return err + } + // Since we are not using apply(), we need to place the pid + // into the procs file. + return cgroups.WriteCgroupProc(path, pid) +} + +func (s *CpuGroup) SetRtSched(path string, r *configs.Resources) error { + if r.CpuRtPeriod != 0 { + if err := cgroups.WriteFile(path, "cpu.rt_period_us", strconv.FormatUint(r.CpuRtPeriod, 10)); err != nil { + return err + } + } + if r.CpuRtRuntime != 0 { + if err := cgroups.WriteFile(path, "cpu.rt_runtime_us", strconv.FormatInt(r.CpuRtRuntime, 10)); err != nil { + return err + } + } + return nil +} + +func (s *CpuGroup) Set(path string, r *configs.Resources) error { + if r.CpuShares != 0 { + shares := r.CpuShares + if err := cgroups.WriteFile(path, "cpu.shares", strconv.FormatUint(shares, 10)); err != nil { + return err + } + // read it back + sharesRead, err := fscommon.GetCgroupParamUint(path, "cpu.shares") + if err != nil { + return err + } + // ... and check + if shares > sharesRead { + return fmt.Errorf("the maximum allowed cpu-shares is %d", sharesRead) + } else if shares < sharesRead { + return fmt.Errorf("the minimum allowed cpu-shares is %d", sharesRead) + } + } + + var period string + if r.CpuPeriod != 0 { + period = strconv.FormatUint(r.CpuPeriod, 10) + if err := cgroups.WriteFile(path, "cpu.cfs_period_us", period); err != nil { + // Sometimes when the period to be set is smaller + // than the current one, it is rejected by the kernel + // (EINVAL) as old_quota/new_period exceeds the parent + // cgroup quota limit. If this happens and the quota is + // going to be set, ignore the error for now and retry + // after setting the quota. + if !errors.Is(err, unix.EINVAL) || r.CpuQuota == 0 { + return err + } + } else { + period = "" + } + } + if r.CpuQuota != 0 { + if err := cgroups.WriteFile(path, "cpu.cfs_quota_us", strconv.FormatInt(r.CpuQuota, 10)); err != nil { + return err + } + if period != "" { + if err := cgroups.WriteFile(path, "cpu.cfs_period_us", period); err != nil { + return err + } + } + } + return s.SetRtSched(path, r) +} + +func (s *CpuGroup) GetStats(path string, stats *cgroups.Stats) error { + const file = "cpu.stat" + f, err := cgroups.OpenFile(path, file, os.O_RDONLY) + if err != nil { + if os.IsNotExist(err) { + return nil + } + return err + } + defer f.Close() + + sc := bufio.NewScanner(f) + for sc.Scan() { + t, v, err := fscommon.ParseKeyValue(sc.Text()) + if err != nil { + return &parseError{Path: path, File: file, Err: err} + } + switch t { + case "nr_periods": + stats.CpuStats.ThrottlingData.Periods = v + + case "nr_throttled": + stats.CpuStats.ThrottlingData.ThrottledPeriods = v + + case "throttled_time": + stats.CpuStats.ThrottlingData.ThrottledTime = v + } + } + return nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpuacct.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpuacct.go new file mode 100644 index 000000000..d3bd7e111 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpuacct.go @@ -0,0 +1,166 @@ +package fs + +import ( + "bufio" + "os" + "strconv" + "strings" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" + "github.com/opencontainers/runc/libcontainer/configs" +) + +const ( + cgroupCpuacctStat = "cpuacct.stat" + cgroupCpuacctUsageAll = "cpuacct.usage_all" + + nanosecondsInSecond = 1000000000 + + userModeColumn = 1 + kernelModeColumn = 2 + cuacctUsageAllColumnsNumber = 3 + + // The value comes from `C.sysconf(C._SC_CLK_TCK)`, and + // on Linux it's a constant which is safe to be hard coded, + // so we can avoid using cgo here. For details, see: + // https://github.com/containerd/cgroups/pull/12 + clockTicks uint64 = 100 +) + +type CpuacctGroup struct{} + +func (s *CpuacctGroup) Name() string { + return "cpuacct" +} + +func (s *CpuacctGroup) Apply(path string, _ *configs.Resources, pid int) error { + return apply(path, pid) +} + +func (s *CpuacctGroup) Set(_ string, _ *configs.Resources) error { + return nil +} + +func (s *CpuacctGroup) GetStats(path string, stats *cgroups.Stats) error { + if !cgroups.PathExists(path) { + return nil + } + userModeUsage, kernelModeUsage, err := getCpuUsageBreakdown(path) + if err != nil { + return err + } + + totalUsage, err := fscommon.GetCgroupParamUint(path, "cpuacct.usage") + if err != nil { + return err + } + + percpuUsage, err := getPercpuUsage(path) + if err != nil { + return err + } + + percpuUsageInKernelmode, percpuUsageInUsermode, err := getPercpuUsageInModes(path) + if err != nil { + return err + } + + stats.CpuStats.CpuUsage.TotalUsage = totalUsage + stats.CpuStats.CpuUsage.PercpuUsage = percpuUsage + stats.CpuStats.CpuUsage.PercpuUsageInKernelmode = percpuUsageInKernelmode + stats.CpuStats.CpuUsage.PercpuUsageInUsermode = percpuUsageInUsermode + stats.CpuStats.CpuUsage.UsageInUsermode = userModeUsage + stats.CpuStats.CpuUsage.UsageInKernelmode = kernelModeUsage + return nil +} + +// Returns user and kernel usage breakdown in nanoseconds. +func getCpuUsageBreakdown(path string) (uint64, uint64, error) { + var userModeUsage, kernelModeUsage uint64 + const ( + userField = "user" + systemField = "system" + file = cgroupCpuacctStat + ) + + // Expected format: + // user + // system + data, err := cgroups.ReadFile(path, file) + if err != nil { + return 0, 0, err + } + // TODO: use strings.SplitN instead. + fields := strings.Fields(data) + if len(fields) < 4 || fields[0] != userField || fields[2] != systemField { + return 0, 0, malformedLine(path, file, data) + } + if userModeUsage, err = strconv.ParseUint(fields[1], 10, 64); err != nil { + return 0, 0, &parseError{Path: path, File: file, Err: err} + } + if kernelModeUsage, err = strconv.ParseUint(fields[3], 10, 64); err != nil { + return 0, 0, &parseError{Path: path, File: file, Err: err} + } + + return (userModeUsage * nanosecondsInSecond) / clockTicks, (kernelModeUsage * nanosecondsInSecond) / clockTicks, nil +} + +func getPercpuUsage(path string) ([]uint64, error) { + const file = "cpuacct.usage_percpu" + percpuUsage := []uint64{} + data, err := cgroups.ReadFile(path, file) + if err != nil { + return percpuUsage, err + } + // TODO: use strings.SplitN instead. + for _, value := range strings.Fields(data) { + value, err := strconv.ParseUint(value, 10, 64) + if err != nil { + return percpuUsage, &parseError{Path: path, File: file, Err: err} + } + percpuUsage = append(percpuUsage, value) + } + return percpuUsage, nil +} + +func getPercpuUsageInModes(path string) ([]uint64, []uint64, error) { + usageKernelMode := []uint64{} + usageUserMode := []uint64{} + const file = cgroupCpuacctUsageAll + + fd, err := cgroups.OpenFile(path, file, os.O_RDONLY) + if os.IsNotExist(err) { + return usageKernelMode, usageUserMode, nil + } else if err != nil { + return nil, nil, err + } + defer fd.Close() + + scanner := bufio.NewScanner(fd) + scanner.Scan() // skipping header line + + for scanner.Scan() { + lineFields := strings.SplitN(scanner.Text(), " ", cuacctUsageAllColumnsNumber+1) + if len(lineFields) != cuacctUsageAllColumnsNumber { + continue + } + + usageInKernelMode, err := strconv.ParseUint(lineFields[kernelModeColumn], 10, 64) + if err != nil { + return nil, nil, &parseError{Path: path, File: file, Err: err} + } + usageKernelMode = append(usageKernelMode, usageInKernelMode) + + usageInUserMode, err := strconv.ParseUint(lineFields[userModeColumn], 10, 64) + if err != nil { + return nil, nil, &parseError{Path: path, File: file, Err: err} + } + usageUserMode = append(usageUserMode, usageInUserMode) + } + if err := scanner.Err(); err != nil { + return nil, nil, &parseError{Path: path, File: file, Err: err} + } + + return usageKernelMode, usageUserMode, nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpuset.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpuset.go new file mode 100644 index 000000000..550baa427 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpuset.go @@ -0,0 +1,245 @@ +package fs + +import ( + "errors" + "os" + "path/filepath" + "strconv" + "strings" + + "golang.org/x/sys/unix" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" + "github.com/opencontainers/runc/libcontainer/configs" +) + +type CpusetGroup struct{} + +func (s *CpusetGroup) Name() string { + return "cpuset" +} + +func (s *CpusetGroup) Apply(path string, r *configs.Resources, pid int) error { + return s.ApplyDir(path, r, pid) +} + +func (s *CpusetGroup) Set(path string, r *configs.Resources) error { + if r.CpusetCpus != "" { + if err := cgroups.WriteFile(path, "cpuset.cpus", r.CpusetCpus); err != nil { + return err + } + } + if r.CpusetMems != "" { + if err := cgroups.WriteFile(path, "cpuset.mems", r.CpusetMems); err != nil { + return err + } + } + return nil +} + +func getCpusetStat(path string, file string) ([]uint16, error) { + var extracted []uint16 + fileContent, err := fscommon.GetCgroupParamString(path, file) + if err != nil { + return extracted, err + } + if len(fileContent) == 0 { + return extracted, &parseError{Path: path, File: file, Err: errors.New("empty file")} + } + + for _, s := range strings.Split(fileContent, ",") { + sp := strings.SplitN(s, "-", 3) + switch len(sp) { + case 3: + return extracted, &parseError{Path: path, File: file, Err: errors.New("extra dash")} + case 2: + min, err := strconv.ParseUint(sp[0], 10, 16) + if err != nil { + return extracted, &parseError{Path: path, File: file, Err: err} + } + max, err := strconv.ParseUint(sp[1], 10, 16) + if err != nil { + return extracted, &parseError{Path: path, File: file, Err: err} + } + if min > max { + return extracted, &parseError{Path: path, File: file, Err: errors.New("invalid values, min > max")} + } + for i := min; i <= max; i++ { + extracted = append(extracted, uint16(i)) + } + case 1: + value, err := strconv.ParseUint(s, 10, 16) + if err != nil { + return extracted, &parseError{Path: path, File: file, Err: err} + } + extracted = append(extracted, uint16(value)) + } + } + + return extracted, nil +} + +func (s *CpusetGroup) GetStats(path string, stats *cgroups.Stats) error { + var err error + + stats.CPUSetStats.CPUs, err = getCpusetStat(path, "cpuset.cpus") + if err != nil && !errors.Is(err, os.ErrNotExist) { + return err + } + + stats.CPUSetStats.CPUExclusive, err = fscommon.GetCgroupParamUint(path, "cpuset.cpu_exclusive") + if err != nil && !errors.Is(err, os.ErrNotExist) { + return err + } + + stats.CPUSetStats.Mems, err = getCpusetStat(path, "cpuset.mems") + if err != nil && !errors.Is(err, os.ErrNotExist) { + return err + } + + stats.CPUSetStats.MemHardwall, err = fscommon.GetCgroupParamUint(path, "cpuset.mem_hardwall") + if err != nil && !errors.Is(err, os.ErrNotExist) { + return err + } + + stats.CPUSetStats.MemExclusive, err = fscommon.GetCgroupParamUint(path, "cpuset.mem_exclusive") + if err != nil && !errors.Is(err, os.ErrNotExist) { + return err + } + + stats.CPUSetStats.MemoryMigrate, err = fscommon.GetCgroupParamUint(path, "cpuset.memory_migrate") + if err != nil && !errors.Is(err, os.ErrNotExist) { + return err + } + + stats.CPUSetStats.MemorySpreadPage, err = fscommon.GetCgroupParamUint(path, "cpuset.memory_spread_page") + if err != nil && !errors.Is(err, os.ErrNotExist) { + return err + } + + stats.CPUSetStats.MemorySpreadSlab, err = fscommon.GetCgroupParamUint(path, "cpuset.memory_spread_slab") + if err != nil && !errors.Is(err, os.ErrNotExist) { + return err + } + + stats.CPUSetStats.MemoryPressure, err = fscommon.GetCgroupParamUint(path, "cpuset.memory_pressure") + if err != nil && !errors.Is(err, os.ErrNotExist) { + return err + } + + stats.CPUSetStats.SchedLoadBalance, err = fscommon.GetCgroupParamUint(path, "cpuset.sched_load_balance") + if err != nil && !errors.Is(err, os.ErrNotExist) { + return err + } + + stats.CPUSetStats.SchedRelaxDomainLevel, err = fscommon.GetCgroupParamInt(path, "cpuset.sched_relax_domain_level") + if err != nil && !errors.Is(err, os.ErrNotExist) { + return err + } + + return nil +} + +func (s *CpusetGroup) ApplyDir(dir string, r *configs.Resources, pid int) error { + // This might happen if we have no cpuset cgroup mounted. + // Just do nothing and don't fail. + if dir == "" { + return nil + } + // 'ensureParent' start with parent because we don't want to + // explicitly inherit from parent, it could conflict with + // 'cpuset.cpu_exclusive'. + if err := cpusetEnsureParent(filepath.Dir(dir)); err != nil { + return err + } + if err := os.Mkdir(dir, 0o755); err != nil && !os.IsExist(err) { + return err + } + // We didn't inherit cpuset configs from parent, but we have + // to ensure cpuset configs are set before moving task into the + // cgroup. + // The logic is, if user specified cpuset configs, use these + // specified configs, otherwise, inherit from parent. This makes + // cpuset configs work correctly with 'cpuset.cpu_exclusive', and + // keep backward compatibility. + if err := s.ensureCpusAndMems(dir, r); err != nil { + return err + } + // Since we are not using apply(), we need to place the pid + // into the procs file. + return cgroups.WriteCgroupProc(dir, pid) +} + +func getCpusetSubsystemSettings(parent string) (cpus, mems string, err error) { + if cpus, err = cgroups.ReadFile(parent, "cpuset.cpus"); err != nil { + return + } + if mems, err = cgroups.ReadFile(parent, "cpuset.mems"); err != nil { + return + } + return cpus, mems, nil +} + +// cpusetEnsureParent makes sure that the parent directories of current +// are created and populated with the proper cpus and mems files copied +// from their respective parent. It does that recursively, starting from +// the top of the cpuset hierarchy (i.e. cpuset cgroup mount point). +func cpusetEnsureParent(current string) error { + var st unix.Statfs_t + + parent := filepath.Dir(current) + err := unix.Statfs(parent, &st) + if err == nil && st.Type != unix.CGROUP_SUPER_MAGIC { + return nil + } + // Treat non-existing directory as cgroupfs as it will be created, + // and the root cpuset directory obviously exists. + if err != nil && err != unix.ENOENT { //nolint:errorlint // unix errors are bare + return &os.PathError{Op: "statfs", Path: parent, Err: err} + } + + if err := cpusetEnsureParent(parent); err != nil { + return err + } + if err := os.Mkdir(current, 0o755); err != nil && !os.IsExist(err) { + return err + } + return cpusetCopyIfNeeded(current, parent) +} + +// cpusetCopyIfNeeded copies the cpuset.cpus and cpuset.mems from the parent +// directory to the current directory if the file's contents are 0 +func cpusetCopyIfNeeded(current, parent string) error { + currentCpus, currentMems, err := getCpusetSubsystemSettings(current) + if err != nil { + return err + } + parentCpus, parentMems, err := getCpusetSubsystemSettings(parent) + if err != nil { + return err + } + + if isEmptyCpuset(currentCpus) { + if err := cgroups.WriteFile(current, "cpuset.cpus", parentCpus); err != nil { + return err + } + } + if isEmptyCpuset(currentMems) { + if err := cgroups.WriteFile(current, "cpuset.mems", parentMems); err != nil { + return err + } + } + return nil +} + +func isEmptyCpuset(str string) bool { + return str == "" || str == "\n" +} + +func (s *CpusetGroup) ensureCpusAndMems(path string, r *configs.Resources) error { + if err := s.Set(path, r); err != nil { + return err + } + return cpusetCopyIfNeeded(path, filepath.Dir(path)) +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/devices.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/devices.go new file mode 100644 index 000000000..4527a70eb --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/devices.go @@ -0,0 +1,109 @@ +package fs + +import ( + "bytes" + "errors" + "reflect" + + "github.com/opencontainers/runc/libcontainer/cgroups" + cgroupdevices "github.com/opencontainers/runc/libcontainer/cgroups/devices" + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/devices" + "github.com/opencontainers/runc/libcontainer/userns" +) + +type DevicesGroup struct { + TestingSkipFinalCheck bool +} + +func (s *DevicesGroup) Name() string { + return "devices" +} + +func (s *DevicesGroup) Apply(path string, r *configs.Resources, pid int) error { + if r.SkipDevices { + return nil + } + if path == "" { + // Return error here, since devices cgroup + // is a hard requirement for container's security. + return errSubsystemDoesNotExist + } + + return apply(path, pid) +} + +func loadEmulator(path string) (*cgroupdevices.Emulator, error) { + list, err := cgroups.ReadFile(path, "devices.list") + if err != nil { + return nil, err + } + return cgroupdevices.EmulatorFromList(bytes.NewBufferString(list)) +} + +func buildEmulator(rules []*devices.Rule) (*cgroupdevices.Emulator, error) { + // This defaults to a white-list -- which is what we want! + emu := &cgroupdevices.Emulator{} + for _, rule := range rules { + if err := emu.Apply(*rule); err != nil { + return nil, err + } + } + return emu, nil +} + +func (s *DevicesGroup) Set(path string, r *configs.Resources) error { + if userns.RunningInUserNS() || r.SkipDevices { + return nil + } + + // Generate two emulators, one for the current state of the cgroup and one + // for the requested state by the user. + current, err := loadEmulator(path) + if err != nil { + return err + } + target, err := buildEmulator(r.Devices) + if err != nil { + return err + } + + // Compute the minimal set of transition rules needed to achieve the + // requested state. + transitionRules, err := current.Transition(target) + if err != nil { + return err + } + for _, rule := range transitionRules { + file := "devices.deny" + if rule.Allow { + file = "devices.allow" + } + if err := cgroups.WriteFile(path, file, rule.CgroupString()); err != nil { + return err + } + } + + // Final safety check -- ensure that the resulting state is what was + // requested. This is only really correct for white-lists, but for + // black-lists we can at least check that the cgroup is in the right mode. + // + // This safety-check is skipped for the unit tests because we cannot + // currently mock devices.list correctly. + if !s.TestingSkipFinalCheck { + currentAfter, err := loadEmulator(path) + if err != nil { + return err + } + if !target.IsBlacklist() && !reflect.DeepEqual(currentAfter, target) { + return errors.New("resulting devices cgroup doesn't precisely match target") + } else if target.IsBlacklist() != currentAfter.IsBlacklist() { + return errors.New("resulting devices cgroup doesn't match target mode") + } + } + return nil +} + +func (s *DevicesGroup) GetStats(path string, stats *cgroups.Stats) error { + return nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/error.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/error.go new file mode 100644 index 000000000..f2ab6f130 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/error.go @@ -0,0 +1,15 @@ +package fs + +import ( + "fmt" + + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" +) + +type parseError = fscommon.ParseError + +// malformedLine is used by all cgroupfs file parsers that expect a line +// in a particular format but get some garbage instead. +func malformedLine(path, file, line string) error { + return &parseError{Path: path, File: file, Err: fmt.Errorf("malformed line: %s", line)} +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/freezer.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/freezer.go new file mode 100644 index 000000000..987f1bf5e --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/freezer.go @@ -0,0 +1,158 @@ +package fs + +import ( + "errors" + "fmt" + "os" + "strings" + "time" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/sirupsen/logrus" + "golang.org/x/sys/unix" +) + +type FreezerGroup struct{} + +func (s *FreezerGroup) Name() string { + return "freezer" +} + +func (s *FreezerGroup) Apply(path string, _ *configs.Resources, pid int) error { + return apply(path, pid) +} + +func (s *FreezerGroup) Set(path string, r *configs.Resources) (Err error) { + switch r.Freezer { + case configs.Frozen: + defer func() { + if Err != nil { + // Freezing failed, and it is bad and dangerous + // to leave the cgroup in FROZEN or FREEZING + // state, so (try to) thaw it back. + _ = cgroups.WriteFile(path, "freezer.state", string(configs.Thawed)) + } + }() + + // As per older kernel docs (freezer-subsystem.txt before + // kernel commit ef9fe980c6fcc1821), if FREEZING is seen, + // userspace should either retry or thaw. While current + // kernel cgroup v1 docs no longer mention a need to retry, + // even a recent kernel (v5.4, Ubuntu 20.04) can't reliably + // freeze a cgroup v1 while new processes keep appearing in it + // (either via fork/clone or by writing new PIDs to + // cgroup.procs). + // + // The numbers below are empirically chosen to have a decent + // chance to succeed in various scenarios ("runc pause/unpause + // with parallel runc exec" and "bare freeze/unfreeze on a very + // slow system"), tested on RHEL7 and Ubuntu 20.04 kernels. + // + // Adding any amount of sleep in between retries did not + // increase the chances of successful freeze in "pause/unpause + // with parallel exec" reproducer. OTOH, adding an occasional + // sleep helped for the case where the system is extremely slow + // (CentOS 7 VM on GHA CI). + // + // Alas, this is still a game of chances, since the real fix + // belong to the kernel (cgroup v2 do not have this bug). + + for i := 0; i < 1000; i++ { + if i%50 == 49 { + // Occasional thaw and sleep improves + // the chances to succeed in freezing + // in case new processes keep appearing + // in the cgroup. + _ = cgroups.WriteFile(path, "freezer.state", string(configs.Thawed)) + time.Sleep(10 * time.Millisecond) + } + + if err := cgroups.WriteFile(path, "freezer.state", string(configs.Frozen)); err != nil { + return err + } + + if i%25 == 24 { + // Occasional short sleep before reading + // the state back also improves the chances to + // succeed in freezing in case of a very slow + // system. + time.Sleep(10 * time.Microsecond) + } + state, err := cgroups.ReadFile(path, "freezer.state") + if err != nil { + return err + } + state = strings.TrimSpace(state) + switch state { + case "FREEZING": + continue + case string(configs.Frozen): + if i > 1 { + logrus.Debugf("frozen after %d retries", i) + } + return nil + default: + // should never happen + return fmt.Errorf("unexpected state %s while freezing", strings.TrimSpace(state)) + } + } + // Despite our best efforts, it got stuck in FREEZING. + return errors.New("unable to freeze") + case configs.Thawed: + return cgroups.WriteFile(path, "freezer.state", string(configs.Thawed)) + case configs.Undefined: + return nil + default: + return fmt.Errorf("Invalid argument '%s' to freezer.state", string(r.Freezer)) + } +} + +func (s *FreezerGroup) GetStats(path string, stats *cgroups.Stats) error { + return nil +} + +func (s *FreezerGroup) GetState(path string) (configs.FreezerState, error) { + for { + state, err := cgroups.ReadFile(path, "freezer.state") + if err != nil { + // If the kernel is too old, then we just treat the freezer as + // being in an "undefined" state. + if os.IsNotExist(err) || errors.Is(err, unix.ENODEV) { + err = nil + } + return configs.Undefined, err + } + switch strings.TrimSpace(state) { + case "THAWED": + return configs.Thawed, nil + case "FROZEN": + // Find out whether the cgroup is frozen directly, + // or indirectly via an ancestor. + self, err := cgroups.ReadFile(path, "freezer.self_freezing") + if err != nil { + // If the kernel is too old, then we just treat + // it as being frozen. + if errors.Is(err, os.ErrNotExist) || errors.Is(err, unix.ENODEV) { + err = nil + } + return configs.Frozen, err + } + switch self { + case "0\n": + return configs.Thawed, nil + case "1\n": + return configs.Frozen, nil + default: + return configs.Undefined, fmt.Errorf(`unknown "freezer.self_freezing" state: %q`, self) + } + case "FREEZING": + // Make sure we get a stable freezer state, so retry if the cgroup + // is still undergoing freezing. This should be a temporary delay. + time.Sleep(1 * time.Millisecond) + continue + default: + return configs.Undefined, fmt.Errorf("unknown freezer.state %q", state) + } + } +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/fs.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/fs.go new file mode 100644 index 000000000..9e2f0ec04 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/fs.go @@ -0,0 +1,265 @@ +package fs + +import ( + "errors" + "fmt" + "os" + "sync" + + "golang.org/x/sys/unix" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" + "github.com/opencontainers/runc/libcontainer/configs" +) + +var subsystems = []subsystem{ + &CpusetGroup{}, + &DevicesGroup{}, + &MemoryGroup{}, + &CpuGroup{}, + &CpuacctGroup{}, + &PidsGroup{}, + &BlkioGroup{}, + &HugetlbGroup{}, + &NetClsGroup{}, + &NetPrioGroup{}, + &PerfEventGroup{}, + &FreezerGroup{}, + &RdmaGroup{}, + &NameGroup{GroupName: "name=systemd", Join: true}, + &NameGroup{GroupName: "misc", Join: true}, +} + +var errSubsystemDoesNotExist = errors.New("cgroup: subsystem does not exist") + +func init() { + // If using cgroups-hybrid mode then add a "" controller indicating + // it should join the cgroups v2. + if cgroups.IsCgroup2HybridMode() { + subsystems = append(subsystems, &NameGroup{GroupName: "", Join: true}) + } +} + +type subsystem interface { + // Name returns the name of the subsystem. + Name() string + // GetStats fills in the stats for the subsystem. + GetStats(path string, stats *cgroups.Stats) error + // Apply creates and joins a cgroup, adding pid into it. Some + // subsystems use resources to pre-configure the cgroup parents + // before creating or joining it. + Apply(path string, r *configs.Resources, pid int) error + // Set sets the cgroup resources. + Set(path string, r *configs.Resources) error +} + +type manager struct { + mu sync.Mutex + cgroups *configs.Cgroup + paths map[string]string +} + +func NewManager(cg *configs.Cgroup, paths map[string]string) (cgroups.Manager, error) { + // Some v1 controllers (cpu, cpuset, and devices) expect + // cgroups.Resources to not be nil in Apply. + if cg.Resources == nil { + return nil, errors.New("cgroup v1 manager needs configs.Resources to be set during manager creation") + } + if cg.Resources.Unified != nil { + return nil, cgroups.ErrV1NoUnified + } + + if paths == nil { + var err error + paths, err = initPaths(cg) + if err != nil { + return nil, err + } + } + + return &manager{ + cgroups: cg, + paths: paths, + }, nil +} + +// isIgnorableError returns whether err is a permission error (in the loose +// sense of the word). This includes EROFS (which for an unprivileged user is +// basically a permission error) and EACCES (for similar reasons) as well as +// the normal EPERM. +func isIgnorableError(rootless bool, err error) bool { + // We do not ignore errors if we are root. + if !rootless { + return false + } + // Is it an ordinary EPERM? + if errors.Is(err, os.ErrPermission) { + return true + } + // Handle some specific syscall errors. + var errno unix.Errno + if errors.As(err, &errno) { + return errno == unix.EROFS || errno == unix.EPERM || errno == unix.EACCES + } + return false +} + +func (m *manager) Apply(pid int) (err error) { + m.mu.Lock() + defer m.mu.Unlock() + + c := m.cgroups + + for _, sys := range subsystems { + name := sys.Name() + p, ok := m.paths[name] + if !ok { + continue + } + + if err := sys.Apply(p, c.Resources, pid); err != nil { + // In the case of rootless (including euid=0 in userns), where an + // explicit cgroup path hasn't been set, we don't bail on error in + // case of permission problems here, but do delete the path from + // the m.paths map, since it is either non-existent and could not + // be created, or the pid could not be added to it. + // + // Cases where limits for the subsystem have been set are handled + // later by Set, which fails with a friendly error (see + // if path == "" in Set). + if isIgnorableError(c.Rootless, err) && c.Path == "" { + delete(m.paths, name) + continue + } + return err + } + + } + return nil +} + +func (m *manager) Destroy() error { + m.mu.Lock() + defer m.mu.Unlock() + return cgroups.RemovePaths(m.paths) +} + +func (m *manager) Path(subsys string) string { + m.mu.Lock() + defer m.mu.Unlock() + return m.paths[subsys] +} + +func (m *manager) GetStats() (*cgroups.Stats, error) { + m.mu.Lock() + defer m.mu.Unlock() + stats := cgroups.NewStats() + for _, sys := range subsystems { + path := m.paths[sys.Name()] + if path == "" { + continue + } + if err := sys.GetStats(path, stats); err != nil { + return nil, err + } + } + return stats, nil +} + +func (m *manager) Set(r *configs.Resources) error { + if r == nil { + return nil + } + + if r.Unified != nil { + return cgroups.ErrV1NoUnified + } + + m.mu.Lock() + defer m.mu.Unlock() + for _, sys := range subsystems { + path := m.paths[sys.Name()] + if err := sys.Set(path, r); err != nil { + // When rootless is true, errors from the device subsystem + // are ignored, as it is really not expected to work. + if m.cgroups.Rootless && sys.Name() == "devices" { + continue + } + // However, errors from other subsystems are not ignored. + // see @test "runc create (rootless + limits + no cgrouppath + no permission) fails with informative error" + if path == "" { + // We never created a path for this cgroup, so we cannot set + // limits for it (though we have already tried at this point). + return fmt.Errorf("cannot set %s limit: container could not join or create cgroup", sys.Name()) + } + return err + } + } + + return nil +} + +// Freeze toggles the container's freezer cgroup depending on the state +// provided +func (m *manager) Freeze(state configs.FreezerState) error { + path := m.Path("freezer") + if path == "" { + return errors.New("cannot toggle freezer: cgroups not configured for container") + } + + prevState := m.cgroups.Resources.Freezer + m.cgroups.Resources.Freezer = state + freezer := &FreezerGroup{} + if err := freezer.Set(path, m.cgroups.Resources); err != nil { + m.cgroups.Resources.Freezer = prevState + return err + } + return nil +} + +func (m *manager) GetPids() ([]int, error) { + return cgroups.GetPids(m.Path("devices")) +} + +func (m *manager) GetAllPids() ([]int, error) { + return cgroups.GetAllPids(m.Path("devices")) +} + +func (m *manager) GetPaths() map[string]string { + m.mu.Lock() + defer m.mu.Unlock() + return m.paths +} + +func (m *manager) GetCgroups() (*configs.Cgroup, error) { + return m.cgroups, nil +} + +func (m *manager) GetFreezerState() (configs.FreezerState, error) { + dir := m.Path("freezer") + // If the container doesn't have the freezer cgroup, say it's undefined. + if dir == "" { + return configs.Undefined, nil + } + freezer := &FreezerGroup{} + return freezer.GetState(dir) +} + +func (m *manager) Exists() bool { + return cgroups.PathExists(m.Path("devices")) +} + +func OOMKillCount(path string) (uint64, error) { + return fscommon.GetValueByKey(path, "memory.oom_control", "oom_kill") +} + +func (m *manager) OOMKillCount() (uint64, error) { + c, err := OOMKillCount(m.Path("memory")) + // Ignore ENOENT when rootless as it couldn't create cgroup. + if err != nil && m.cgroups.Rootless && os.IsNotExist(err) { + err = nil + } + + return c, err +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/hugetlb.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/hugetlb.go new file mode 100644 index 000000000..8ddd6fdd8 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/hugetlb.go @@ -0,0 +1,62 @@ +package fs + +import ( + "strconv" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" + "github.com/opencontainers/runc/libcontainer/configs" +) + +type HugetlbGroup struct{} + +func (s *HugetlbGroup) Name() string { + return "hugetlb" +} + +func (s *HugetlbGroup) Apply(path string, _ *configs.Resources, pid int) error { + return apply(path, pid) +} + +func (s *HugetlbGroup) Set(path string, r *configs.Resources) error { + for _, hugetlb := range r.HugetlbLimit { + if err := cgroups.WriteFile(path, "hugetlb."+hugetlb.Pagesize+".limit_in_bytes", strconv.FormatUint(hugetlb.Limit, 10)); err != nil { + return err + } + } + + return nil +} + +func (s *HugetlbGroup) GetStats(path string, stats *cgroups.Stats) error { + if !cgroups.PathExists(path) { + return nil + } + hugetlbStats := cgroups.HugetlbStats{} + for _, pageSize := range cgroups.HugePageSizes() { + usage := "hugetlb." + pageSize + ".usage_in_bytes" + value, err := fscommon.GetCgroupParamUint(path, usage) + if err != nil { + return err + } + hugetlbStats.Usage = value + + maxUsage := "hugetlb." + pageSize + ".max_usage_in_bytes" + value, err = fscommon.GetCgroupParamUint(path, maxUsage) + if err != nil { + return err + } + hugetlbStats.MaxUsage = value + + failcnt := "hugetlb." + pageSize + ".failcnt" + value, err = fscommon.GetCgroupParamUint(path, failcnt) + if err != nil { + return err + } + hugetlbStats.Failcnt = value + + stats.HugetlbStats[pageSize] = hugetlbStats + } + + return nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/memory.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/memory.go new file mode 100644 index 000000000..b7c75f941 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/memory.go @@ -0,0 +1,348 @@ +package fs + +import ( + "bufio" + "errors" + "fmt" + "math" + "os" + "path/filepath" + "strconv" + "strings" + + "golang.org/x/sys/unix" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" + "github.com/opencontainers/runc/libcontainer/configs" +) + +const ( + cgroupMemorySwapLimit = "memory.memsw.limit_in_bytes" + cgroupMemoryLimit = "memory.limit_in_bytes" + cgroupMemoryUsage = "memory.usage_in_bytes" + cgroupMemoryMaxUsage = "memory.max_usage_in_bytes" +) + +type MemoryGroup struct{} + +func (s *MemoryGroup) Name() string { + return "memory" +} + +func (s *MemoryGroup) Apply(path string, _ *configs.Resources, pid int) error { + return apply(path, pid) +} + +func setMemory(path string, val int64) error { + if val == 0 { + return nil + } + + err := cgroups.WriteFile(path, cgroupMemoryLimit, strconv.FormatInt(val, 10)) + if !errors.Is(err, unix.EBUSY) { + return err + } + + // EBUSY means the kernel can't set new limit as it's too low + // (lower than the current usage). Return more specific error. + usage, err := fscommon.GetCgroupParamUint(path, cgroupMemoryUsage) + if err != nil { + return err + } + max, err := fscommon.GetCgroupParamUint(path, cgroupMemoryMaxUsage) + if err != nil { + return err + } + + return fmt.Errorf("unable to set memory limit to %d (current usage: %d, peak usage: %d)", val, usage, max) +} + +func setSwap(path string, val int64) error { + if val == 0 { + return nil + } + + return cgroups.WriteFile(path, cgroupMemorySwapLimit, strconv.FormatInt(val, 10)) +} + +func setMemoryAndSwap(path string, r *configs.Resources) error { + // If the memory update is set to -1 and the swap is not explicitly + // set, we should also set swap to -1, it means unlimited memory. + if r.Memory == -1 && r.MemorySwap == 0 { + // Only set swap if it's enabled in kernel + if cgroups.PathExists(filepath.Join(path, cgroupMemorySwapLimit)) { + r.MemorySwap = -1 + } + } + + // When memory and swap memory are both set, we need to handle the cases + // for updating container. + if r.Memory != 0 && r.MemorySwap != 0 { + curLimit, err := fscommon.GetCgroupParamUint(path, cgroupMemoryLimit) + if err != nil { + return err + } + + // When update memory limit, we should adapt the write sequence + // for memory and swap memory, so it won't fail because the new + // value and the old value don't fit kernel's validation. + if r.MemorySwap == -1 || curLimit < uint64(r.MemorySwap) { + if err := setSwap(path, r.MemorySwap); err != nil { + return err + } + if err := setMemory(path, r.Memory); err != nil { + return err + } + return nil + } + } + + if err := setMemory(path, r.Memory); err != nil { + return err + } + if err := setSwap(path, r.MemorySwap); err != nil { + return err + } + + return nil +} + +func (s *MemoryGroup) Set(path string, r *configs.Resources) error { + if err := setMemoryAndSwap(path, r); err != nil { + return err + } + + // ignore KernelMemory and KernelMemoryTCP + + if r.MemoryReservation != 0 { + if err := cgroups.WriteFile(path, "memory.soft_limit_in_bytes", strconv.FormatInt(r.MemoryReservation, 10)); err != nil { + return err + } + } + + if r.OomKillDisable { + if err := cgroups.WriteFile(path, "memory.oom_control", "1"); err != nil { + return err + } + } + if r.MemorySwappiness == nil || int64(*r.MemorySwappiness) == -1 { + return nil + } else if *r.MemorySwappiness <= 100 { + if err := cgroups.WriteFile(path, "memory.swappiness", strconv.FormatUint(*r.MemorySwappiness, 10)); err != nil { + return err + } + } else { + return fmt.Errorf("invalid memory swappiness value: %d (valid range is 0-100)", *r.MemorySwappiness) + } + + return nil +} + +func (s *MemoryGroup) GetStats(path string, stats *cgroups.Stats) error { + const file = "memory.stat" + statsFile, err := cgroups.OpenFile(path, file, os.O_RDONLY) + if err != nil { + if os.IsNotExist(err) { + return nil + } + return err + } + defer statsFile.Close() + + sc := bufio.NewScanner(statsFile) + for sc.Scan() { + t, v, err := fscommon.ParseKeyValue(sc.Text()) + if err != nil { + return &parseError{Path: path, File: file, Err: err} + } + stats.MemoryStats.Stats[t] = v + } + stats.MemoryStats.Cache = stats.MemoryStats.Stats["cache"] + + memoryUsage, err := getMemoryData(path, "") + if err != nil { + return err + } + stats.MemoryStats.Usage = memoryUsage + swapUsage, err := getMemoryData(path, "memsw") + if err != nil { + return err + } + stats.MemoryStats.SwapUsage = swapUsage + kernelUsage, err := getMemoryData(path, "kmem") + if err != nil { + return err + } + stats.MemoryStats.KernelUsage = kernelUsage + kernelTCPUsage, err := getMemoryData(path, "kmem.tcp") + if err != nil { + return err + } + stats.MemoryStats.KernelTCPUsage = kernelTCPUsage + + value, err := fscommon.GetCgroupParamUint(path, "memory.use_hierarchy") + if err != nil { + return err + } + if value == 1 { + stats.MemoryStats.UseHierarchy = true + } + + pagesByNUMA, err := getPageUsageByNUMA(path) + if err != nil { + return err + } + stats.MemoryStats.PageUsageByNUMA = pagesByNUMA + + return nil +} + +func getMemoryData(path, name string) (cgroups.MemoryData, error) { + memoryData := cgroups.MemoryData{} + + moduleName := "memory" + if name != "" { + moduleName = "memory." + name + } + var ( + usage = moduleName + ".usage_in_bytes" + maxUsage = moduleName + ".max_usage_in_bytes" + failcnt = moduleName + ".failcnt" + limit = moduleName + ".limit_in_bytes" + ) + + value, err := fscommon.GetCgroupParamUint(path, usage) + if err != nil { + if name != "" && os.IsNotExist(err) { + // Ignore ENOENT as swap and kmem controllers + // are optional in the kernel. + return cgroups.MemoryData{}, nil + } + return cgroups.MemoryData{}, err + } + memoryData.Usage = value + value, err = fscommon.GetCgroupParamUint(path, maxUsage) + if err != nil { + return cgroups.MemoryData{}, err + } + memoryData.MaxUsage = value + value, err = fscommon.GetCgroupParamUint(path, failcnt) + if err != nil { + return cgroups.MemoryData{}, err + } + memoryData.Failcnt = value + value, err = fscommon.GetCgroupParamUint(path, limit) + if err != nil { + return cgroups.MemoryData{}, err + } + memoryData.Limit = value + + return memoryData, nil +} + +func getPageUsageByNUMA(path string) (cgroups.PageUsageByNUMA, error) { + const ( + maxColumns = math.MaxUint8 + 1 + file = "memory.numa_stat" + ) + stats := cgroups.PageUsageByNUMA{} + + fd, err := cgroups.OpenFile(path, file, os.O_RDONLY) + if os.IsNotExist(err) { + return stats, nil + } else if err != nil { + return stats, err + } + defer fd.Close() + + // File format is documented in linux/Documentation/cgroup-v1/memory.txt + // and it looks like this: + // + // total= N0= N1= ... + // file= N0= N1= ... + // anon= N0= N1= ... + // unevictable= N0= N1= ... + // hierarchical_= N0= N1= ... + + scanner := bufio.NewScanner(fd) + for scanner.Scan() { + var field *cgroups.PageStats + + line := scanner.Text() + columns := strings.SplitN(line, " ", maxColumns) + for i, column := range columns { + byNode := strings.SplitN(column, "=", 2) + // Some custom kernels have non-standard fields, like + // numa_locality 0 0 0 0 0 0 0 0 0 0 + // numa_exectime 0 + if len(byNode) < 2 { + if i == 0 { + // Ignore/skip those. + break + } else { + // The first column was already validated, + // so be strict to the rest. + return stats, malformedLine(path, file, line) + } + } + key, val := byNode[0], byNode[1] + if i == 0 { // First column: key is name, val is total. + field = getNUMAField(&stats, key) + if field == nil { // unknown field (new kernel?) + break + } + field.Total, err = strconv.ParseUint(val, 0, 64) + if err != nil { + return stats, &parseError{Path: path, File: file, Err: err} + } + field.Nodes = map[uint8]uint64{} + } else { // Subsequent columns: key is N, val is usage. + if len(key) < 2 || key[0] != 'N' { + // This is definitely an error. + return stats, malformedLine(path, file, line) + } + + n, err := strconv.ParseUint(key[1:], 10, 8) + if err != nil { + return stats, &parseError{Path: path, File: file, Err: err} + } + + usage, err := strconv.ParseUint(val, 10, 64) + if err != nil { + return stats, &parseError{Path: path, File: file, Err: err} + } + + field.Nodes[uint8(n)] = usage + } + + } + } + if err := scanner.Err(); err != nil { + return cgroups.PageUsageByNUMA{}, &parseError{Path: path, File: file, Err: err} + } + + return stats, nil +} + +func getNUMAField(stats *cgroups.PageUsageByNUMA, name string) *cgroups.PageStats { + switch name { + case "total": + return &stats.Total + case "file": + return &stats.File + case "anon": + return &stats.Anon + case "unevictable": + return &stats.Unevictable + case "hierarchical_total": + return &stats.Hierarchical.Total + case "hierarchical_file": + return &stats.Hierarchical.File + case "hierarchical_anon": + return &stats.Hierarchical.Anon + case "hierarchical_unevictable": + return &stats.Hierarchical.Unevictable + } + return nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/name.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/name.go new file mode 100644 index 000000000..b8d5d849c --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/name.go @@ -0,0 +1,31 @@ +package fs + +import ( + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/configs" +) + +type NameGroup struct { + GroupName string + Join bool +} + +func (s *NameGroup) Name() string { + return s.GroupName +} + +func (s *NameGroup) Apply(path string, _ *configs.Resources, pid int) error { + if s.Join { + // Ignore errors if the named cgroup does not exist. + _ = apply(path, pid) + } + return nil +} + +func (s *NameGroup) Set(_ string, _ *configs.Resources) error { + return nil +} + +func (s *NameGroup) GetStats(path string, stats *cgroups.Stats) error { + return nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/net_cls.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/net_cls.go new file mode 100644 index 000000000..abfd09ce8 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/net_cls.go @@ -0,0 +1,32 @@ +package fs + +import ( + "strconv" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/configs" +) + +type NetClsGroup struct{} + +func (s *NetClsGroup) Name() string { + return "net_cls" +} + +func (s *NetClsGroup) Apply(path string, _ *configs.Resources, pid int) error { + return apply(path, pid) +} + +func (s *NetClsGroup) Set(path string, r *configs.Resources) error { + if r.NetClsClassid != 0 { + if err := cgroups.WriteFile(path, "net_cls.classid", strconv.FormatUint(uint64(r.NetClsClassid), 10)); err != nil { + return err + } + } + + return nil +} + +func (s *NetClsGroup) GetStats(path string, stats *cgroups.Stats) error { + return nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/net_prio.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/net_prio.go new file mode 100644 index 000000000..da74d3779 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/net_prio.go @@ -0,0 +1,30 @@ +package fs + +import ( + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/configs" +) + +type NetPrioGroup struct{} + +func (s *NetPrioGroup) Name() string { + return "net_prio" +} + +func (s *NetPrioGroup) Apply(path string, _ *configs.Resources, pid int) error { + return apply(path, pid) +} + +func (s *NetPrioGroup) Set(path string, r *configs.Resources) error { + for _, prioMap := range r.NetPrioIfpriomap { + if err := cgroups.WriteFile(path, "net_prio.ifpriomap", prioMap.CgroupString()); err != nil { + return err + } + } + + return nil +} + +func (s *NetPrioGroup) GetStats(path string, stats *cgroups.Stats) error { + return nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/paths.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/paths.go new file mode 100644 index 000000000..1092331b2 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/paths.go @@ -0,0 +1,186 @@ +package fs + +import ( + "errors" + "os" + "path/filepath" + "sync" + + "golang.org/x/sys/unix" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/utils" +) + +// The absolute path to the root of the cgroup hierarchies. +var ( + cgroupRootLock sync.Mutex + cgroupRoot string +) + +const defaultCgroupRoot = "/sys/fs/cgroup" + +func initPaths(cg *configs.Cgroup) (map[string]string, error) { + root, err := rootPath() + if err != nil { + return nil, err + } + + inner, err := innerPath(cg) + if err != nil { + return nil, err + } + + paths := make(map[string]string) + for _, sys := range subsystems { + name := sys.Name() + path, err := subsysPath(root, inner, name) + if err != nil { + // The non-presence of the devices subsystem + // is considered fatal for security reasons. + if cgroups.IsNotFound(err) && (cg.SkipDevices || name != "devices") { + continue + } + + return nil, err + } + paths[name] = path + } + + return paths, nil +} + +func tryDefaultCgroupRoot() string { + var st, pst unix.Stat_t + + // (1) it should be a directory... + err := unix.Lstat(defaultCgroupRoot, &st) + if err != nil || st.Mode&unix.S_IFDIR == 0 { + return "" + } + + // (2) ... and a mount point ... + err = unix.Lstat(filepath.Dir(defaultCgroupRoot), &pst) + if err != nil { + return "" + } + + if st.Dev == pst.Dev { + // parent dir has the same dev -- not a mount point + return "" + } + + // (3) ... of 'tmpfs' fs type. + var fst unix.Statfs_t + err = unix.Statfs(defaultCgroupRoot, &fst) + if err != nil || fst.Type != unix.TMPFS_MAGIC { + return "" + } + + // (4) it should have at least 1 entry ... + dir, err := os.Open(defaultCgroupRoot) + if err != nil { + return "" + } + names, err := dir.Readdirnames(1) + if err != nil { + return "" + } + if len(names) < 1 { + return "" + } + // ... which is a cgroup mount point. + err = unix.Statfs(filepath.Join(defaultCgroupRoot, names[0]), &fst) + if err != nil || fst.Type != unix.CGROUP_SUPER_MAGIC { + return "" + } + + return defaultCgroupRoot +} + +// rootPath finds and returns path to the root of the cgroup hierarchies. +func rootPath() (string, error) { + cgroupRootLock.Lock() + defer cgroupRootLock.Unlock() + + if cgroupRoot != "" { + return cgroupRoot, nil + } + + // fast path + cgroupRoot = tryDefaultCgroupRoot() + if cgroupRoot != "" { + return cgroupRoot, nil + } + + // slow path: parse mountinfo + mi, err := cgroups.GetCgroupMounts(false) + if err != nil { + return "", err + } + if len(mi) < 1 { + return "", errors.New("no cgroup mount found in mountinfo") + } + + // Get the first cgroup mount (e.g. "/sys/fs/cgroup/memory"), + // use its parent directory. + root := filepath.Dir(mi[0].Mountpoint) + + if _, err := os.Stat(root); err != nil { + return "", err + } + + cgroupRoot = root + return cgroupRoot, nil +} + +func innerPath(c *configs.Cgroup) (string, error) { + if (c.Name != "" || c.Parent != "") && c.Path != "" { + return "", errors.New("cgroup: either Path or Name and Parent should be used") + } + + // XXX: Do not remove CleanPath. Path safety is important! -- cyphar + innerPath := utils.CleanPath(c.Path) + if innerPath == "" { + cgParent := utils.CleanPath(c.Parent) + cgName := utils.CleanPath(c.Name) + innerPath = filepath.Join(cgParent, cgName) + } + + return innerPath, nil +} + +func subsysPath(root, inner, subsystem string) (string, error) { + // If the cgroup name/path is absolute do not look relative to the cgroup of the init process. + if filepath.IsAbs(inner) { + mnt, err := cgroups.FindCgroupMountpoint(root, subsystem) + // If we didn't mount the subsystem, there is no point we make the path. + if err != nil { + return "", err + } + + // Sometimes subsystems can be mounted together as 'cpu,cpuacct'. + return filepath.Join(root, filepath.Base(mnt), inner), nil + } + + // Use GetOwnCgroupPath instead of GetInitCgroupPath, because the creating + // process could in container and shared pid namespace with host, and + // /proc/1/cgroup could point to whole other world of cgroups. + parentPath, err := cgroups.GetOwnCgroupPath(subsystem) + if err != nil { + return "", err + } + + return filepath.Join(parentPath, inner), nil +} + +func apply(path string, pid int) error { + if path == "" { + return nil + } + if err := os.MkdirAll(path, 0o755); err != nil { + return err + } + return cgroups.WriteCgroupProc(path, pid) +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/perf_event.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/perf_event.go new file mode 100644 index 000000000..b86955c8f --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/perf_event.go @@ -0,0 +1,24 @@ +package fs + +import ( + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/configs" +) + +type PerfEventGroup struct{} + +func (s *PerfEventGroup) Name() string { + return "perf_event" +} + +func (s *PerfEventGroup) Apply(path string, _ *configs.Resources, pid int) error { + return apply(path, pid) +} + +func (s *PerfEventGroup) Set(_ string, _ *configs.Resources) error { + return nil +} + +func (s *PerfEventGroup) GetStats(path string, stats *cgroups.Stats) error { + return nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/pids.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/pids.go new file mode 100644 index 000000000..1f13532a5 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/pids.go @@ -0,0 +1,62 @@ +package fs + +import ( + "math" + "strconv" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" + "github.com/opencontainers/runc/libcontainer/configs" +) + +type PidsGroup struct{} + +func (s *PidsGroup) Name() string { + return "pids" +} + +func (s *PidsGroup) Apply(path string, _ *configs.Resources, pid int) error { + return apply(path, pid) +} + +func (s *PidsGroup) Set(path string, r *configs.Resources) error { + if r.PidsLimit != 0 { + // "max" is the fallback value. + limit := "max" + + if r.PidsLimit > 0 { + limit = strconv.FormatInt(r.PidsLimit, 10) + } + + if err := cgroups.WriteFile(path, "pids.max", limit); err != nil { + return err + } + } + + return nil +} + +func (s *PidsGroup) GetStats(path string, stats *cgroups.Stats) error { + if !cgroups.PathExists(path) { + return nil + } + current, err := fscommon.GetCgroupParamUint(path, "pids.current") + if err != nil { + return err + } + + max, err := fscommon.GetCgroupParamUint(path, "pids.max") + if err != nil { + return err + } + // If no limit is set, read from pids.max returns "max", which is + // converted to MaxUint64 by GetCgroupParamUint. Historically, we + // represent "no limit" for pids as 0, thus this conversion. + if max == math.MaxUint64 { + max = 0 + } + + stats.PidsStats.Current = current + stats.PidsStats.Limit = max + return nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/rdma.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/rdma.go new file mode 100644 index 000000000..5bbe0f35f --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/rdma.go @@ -0,0 +1,25 @@ +package fs + +import ( + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" + "github.com/opencontainers/runc/libcontainer/configs" +) + +type RdmaGroup struct{} + +func (s *RdmaGroup) Name() string { + return "rdma" +} + +func (s *RdmaGroup) Apply(path string, _ *configs.Resources, pid int) error { + return apply(path, pid) +} + +func (s *RdmaGroup) Set(path string, r *configs.Resources) error { + return fscommon.RdmaSet(path, r) +} + +func (s *RdmaGroup) GetStats(path string, stats *cgroups.Stats) error { + return fscommon.RdmaGetStats(path, stats) +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/cpu.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/cpu.go new file mode 100644 index 000000000..bbbae4d58 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/cpu.go @@ -0,0 +1,87 @@ +package fs2 + +import ( + "bufio" + "os" + "strconv" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" + "github.com/opencontainers/runc/libcontainer/configs" +) + +func isCpuSet(r *configs.Resources) bool { + return r.CpuWeight != 0 || r.CpuQuota != 0 || r.CpuPeriod != 0 +} + +func setCpu(dirPath string, r *configs.Resources) error { + if !isCpuSet(r) { + return nil + } + + // NOTE: .CpuShares is not used here. Conversion is the caller's responsibility. + if r.CpuWeight != 0 { + if err := cgroups.WriteFile(dirPath, "cpu.weight", strconv.FormatUint(r.CpuWeight, 10)); err != nil { + return err + } + } + + if r.CpuQuota != 0 || r.CpuPeriod != 0 { + str := "max" + if r.CpuQuota > 0 { + str = strconv.FormatInt(r.CpuQuota, 10) + } + period := r.CpuPeriod + if period == 0 { + // This default value is documented in + // https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html + period = 100000 + } + str += " " + strconv.FormatUint(period, 10) + if err := cgroups.WriteFile(dirPath, "cpu.max", str); err != nil { + return err + } + } + + return nil +} + +func statCpu(dirPath string, stats *cgroups.Stats) error { + const file = "cpu.stat" + f, err := cgroups.OpenFile(dirPath, file, os.O_RDONLY) + if err != nil { + return err + } + defer f.Close() + + sc := bufio.NewScanner(f) + for sc.Scan() { + t, v, err := fscommon.ParseKeyValue(sc.Text()) + if err != nil { + return &parseError{Path: dirPath, File: file, Err: err} + } + switch t { + case "usage_usec": + stats.CpuStats.CpuUsage.TotalUsage = v * 1000 + + case "user_usec": + stats.CpuStats.CpuUsage.UsageInUsermode = v * 1000 + + case "system_usec": + stats.CpuStats.CpuUsage.UsageInKernelmode = v * 1000 + + case "nr_periods": + stats.CpuStats.ThrottlingData.Periods = v + + case "nr_throttled": + stats.CpuStats.ThrottlingData.ThrottledPeriods = v + + case "throttled_usec": + stats.CpuStats.ThrottlingData.ThrottledTime = v * 1000 + } + } + if err := sc.Err(); err != nil { + return &parseError{Path: dirPath, File: file, Err: err} + } + return nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/cpuset.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/cpuset.go new file mode 100644 index 000000000..16c45bad8 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/cpuset.go @@ -0,0 +1,28 @@ +package fs2 + +import ( + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/configs" +) + +func isCpusetSet(r *configs.Resources) bool { + return r.CpusetCpus != "" || r.CpusetMems != "" +} + +func setCpuset(dirPath string, r *configs.Resources) error { + if !isCpusetSet(r) { + return nil + } + + if r.CpusetCpus != "" { + if err := cgroups.WriteFile(dirPath, "cpuset.cpus", r.CpusetCpus); err != nil { + return err + } + } + if r.CpusetMems != "" { + if err := cgroups.WriteFile(dirPath, "cpuset.mems", r.CpusetMems); err != nil { + return err + } + } + return nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/create.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/create.go new file mode 100644 index 000000000..641123a4d --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/create.go @@ -0,0 +1,152 @@ +package fs2 + +import ( + "fmt" + "os" + "path/filepath" + "strings" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/configs" +) + +func supportedControllers() (string, error) { + return cgroups.ReadFile(UnifiedMountpoint, "/cgroup.controllers") +} + +// needAnyControllers returns whether we enable some supported controllers or not, +// based on (1) controllers available and (2) resources that are being set. +// We don't check "pseudo" controllers such as +// "freezer" and "devices". +func needAnyControllers(r *configs.Resources) (bool, error) { + if r == nil { + return false, nil + } + + // list of all available controllers + content, err := supportedControllers() + if err != nil { + return false, err + } + avail := make(map[string]struct{}) + for _, ctr := range strings.Fields(content) { + avail[ctr] = struct{}{} + } + + // check whether the controller if available or not + have := func(controller string) bool { + _, ok := avail[controller] + return ok + } + + if isPidsSet(r) && have("pids") { + return true, nil + } + if isMemorySet(r) && have("memory") { + return true, nil + } + if isIoSet(r) && have("io") { + return true, nil + } + if isCpuSet(r) && have("cpu") { + return true, nil + } + if isCpusetSet(r) && have("cpuset") { + return true, nil + } + if isHugeTlbSet(r) && have("hugetlb") { + return true, nil + } + + return false, nil +} + +// containsDomainController returns whether the current config contains domain controller or not. +// Refer to: http://man7.org/linux/man-pages/man7/cgroups.7.html +// As at Linux 4.19, the following controllers are threaded: cpu, perf_event, and pids. +func containsDomainController(r *configs.Resources) bool { + return isMemorySet(r) || isIoSet(r) || isCpuSet(r) || isHugeTlbSet(r) +} + +// CreateCgroupPath creates cgroupv2 path, enabling all the supported controllers. +func CreateCgroupPath(path string, c *configs.Cgroup) (Err error) { + if !strings.HasPrefix(path, UnifiedMountpoint) { + return fmt.Errorf("invalid cgroup path %s", path) + } + + content, err := supportedControllers() + if err != nil { + return err + } + + const ( + cgTypeFile = "cgroup.type" + cgStCtlFile = "cgroup.subtree_control" + ) + ctrs := strings.Fields(content) + res := "+" + strings.Join(ctrs, " +") + + elements := strings.Split(path, "/") + elements = elements[3:] + current := "/sys/fs" + for i, e := range elements { + current = filepath.Join(current, e) + if i > 0 { + if err := os.Mkdir(current, 0o755); err != nil { + if !os.IsExist(err) { + return err + } + } else { + // If the directory was created, be sure it is not left around on errors. + current := current + defer func() { + if Err != nil { + os.Remove(current) + } + }() + } + cgType, _ := cgroups.ReadFile(current, cgTypeFile) + cgType = strings.TrimSpace(cgType) + switch cgType { + // If the cgroup is in an invalid mode (usually this means there's an internal + // process in the cgroup tree, because we created a cgroup under an + // already-populated-by-other-processes cgroup), then we have to error out if + // the user requested controllers which are not thread-aware. However, if all + // the controllers requested are thread-aware we can simply put the cgroup into + // threaded mode. + case "domain invalid": + if containsDomainController(c.Resources) { + return fmt.Errorf("cannot enter cgroupv2 %q with domain controllers -- it is in an invalid state", current) + } else { + // Not entirely correct (in theory we'd always want to be a domain -- + // since that means we're a properly delegated cgroup subtree) but in + // this case there's not much we can do and it's better than giving an + // error. + _ = cgroups.WriteFile(current, cgTypeFile, "threaded") + } + // If the cgroup is in (threaded) or (domain threaded) mode, we can only use thread-aware controllers + // (and you cannot usually take a cgroup out of threaded mode). + case "domain threaded": + fallthrough + case "threaded": + if containsDomainController(c.Resources) { + return fmt.Errorf("cannot enter cgroupv2 %q with domain controllers -- it is in %s mode", current, cgType) + } + } + } + // enable all supported controllers + if i < len(elements)-1 { + if err := cgroups.WriteFile(current, cgStCtlFile, res); err != nil { + // try write one by one + allCtrs := strings.Split(res, " ") + for _, ctr := range allCtrs { + _ = cgroups.WriteFile(current, cgStCtlFile, ctr) + } + } + // Some controllers might not be enabled when rootless or containerized, + // but we don't catch the error here. (Caught in setXXX() functions.) + } + } + + return nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/defaultpath.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/defaultpath.go new file mode 100644 index 000000000..9c949c91f --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/defaultpath.go @@ -0,0 +1,99 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package fs2 + +import ( + "bufio" + "errors" + "fmt" + "io" + "os" + "path/filepath" + "strings" + + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/utils" +) + +const UnifiedMountpoint = "/sys/fs/cgroup" + +func defaultDirPath(c *configs.Cgroup) (string, error) { + if (c.Name != "" || c.Parent != "") && c.Path != "" { + return "", fmt.Errorf("cgroup: either Path or Name and Parent should be used, got %+v", c) + } + + return _defaultDirPath(UnifiedMountpoint, c.Path, c.Parent, c.Name) +} + +func _defaultDirPath(root, cgPath, cgParent, cgName string) (string, error) { + if (cgName != "" || cgParent != "") && cgPath != "" { + return "", errors.New("cgroup: either Path or Name and Parent should be used") + } + + // XXX: Do not remove CleanPath. Path safety is important! -- cyphar + innerPath := utils.CleanPath(cgPath) + if innerPath == "" { + cgParent := utils.CleanPath(cgParent) + cgName := utils.CleanPath(cgName) + innerPath = filepath.Join(cgParent, cgName) + } + if filepath.IsAbs(innerPath) { + return filepath.Join(root, innerPath), nil + } + + ownCgroup, err := parseCgroupFile("/proc/self/cgroup") + if err != nil { + return "", err + } + // The current user scope most probably has tasks in it already, + // making it impossible to enable controllers for its sub-cgroup. + // A parent cgroup (with no tasks in it) is what we need. + ownCgroup = filepath.Dir(ownCgroup) + + return filepath.Join(root, ownCgroup, innerPath), nil +} + +// parseCgroupFile parses /proc/PID/cgroup file and return string +func parseCgroupFile(path string) (string, error) { + f, err := os.Open(path) + if err != nil { + return "", err + } + defer f.Close() + return parseCgroupFromReader(f) +} + +func parseCgroupFromReader(r io.Reader) (string, error) { + s := bufio.NewScanner(r) + for s.Scan() { + var ( + text = s.Text() + parts = strings.SplitN(text, ":", 3) + ) + if len(parts) < 3 { + return "", fmt.Errorf("invalid cgroup entry: %q", text) + } + // text is like "0::/user.slice/user-1001.slice/session-1.scope" + if parts[0] == "0" && parts[1] == "" { + return parts[2], nil + } + } + if err := s.Err(); err != nil { + return "", err + } + return "", errors.New("cgroup path not found") +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/devices.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/devices.go new file mode 100644 index 000000000..0d2345607 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/devices.go @@ -0,0 +1,75 @@ +package fs2 + +import ( + "fmt" + + "golang.org/x/sys/unix" + + "github.com/opencontainers/runc/libcontainer/cgroups/ebpf" + "github.com/opencontainers/runc/libcontainer/cgroups/ebpf/devicefilter" + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/devices" + "github.com/opencontainers/runc/libcontainer/userns" +) + +func isRWM(perms devices.Permissions) bool { + var r, w, m bool + for _, perm := range perms { + switch perm { + case 'r': + r = true + case 'w': + w = true + case 'm': + m = true + } + } + return r && w && m +} + +// This is similar to the logic applied in crun for handling errors from bpf(2) +// . +func canSkipEBPFError(r *configs.Resources) bool { + // If we're running in a user namespace we can ignore eBPF rules because we + // usually cannot use bpf(2), as well as rootless containers usually don't + // have the necessary privileges to mknod(2) device inodes or access + // host-level instances (though ideally we would be blocking device access + // for rootless containers anyway). + if userns.RunningInUserNS() { + return true + } + + // We cannot ignore an eBPF load error if any rule if is a block rule or it + // doesn't permit all access modes. + // + // NOTE: This will sometimes trigger in cases where access modes are split + // between different rules but to handle this correctly would require + // using ".../libcontainer/cgroup/devices".Emulator. + for _, dev := range r.Devices { + if !dev.Allow || !isRWM(dev.Permissions) { + return false + } + } + return true +} + +func setDevices(dirPath string, r *configs.Resources) error { + if r.SkipDevices { + return nil + } + insts, license, err := devicefilter.DeviceFilter(r.Devices) + if err != nil { + return err + } + dirFD, err := unix.Open(dirPath, unix.O_DIRECTORY|unix.O_RDONLY, 0o600) + if err != nil { + return fmt.Errorf("cannot get dir FD for %s", dirPath) + } + defer unix.Close(dirFD) + if _, err := ebpf.LoadAttachCgroupDeviceFilter(insts, license, dirFD); err != nil { + if !canSkipEBPFError(r) { + return err + } + } + return nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/freezer.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/freezer.go new file mode 100644 index 000000000..8917a6411 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/freezer.go @@ -0,0 +1,127 @@ +package fs2 + +import ( + "bufio" + "errors" + "fmt" + "os" + "strings" + "time" + + "golang.org/x/sys/unix" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/configs" +) + +func setFreezer(dirPath string, state configs.FreezerState) error { + var stateStr string + switch state { + case configs.Undefined: + return nil + case configs.Frozen: + stateStr = "1" + case configs.Thawed: + stateStr = "0" + default: + return fmt.Errorf("invalid freezer state %q requested", state) + } + + fd, err := cgroups.OpenFile(dirPath, "cgroup.freeze", unix.O_RDWR) + if err != nil { + // We can ignore this request as long as the user didn't ask us to + // freeze the container (since without the freezer cgroup, that's a + // no-op). + if state != configs.Frozen { + return nil + } + return fmt.Errorf("freezer not supported: %w", err) + } + defer fd.Close() + + if _, err := fd.WriteString(stateStr); err != nil { + return err + } + // Confirm that the cgroup did actually change states. + if actualState, err := readFreezer(dirPath, fd); err != nil { + return err + } else if actualState != state { + return fmt.Errorf(`expected "cgroup.freeze" to be in state %q but was in %q`, state, actualState) + } + return nil +} + +func getFreezer(dirPath string) (configs.FreezerState, error) { + fd, err := cgroups.OpenFile(dirPath, "cgroup.freeze", unix.O_RDONLY) + if err != nil { + // If the kernel is too old, then we just treat the freezer as being in + // an "undefined" state. + if os.IsNotExist(err) || errors.Is(err, unix.ENODEV) { + err = nil + } + return configs.Undefined, err + } + defer fd.Close() + + return readFreezer(dirPath, fd) +} + +func readFreezer(dirPath string, fd *os.File) (configs.FreezerState, error) { + if _, err := fd.Seek(0, 0); err != nil { + return configs.Undefined, err + } + state := make([]byte, 2) + if _, err := fd.Read(state); err != nil { + return configs.Undefined, err + } + switch string(state) { + case "0\n": + return configs.Thawed, nil + case "1\n": + return waitFrozen(dirPath) + default: + return configs.Undefined, fmt.Errorf(`unknown "cgroup.freeze" state: %q`, state) + } +} + +// waitFrozen polls cgroup.events until it sees "frozen 1" in it. +func waitFrozen(dirPath string) (configs.FreezerState, error) { + fd, err := cgroups.OpenFile(dirPath, "cgroup.events", unix.O_RDONLY) + if err != nil { + return configs.Undefined, err + } + defer fd.Close() + + // XXX: Simple wait/read/retry is used here. An implementation + // based on poll(2) or inotify(7) is possible, but it makes the code + // much more complicated. Maybe address this later. + const ( + // Perform maxIter with waitTime in between iterations. + waitTime = 10 * time.Millisecond + maxIter = 1000 + ) + scanner := bufio.NewScanner(fd) + for i := 0; scanner.Scan(); { + if i == maxIter { + return configs.Undefined, fmt.Errorf("timeout of %s reached waiting for the cgroup to freeze", waitTime*maxIter) + } + line := scanner.Text() + val := strings.TrimPrefix(line, "frozen ") + if val != line { // got prefix + if val[0] == '1' { + return configs.Frozen, nil + } + + i++ + // wait, then re-read + time.Sleep(waitTime) + _, err := fd.Seek(0, 0) + if err != nil { + return configs.Undefined, err + } + } + } + // Should only reach here either on read error, + // or if the file does not contain "frozen " line. + return configs.Undefined, scanner.Err() +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/fs2.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/fs2.go new file mode 100644 index 000000000..492778e31 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/fs2.go @@ -0,0 +1,259 @@ +package fs2 + +import ( + "errors" + "fmt" + "os" + "strings" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" + "github.com/opencontainers/runc/libcontainer/configs" +) + +type parseError = fscommon.ParseError + +type manager struct { + config *configs.Cgroup + // dirPath is like "/sys/fs/cgroup/user.slice/user-1001.slice/session-1.scope" + dirPath string + // controllers is content of "cgroup.controllers" file. + // excludes pseudo-controllers ("devices" and "freezer"). + controllers map[string]struct{} +} + +// NewManager creates a manager for cgroup v2 unified hierarchy. +// dirPath is like "/sys/fs/cgroup/user.slice/user-1001.slice/session-1.scope". +// If dirPath is empty, it is automatically set using config. +func NewManager(config *configs.Cgroup, dirPath string) (cgroups.Manager, error) { + if dirPath == "" { + var err error + dirPath, err = defaultDirPath(config) + if err != nil { + return nil, err + } + } + + m := &manager{ + config: config, + dirPath: dirPath, + } + return m, nil +} + +func (m *manager) getControllers() error { + if m.controllers != nil { + return nil + } + + data, err := cgroups.ReadFile(m.dirPath, "cgroup.controllers") + if err != nil { + if m.config.Rootless && m.config.Path == "" { + return nil + } + return err + } + fields := strings.Fields(data) + m.controllers = make(map[string]struct{}, len(fields)) + for _, c := range fields { + m.controllers[c] = struct{}{} + } + + return nil +} + +func (m *manager) Apply(pid int) error { + if err := CreateCgroupPath(m.dirPath, m.config); err != nil { + // Related tests: + // - "runc create (no limits + no cgrouppath + no permission) succeeds" + // - "runc create (rootless + no limits + cgrouppath + no permission) fails with permission error" + // - "runc create (rootless + limits + no cgrouppath + no permission) fails with informative error" + if m.config.Rootless { + if m.config.Path == "" { + if blNeed, nErr := needAnyControllers(m.config.Resources); nErr == nil && !blNeed { + return nil + } + return fmt.Errorf("rootless needs no limits + no cgrouppath when no permission is granted for cgroups: %w", err) + } + } + return err + } + if err := cgroups.WriteCgroupProc(m.dirPath, pid); err != nil { + return err + } + return nil +} + +func (m *manager) GetPids() ([]int, error) { + return cgroups.GetPids(m.dirPath) +} + +func (m *manager) GetAllPids() ([]int, error) { + return cgroups.GetAllPids(m.dirPath) +} + +func (m *manager) GetStats() (*cgroups.Stats, error) { + var errs []error + + st := cgroups.NewStats() + + // pids (since kernel 4.5) + if err := statPids(m.dirPath, st); err != nil { + errs = append(errs, err) + } + // memory (since kernel 4.5) + if err := statMemory(m.dirPath, st); err != nil && !os.IsNotExist(err) { + errs = append(errs, err) + } + // io (since kernel 4.5) + if err := statIo(m.dirPath, st); err != nil && !os.IsNotExist(err) { + errs = append(errs, err) + } + // cpu (since kernel 4.15) + // Note cpu.stat is available even if the controller is not enabled. + if err := statCpu(m.dirPath, st); err != nil && !os.IsNotExist(err) { + errs = append(errs, err) + } + // hugetlb (since kernel 5.6) + if err := statHugeTlb(m.dirPath, st); err != nil && !os.IsNotExist(err) { + errs = append(errs, err) + } + // rdma (since kernel 4.11) + if err := fscommon.RdmaGetStats(m.dirPath, st); err != nil && !os.IsNotExist(err) { + errs = append(errs, err) + } + if len(errs) > 0 && !m.config.Rootless { + return st, fmt.Errorf("error while statting cgroup v2: %+v", errs) + } + return st, nil +} + +func (m *manager) Freeze(state configs.FreezerState) error { + if m.config.Resources == nil { + return errors.New("cannot toggle freezer: cgroups not configured for container") + } + if err := setFreezer(m.dirPath, state); err != nil { + return err + } + m.config.Resources.Freezer = state + return nil +} + +func (m *manager) Destroy() error { + return cgroups.RemovePath(m.dirPath) +} + +func (m *manager) Path(_ string) string { + return m.dirPath +} + +func (m *manager) Set(r *configs.Resources) error { + if r == nil { + return nil + } + if err := m.getControllers(); err != nil { + return err + } + // pids (since kernel 4.5) + if err := setPids(m.dirPath, r); err != nil { + return err + } + // memory (since kernel 4.5) + if err := setMemory(m.dirPath, r); err != nil { + return err + } + // io (since kernel 4.5) + if err := setIo(m.dirPath, r); err != nil { + return err + } + // cpu (since kernel 4.15) + if err := setCpu(m.dirPath, r); err != nil { + return err + } + // devices (since kernel 4.15, pseudo-controller) + // + // When rootless is true, errors from the device subsystem are ignored because it is really not expected to work. + // However, errors from other subsystems are not ignored. + // see @test "runc create (rootless + limits + no cgrouppath + no permission) fails with informative error" + if err := setDevices(m.dirPath, r); err != nil && !m.config.Rootless { + return err + } + // cpuset (since kernel 5.0) + if err := setCpuset(m.dirPath, r); err != nil { + return err + } + // hugetlb (since kernel 5.6) + if err := setHugeTlb(m.dirPath, r); err != nil { + return err + } + // rdma (since kernel 4.11) + if err := fscommon.RdmaSet(m.dirPath, r); err != nil { + return err + } + // freezer (since kernel 5.2, pseudo-controller) + if err := setFreezer(m.dirPath, r.Freezer); err != nil { + return err + } + if err := m.setUnified(r.Unified); err != nil { + return err + } + m.config.Resources = r + return nil +} + +func (m *manager) setUnified(res map[string]string) error { + for k, v := range res { + if strings.Contains(k, "/") { + return fmt.Errorf("unified resource %q must be a file name (no slashes)", k) + } + if err := cgroups.WriteFile(m.dirPath, k, v); err != nil { + // Check for both EPERM and ENOENT since O_CREAT is used by WriteFile. + if errors.Is(err, os.ErrPermission) || errors.Is(err, os.ErrNotExist) { + // Check if a controller is available, + // to give more specific error if not. + sk := strings.SplitN(k, ".", 2) + if len(sk) != 2 { + return fmt.Errorf("unified resource %q must be in the form CONTROLLER.PARAMETER", k) + } + c := sk[0] + if _, ok := m.controllers[c]; !ok && c != "cgroup" { + return fmt.Errorf("unified resource %q can't be set: controller %q not available", k, c) + } + } + return fmt.Errorf("unable to set unified resource %q: %w", k, err) + } + } + + return nil +} + +func (m *manager) GetPaths() map[string]string { + paths := make(map[string]string, 1) + paths[""] = m.dirPath + return paths +} + +func (m *manager) GetCgroups() (*configs.Cgroup, error) { + return m.config, nil +} + +func (m *manager) GetFreezerState() (configs.FreezerState, error) { + return getFreezer(m.dirPath) +} + +func (m *manager) Exists() bool { + return cgroups.PathExists(m.dirPath) +} + +func OOMKillCount(path string) (uint64, error) { + return fscommon.GetValueByKey(path, "memory.events", "oom_kill") +} + +func (m *manager) OOMKillCount() (uint64, error) { + c, err := OOMKillCount(m.dirPath) + if err != nil && m.config.Rootless && os.IsNotExist(err) { + err = nil + } + + return c, err +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/hugetlb.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/hugetlb.go new file mode 100644 index 000000000..c92a7e64a --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/hugetlb.go @@ -0,0 +1,48 @@ +package fs2 + +import ( + "strconv" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" + "github.com/opencontainers/runc/libcontainer/configs" +) + +func isHugeTlbSet(r *configs.Resources) bool { + return len(r.HugetlbLimit) > 0 +} + +func setHugeTlb(dirPath string, r *configs.Resources) error { + if !isHugeTlbSet(r) { + return nil + } + for _, hugetlb := range r.HugetlbLimit { + if err := cgroups.WriteFile(dirPath, "hugetlb."+hugetlb.Pagesize+".max", strconv.FormatUint(hugetlb.Limit, 10)); err != nil { + return err + } + } + + return nil +} + +func statHugeTlb(dirPath string, stats *cgroups.Stats) error { + hugetlbStats := cgroups.HugetlbStats{} + for _, pagesize := range cgroups.HugePageSizes() { + value, err := fscommon.GetCgroupParamUint(dirPath, "hugetlb."+pagesize+".current") + if err != nil { + return err + } + hugetlbStats.Usage = value + + fileName := "hugetlb." + pagesize + ".events" + value, err = fscommon.GetValueByKey(dirPath, fileName, "max") + if err != nil { + return err + } + hugetlbStats.Failcnt = value + + stats.HugetlbStats[pagesize] = hugetlbStats + } + + return nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/io.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/io.go new file mode 100644 index 000000000..b2ff7d340 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/io.go @@ -0,0 +1,193 @@ +package fs2 + +import ( + "bufio" + "bytes" + "fmt" + "os" + "strconv" + "strings" + + "github.com/sirupsen/logrus" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/configs" +) + +func isIoSet(r *configs.Resources) bool { + return r.BlkioWeight != 0 || + len(r.BlkioWeightDevice) > 0 || + len(r.BlkioThrottleReadBpsDevice) > 0 || + len(r.BlkioThrottleWriteBpsDevice) > 0 || + len(r.BlkioThrottleReadIOPSDevice) > 0 || + len(r.BlkioThrottleWriteIOPSDevice) > 0 +} + +// bfqDeviceWeightSupported checks for per-device BFQ weight support (added +// in kernel v5.4, commit 795fe54c2a8) by reading from "io.bfq.weight". +func bfqDeviceWeightSupported(bfq *os.File) bool { + if bfq == nil { + return false + } + _, _ = bfq.Seek(0, 0) + buf := make([]byte, 32) + _, _ = bfq.Read(buf) + // If only a single number (default weight) if read back, we have older kernel. + _, err := strconv.ParseInt(string(bytes.TrimSpace(buf)), 10, 64) + return err != nil +} + +func setIo(dirPath string, r *configs.Resources) error { + if !isIoSet(r) { + return nil + } + + // If BFQ IO scheduler is available, use it. + var bfq *os.File + if r.BlkioWeight != 0 || len(r.BlkioWeightDevice) > 0 { + var err error + bfq, err = cgroups.OpenFile(dirPath, "io.bfq.weight", os.O_RDWR) + if err == nil { + defer bfq.Close() + } else if !os.IsNotExist(err) { + return err + } + } + + if r.BlkioWeight != 0 { + if bfq != nil { // Use BFQ. + if _, err := bfq.WriteString(strconv.FormatUint(uint64(r.BlkioWeight), 10)); err != nil { + return err + } + } else { + // Fallback to io.weight with a conversion scheme. + v := cgroups.ConvertBlkIOToIOWeightValue(r.BlkioWeight) + if err := cgroups.WriteFile(dirPath, "io.weight", strconv.FormatUint(v, 10)); err != nil { + return err + } + } + } + if bfqDeviceWeightSupported(bfq) { + for _, wd := range r.BlkioWeightDevice { + if _, err := bfq.WriteString(wd.WeightString() + "\n"); err != nil { + return fmt.Errorf("setting device weight %q: %w", wd.WeightString(), err) + } + } + } + for _, td := range r.BlkioThrottleReadBpsDevice { + if err := cgroups.WriteFile(dirPath, "io.max", td.StringName("rbps")); err != nil { + return err + } + } + for _, td := range r.BlkioThrottleWriteBpsDevice { + if err := cgroups.WriteFile(dirPath, "io.max", td.StringName("wbps")); err != nil { + return err + } + } + for _, td := range r.BlkioThrottleReadIOPSDevice { + if err := cgroups.WriteFile(dirPath, "io.max", td.StringName("riops")); err != nil { + return err + } + } + for _, td := range r.BlkioThrottleWriteIOPSDevice { + if err := cgroups.WriteFile(dirPath, "io.max", td.StringName("wiops")); err != nil { + return err + } + } + + return nil +} + +func readCgroup2MapFile(dirPath string, name string) (map[string][]string, error) { + ret := map[string][]string{} + f, err := cgroups.OpenFile(dirPath, name, os.O_RDONLY) + if err != nil { + return nil, err + } + defer f.Close() + scanner := bufio.NewScanner(f) + for scanner.Scan() { + line := scanner.Text() + parts := strings.Fields(line) + if len(parts) < 2 { + continue + } + ret[parts[0]] = parts[1:] + } + if err := scanner.Err(); err != nil { + return nil, &parseError{Path: dirPath, File: name, Err: err} + } + return ret, nil +} + +func statIo(dirPath string, stats *cgroups.Stats) error { + const file = "io.stat" + values, err := readCgroup2MapFile(dirPath, file) + if err != nil { + return err + } + // more details on the io.stat file format: https://www.kernel.org/doc/Documentation/cgroup-v2.txt + var parsedStats cgroups.BlkioStats + for k, v := range values { + d := strings.Split(k, ":") + if len(d) != 2 { + continue + } + major, err := strconv.ParseUint(d[0], 10, 64) + if err != nil { + return &parseError{Path: dirPath, File: file, Err: err} + } + minor, err := strconv.ParseUint(d[1], 10, 64) + if err != nil { + return &parseError{Path: dirPath, File: file, Err: err} + } + + for _, item := range v { + d := strings.Split(item, "=") + if len(d) != 2 { + continue + } + op := d[0] + + // Map to the cgroupv1 naming and layout (in separate tables). + var targetTable *[]cgroups.BlkioStatEntry + switch op { + // Equivalent to cgroupv1's blkio.io_service_bytes. + case "rbytes": + op = "Read" + targetTable = &parsedStats.IoServiceBytesRecursive + case "wbytes": + op = "Write" + targetTable = &parsedStats.IoServiceBytesRecursive + // Equivalent to cgroupv1's blkio.io_serviced. + case "rios": + op = "Read" + targetTable = &parsedStats.IoServicedRecursive + case "wios": + op = "Write" + targetTable = &parsedStats.IoServicedRecursive + default: + // Skip over entries we cannot map to cgroupv1 stats for now. + // In the future we should expand the stats struct to include + // them. + logrus.Debugf("cgroupv2 io stats: skipping over unmappable %s entry", item) + continue + } + + value, err := strconv.ParseUint(d[1], 10, 64) + if err != nil { + return &parseError{Path: dirPath, File: file, Err: err} + } + + entry := cgroups.BlkioStatEntry{ + Op: op, + Major: major, + Minor: minor, + Value: value, + } + *targetTable = append(*targetTable, entry) + } + } + stats.BlkioStats = parsedStats + return nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/memory.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/memory.go new file mode 100644 index 000000000..adbc4b230 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/memory.go @@ -0,0 +1,216 @@ +package fs2 + +import ( + "bufio" + "errors" + "math" + "os" + "strconv" + "strings" + + "golang.org/x/sys/unix" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" + "github.com/opencontainers/runc/libcontainer/configs" +) + +// numToStr converts an int64 value to a string for writing to a +// cgroupv2 files with .min, .max, .low, or .high suffix. +// The value of -1 is converted to "max" for cgroupv1 compatibility +// (which used to write -1 to remove the limit). +func numToStr(value int64) (ret string) { + switch { + case value == 0: + ret = "" + case value == -1: + ret = "max" + default: + ret = strconv.FormatInt(value, 10) + } + + return ret +} + +func isMemorySet(r *configs.Resources) bool { + return r.MemoryReservation != 0 || r.Memory != 0 || r.MemorySwap != 0 +} + +func setMemory(dirPath string, r *configs.Resources) error { + if !isMemorySet(r) { + return nil + } + swap, err := cgroups.ConvertMemorySwapToCgroupV2Value(r.MemorySwap, r.Memory) + if err != nil { + return err + } + swapStr := numToStr(swap) + if swapStr == "" && swap == 0 && r.MemorySwap > 0 { + // memory and memorySwap set to the same value -- disable swap + swapStr = "0" + } + // never write empty string to `memory.swap.max`, it means set to 0. + if swapStr != "" { + if err := cgroups.WriteFile(dirPath, "memory.swap.max", swapStr); err != nil { + return err + } + } + + if val := numToStr(r.Memory); val != "" { + if err := cgroups.WriteFile(dirPath, "memory.max", val); err != nil { + return err + } + } + + // cgroup.Resources.KernelMemory is ignored + + if val := numToStr(r.MemoryReservation); val != "" { + if err := cgroups.WriteFile(dirPath, "memory.low", val); err != nil { + return err + } + } + + return nil +} + +func statMemory(dirPath string, stats *cgroups.Stats) error { + const file = "memory.stat" + statsFile, err := cgroups.OpenFile(dirPath, file, os.O_RDONLY) + if err != nil { + return err + } + defer statsFile.Close() + + sc := bufio.NewScanner(statsFile) + for sc.Scan() { + t, v, err := fscommon.ParseKeyValue(sc.Text()) + if err != nil { + return &parseError{Path: dirPath, File: file, Err: err} + } + stats.MemoryStats.Stats[t] = v + } + if err := sc.Err(); err != nil { + return &parseError{Path: dirPath, File: file, Err: err} + } + stats.MemoryStats.Cache = stats.MemoryStats.Stats["file"] + // Unlike cgroup v1 which has memory.use_hierarchy binary knob, + // cgroup v2 is always hierarchical. + stats.MemoryStats.UseHierarchy = true + + memoryUsage, err := getMemoryDataV2(dirPath, "") + if err != nil { + if errors.Is(err, unix.ENOENT) && dirPath == UnifiedMountpoint { + // The root cgroup does not have memory.{current,max} + // so emulate those using data from /proc/meminfo. + return statsFromMeminfo(stats) + } + return err + } + stats.MemoryStats.Usage = memoryUsage + swapUsage, err := getMemoryDataV2(dirPath, "swap") + if err != nil { + return err + } + // As cgroup v1 reports SwapUsage values as mem+swap combined, + // while in cgroup v2 swap values do not include memory, + // report combined mem+swap for v1 compatibility. + swapUsage.Usage += memoryUsage.Usage + if swapUsage.Limit != math.MaxUint64 { + swapUsage.Limit += memoryUsage.Limit + } + stats.MemoryStats.SwapUsage = swapUsage + + return nil +} + +func getMemoryDataV2(path, name string) (cgroups.MemoryData, error) { + memoryData := cgroups.MemoryData{} + + moduleName := "memory" + if name != "" { + moduleName = "memory." + name + } + usage := moduleName + ".current" + limit := moduleName + ".max" + + value, err := fscommon.GetCgroupParamUint(path, usage) + if err != nil { + if name != "" && os.IsNotExist(err) { + // Ignore EEXIST as there's no swap accounting + // if kernel CONFIG_MEMCG_SWAP is not set or + // swapaccount=0 kernel boot parameter is given. + return cgroups.MemoryData{}, nil + } + return cgroups.MemoryData{}, err + } + memoryData.Usage = value + + value, err = fscommon.GetCgroupParamUint(path, limit) + if err != nil { + return cgroups.MemoryData{}, err + } + memoryData.Limit = value + + return memoryData, nil +} + +func statsFromMeminfo(stats *cgroups.Stats) error { + const file = "/proc/meminfo" + f, err := os.Open(file) + if err != nil { + return err + } + defer f.Close() + + // Fields we are interested in. + var ( + swap_free uint64 + swap_total uint64 + main_total uint64 + main_free uint64 + ) + mem := map[string]*uint64{ + "SwapFree": &swap_free, + "SwapTotal": &swap_total, + "MemTotal": &main_total, + "MemFree": &main_free, + } + + found := 0 + sc := bufio.NewScanner(f) + for sc.Scan() { + parts := strings.SplitN(sc.Text(), ":", 3) + if len(parts) != 2 { + // Should not happen. + continue + } + k := parts[0] + p, ok := mem[k] + if !ok { + // Unknown field -- not interested. + continue + } + vStr := strings.TrimSpace(strings.TrimSuffix(parts[1], " kB")) + *p, err = strconv.ParseUint(vStr, 10, 64) + if err != nil { + return &parseError{File: file, Err: errors.New("bad value for " + k)} + } + + found++ + if found == len(mem) { + // Got everything we need -- skip the rest. + break + } + } + if err := sc.Err(); err != nil { + return &parseError{Path: "", File: file, Err: err} + } + + stats.MemoryStats.SwapUsage.Usage = (swap_total - swap_free) * 1024 + stats.MemoryStats.SwapUsage.Limit = math.MaxUint64 + + stats.MemoryStats.Usage.Usage = (main_total - main_free) * 1024 + stats.MemoryStats.Usage.Limit = math.MaxUint64 + + return nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/pids.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/pids.go new file mode 100644 index 000000000..c8c4a3658 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/pids.go @@ -0,0 +1,72 @@ +package fs2 + +import ( + "errors" + "math" + "os" + "strings" + + "golang.org/x/sys/unix" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" + "github.com/opencontainers/runc/libcontainer/configs" +) + +func isPidsSet(r *configs.Resources) bool { + return r.PidsLimit != 0 +} + +func setPids(dirPath string, r *configs.Resources) error { + if !isPidsSet(r) { + return nil + } + if val := numToStr(r.PidsLimit); val != "" { + if err := cgroups.WriteFile(dirPath, "pids.max", val); err != nil { + return err + } + } + + return nil +} + +func statPidsFromCgroupProcs(dirPath string, stats *cgroups.Stats) error { + // if the controller is not enabled, let's read PIDS from cgroups.procs + // (or threads if cgroup.threads is enabled) + contents, err := cgroups.ReadFile(dirPath, "cgroup.procs") + if errors.Is(err, unix.ENOTSUP) { + contents, err = cgroups.ReadFile(dirPath, "cgroup.threads") + } + if err != nil { + return err + } + pids := strings.Count(contents, "\n") + stats.PidsStats.Current = uint64(pids) + stats.PidsStats.Limit = 0 + return nil +} + +func statPids(dirPath string, stats *cgroups.Stats) error { + current, err := fscommon.GetCgroupParamUint(dirPath, "pids.current") + if err != nil { + if os.IsNotExist(err) { + return statPidsFromCgroupProcs(dirPath, stats) + } + return err + } + + max, err := fscommon.GetCgroupParamUint(dirPath, "pids.max") + if err != nil { + return err + } + // If no limit is set, read from pids.max returns "max", which is + // converted to MaxUint64 by GetCgroupParamUint. Historically, we + // represent "no limit" for pids as 0, thus this conversion. + if max == math.MaxUint64 { + max = 0 + } + + stats.PidsStats.Current = current + stats.PidsStats.Limit = max + return nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fscommon/rdma.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fscommon/rdma.go new file mode 100644 index 000000000..d463d15ee --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fscommon/rdma.go @@ -0,0 +1,121 @@ +package fscommon + +import ( + "bufio" + "errors" + "math" + "os" + "strconv" + "strings" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/configs" + "golang.org/x/sys/unix" +) + +// parseRdmaKV parses raw string to RdmaEntry. +func parseRdmaKV(raw string, entry *cgroups.RdmaEntry) error { + var value uint32 + + parts := strings.SplitN(raw, "=", 3) + + if len(parts) != 2 { + return errors.New("Unable to parse RDMA entry") + } + + k, v := parts[0], parts[1] + + if v == "max" { + value = math.MaxUint32 + } else { + val64, err := strconv.ParseUint(v, 10, 32) + if err != nil { + return err + } + value = uint32(val64) + } + if k == "hca_handle" { + entry.HcaHandles = value + } else if k == "hca_object" { + entry.HcaObjects = value + } + + return nil +} + +// readRdmaEntries reads and converts array of rawstrings to RdmaEntries from file. +// example entry: mlx4_0 hca_handle=2 hca_object=2000 +func readRdmaEntries(dir, file string) ([]cgroups.RdmaEntry, error) { + rdmaEntries := make([]cgroups.RdmaEntry, 0) + fd, err := cgroups.OpenFile(dir, file, unix.O_RDONLY) + if err != nil { + return nil, err + } + defer fd.Close() //nolint:errorlint + scanner := bufio.NewScanner(fd) + for scanner.Scan() { + parts := strings.SplitN(scanner.Text(), " ", 4) + if len(parts) == 3 { + entry := new(cgroups.RdmaEntry) + entry.Device = parts[0] + err = parseRdmaKV(parts[1], entry) + if err != nil { + continue + } + err = parseRdmaKV(parts[2], entry) + if err != nil { + continue + } + + rdmaEntries = append(rdmaEntries, *entry) + } + } + return rdmaEntries, scanner.Err() +} + +// RdmaGetStats returns rdma stats such as totalLimit and current entries. +func RdmaGetStats(path string, stats *cgroups.Stats) error { + currentEntries, err := readRdmaEntries(path, "rdma.current") + if err != nil { + if errors.Is(err, os.ErrNotExist) { + err = nil + } + return err + } + maxEntries, err := readRdmaEntries(path, "rdma.max") + if err != nil { + return err + } + // If device got removed between reading two files, ignore returning stats. + if len(currentEntries) != len(maxEntries) { + return nil + } + + stats.RdmaStats = cgroups.RdmaStats{ + RdmaLimit: maxEntries, + RdmaCurrent: currentEntries, + } + + return nil +} + +func createCmdString(device string, limits configs.LinuxRdma) string { + cmdString := device + if limits.HcaHandles != nil { + cmdString += " hca_handle=" + strconv.FormatUint(uint64(*limits.HcaHandles), 10) + } + if limits.HcaObjects != nil { + cmdString += " hca_object=" + strconv.FormatUint(uint64(*limits.HcaObjects), 10) + } + return cmdString +} + +// RdmaSet sets RDMA resources. +func RdmaSet(path string, r *configs.Resources) error { + for device, limits := range r.Rdma { + if err := cgroups.WriteFile(path, "rdma.max", createCmdString(device, limits)); err != nil { + return err + } + } + return nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fscommon/utils.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fscommon/utils.go new file mode 100644 index 000000000..f4a51c9e5 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fscommon/utils.go @@ -0,0 +1,145 @@ +package fscommon + +import ( + "errors" + "fmt" + "math" + "path" + "strconv" + "strings" + + "github.com/opencontainers/runc/libcontainer/cgroups" +) + +var ( + // Deprecated: use cgroups.OpenFile instead. + OpenFile = cgroups.OpenFile + // Deprecated: use cgroups.ReadFile instead. + ReadFile = cgroups.ReadFile + // Deprecated: use cgroups.WriteFile instead. + WriteFile = cgroups.WriteFile +) + +// ParseError records a parse error details, including the file path. +type ParseError struct { + Path string + File string + Err error +} + +func (e *ParseError) Error() string { + return "unable to parse " + path.Join(e.Path, e.File) + ": " + e.Err.Error() +} + +func (e *ParseError) Unwrap() error { return e.Err } + +// ParseUint converts a string to an uint64 integer. +// Negative values are returned at zero as, due to kernel bugs, +// some of the memory cgroup stats can be negative. +func ParseUint(s string, base, bitSize int) (uint64, error) { + value, err := strconv.ParseUint(s, base, bitSize) + if err != nil { + intValue, intErr := strconv.ParseInt(s, base, bitSize) + // 1. Handle negative values greater than MinInt64 (and) + // 2. Handle negative values lesser than MinInt64 + if intErr == nil && intValue < 0 { + return 0, nil + } else if errors.Is(intErr, strconv.ErrRange) && intValue < 0 { + return 0, nil + } + + return value, err + } + + return value, nil +} + +// ParseKeyValue parses a space-separated "name value" kind of cgroup +// parameter and returns its key as a string, and its value as uint64 +// (ParseUint is used to convert the value). For example, +// "io_service_bytes 1234" will be returned as "io_service_bytes", 1234. +func ParseKeyValue(t string) (string, uint64, error) { + parts := strings.SplitN(t, " ", 3) + if len(parts) != 2 { + return "", 0, fmt.Errorf("line %q is not in key value format", t) + } + + value, err := ParseUint(parts[1], 10, 64) + if err != nil { + return "", 0, err + } + + return parts[0], value, nil +} + +// GetValueByKey reads a key-value pairs from the specified cgroup file, +// and returns a value of the specified key. ParseUint is used for value +// conversion. +func GetValueByKey(path, file, key string) (uint64, error) { + content, err := cgroups.ReadFile(path, file) + if err != nil { + return 0, err + } + + lines := strings.Split(content, "\n") + for _, line := range lines { + arr := strings.Split(line, " ") + if len(arr) == 2 && arr[0] == key { + val, err := ParseUint(arr[1], 10, 64) + if err != nil { + err = &ParseError{Path: path, File: file, Err: err} + } + return val, err + } + } + + return 0, nil +} + +// GetCgroupParamUint reads a single uint64 value from the specified cgroup file. +// If the value read is "max", the math.MaxUint64 is returned. +func GetCgroupParamUint(path, file string) (uint64, error) { + contents, err := GetCgroupParamString(path, file) + if err != nil { + return 0, err + } + contents = strings.TrimSpace(contents) + if contents == "max" { + return math.MaxUint64, nil + } + + res, err := ParseUint(contents, 10, 64) + if err != nil { + return res, &ParseError{Path: path, File: file, Err: err} + } + return res, nil +} + +// GetCgroupParamInt reads a single int64 value from specified cgroup file. +// If the value read is "max", the math.MaxInt64 is returned. +func GetCgroupParamInt(path, file string) (int64, error) { + contents, err := cgroups.ReadFile(path, file) + if err != nil { + return 0, err + } + contents = strings.TrimSpace(contents) + if contents == "max" { + return math.MaxInt64, nil + } + + res, err := strconv.ParseInt(contents, 10, 64) + if err != nil { + return res, &ParseError{Path: path, File: file, Err: err} + } + return res, nil +} + +// GetCgroupParamString reads a string from the specified cgroup file. +func GetCgroupParamString(path, file string) (string, error) { + contents, err := cgroups.ReadFile(path, file) + if err != nil { + return "", err + } + + return strings.TrimSpace(contents), nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/getallpids.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/getallpids.go new file mode 100644 index 000000000..1355a5101 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/getallpids.go @@ -0,0 +1,27 @@ +package cgroups + +import ( + "io/fs" + "path/filepath" +) + +// GetAllPids returns all pids from the cgroup identified by path, and all its +// sub-cgroups. +func GetAllPids(path string) ([]int, error) { + var pids []int + err := filepath.WalkDir(path, func(p string, d fs.DirEntry, iErr error) error { + if iErr != nil { + return iErr + } + if !d.IsDir() { + return nil + } + cPids, err := readProcsFile(p) + if err != nil { + return err + } + pids = append(pids, cPids...) + return nil + }) + return pids, err +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/stats.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/stats.go new file mode 100644 index 000000000..40a81dd5a --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/stats.go @@ -0,0 +1,173 @@ +package cgroups + +type ThrottlingData struct { + // Number of periods with throttling active + Periods uint64 `json:"periods,omitempty"` + // Number of periods when the container hit its throttling limit. + ThrottledPeriods uint64 `json:"throttled_periods,omitempty"` + // Aggregate time the container was throttled for in nanoseconds. + ThrottledTime uint64 `json:"throttled_time,omitempty"` +} + +// CpuUsage denotes the usage of a CPU. +// All CPU stats are aggregate since container inception. +type CpuUsage struct { + // Total CPU time consumed. + // Units: nanoseconds. + TotalUsage uint64 `json:"total_usage,omitempty"` + // Total CPU time consumed per core. + // Units: nanoseconds. + PercpuUsage []uint64 `json:"percpu_usage,omitempty"` + // CPU time consumed per core in kernel mode + // Units: nanoseconds. + PercpuUsageInKernelmode []uint64 `json:"percpu_usage_in_kernelmode"` + // CPU time consumed per core in user mode + // Units: nanoseconds. + PercpuUsageInUsermode []uint64 `json:"percpu_usage_in_usermode"` + // Time spent by tasks of the cgroup in kernel mode. + // Units: nanoseconds. + UsageInKernelmode uint64 `json:"usage_in_kernelmode"` + // Time spent by tasks of the cgroup in user mode. + // Units: nanoseconds. + UsageInUsermode uint64 `json:"usage_in_usermode"` +} + +type CpuStats struct { + CpuUsage CpuUsage `json:"cpu_usage,omitempty"` + ThrottlingData ThrottlingData `json:"throttling_data,omitempty"` +} + +type CPUSetStats struct { + // List of the physical numbers of the CPUs on which processes + // in that cpuset are allowed to execute + CPUs []uint16 `json:"cpus,omitempty"` + // cpu_exclusive flag + CPUExclusive uint64 `json:"cpu_exclusive"` + // List of memory nodes on which processes in that cpuset + // are allowed to allocate memory + Mems []uint16 `json:"mems,omitempty"` + // mem_hardwall flag + MemHardwall uint64 `json:"mem_hardwall"` + // mem_exclusive flag + MemExclusive uint64 `json:"mem_exclusive"` + // memory_migrate flag + MemoryMigrate uint64 `json:"memory_migrate"` + // memory_spread page flag + MemorySpreadPage uint64 `json:"memory_spread_page"` + // memory_spread slab flag + MemorySpreadSlab uint64 `json:"memory_spread_slab"` + // memory_pressure + MemoryPressure uint64 `json:"memory_pressure"` + // sched_load balance flag + SchedLoadBalance uint64 `json:"sched_load_balance"` + // sched_relax_domain_level + SchedRelaxDomainLevel int64 `json:"sched_relax_domain_level"` +} + +type MemoryData struct { + Usage uint64 `json:"usage,omitempty"` + MaxUsage uint64 `json:"max_usage,omitempty"` + Failcnt uint64 `json:"failcnt"` + Limit uint64 `json:"limit"` +} + +type MemoryStats struct { + // memory used for cache + Cache uint64 `json:"cache,omitempty"` + // usage of memory + Usage MemoryData `json:"usage,omitempty"` + // usage of memory + swap + SwapUsage MemoryData `json:"swap_usage,omitempty"` + // usage of kernel memory + KernelUsage MemoryData `json:"kernel_usage,omitempty"` + // usage of kernel TCP memory + KernelTCPUsage MemoryData `json:"kernel_tcp_usage,omitempty"` + // usage of memory pages by NUMA node + // see chapter 5.6 of memory controller documentation + PageUsageByNUMA PageUsageByNUMA `json:"page_usage_by_numa,omitempty"` + // if true, memory usage is accounted for throughout a hierarchy of cgroups. + UseHierarchy bool `json:"use_hierarchy"` + + Stats map[string]uint64 `json:"stats,omitempty"` +} + +type PageUsageByNUMA struct { + // Embedding is used as types can't be recursive. + PageUsageByNUMAInner + Hierarchical PageUsageByNUMAInner `json:"hierarchical,omitempty"` +} + +type PageUsageByNUMAInner struct { + Total PageStats `json:"total,omitempty"` + File PageStats `json:"file,omitempty"` + Anon PageStats `json:"anon,omitempty"` + Unevictable PageStats `json:"unevictable,omitempty"` +} + +type PageStats struct { + Total uint64 `json:"total,omitempty"` + Nodes map[uint8]uint64 `json:"nodes,omitempty"` +} + +type PidsStats struct { + // number of pids in the cgroup + Current uint64 `json:"current,omitempty"` + // active pids hard limit + Limit uint64 `json:"limit,omitempty"` +} + +type BlkioStatEntry struct { + Major uint64 `json:"major,omitempty"` + Minor uint64 `json:"minor,omitempty"` + Op string `json:"op,omitempty"` + Value uint64 `json:"value,omitempty"` +} + +type BlkioStats struct { + // number of bytes transferred to and from the block device + IoServiceBytesRecursive []BlkioStatEntry `json:"io_service_bytes_recursive,omitempty"` + IoServicedRecursive []BlkioStatEntry `json:"io_serviced_recursive,omitempty"` + IoQueuedRecursive []BlkioStatEntry `json:"io_queue_recursive,omitempty"` + IoServiceTimeRecursive []BlkioStatEntry `json:"io_service_time_recursive,omitempty"` + IoWaitTimeRecursive []BlkioStatEntry `json:"io_wait_time_recursive,omitempty"` + IoMergedRecursive []BlkioStatEntry `json:"io_merged_recursive,omitempty"` + IoTimeRecursive []BlkioStatEntry `json:"io_time_recursive,omitempty"` + SectorsRecursive []BlkioStatEntry `json:"sectors_recursive,omitempty"` +} + +type HugetlbStats struct { + // current res_counter usage for hugetlb + Usage uint64 `json:"usage,omitempty"` + // maximum usage ever recorded. + MaxUsage uint64 `json:"max_usage,omitempty"` + // number of times hugetlb usage allocation failure. + Failcnt uint64 `json:"failcnt"` +} + +type RdmaEntry struct { + Device string `json:"device,omitempty"` + HcaHandles uint32 `json:"hca_handles,omitempty"` + HcaObjects uint32 `json:"hca_objects,omitempty"` +} + +type RdmaStats struct { + RdmaLimit []RdmaEntry `json:"rdma_limit,omitempty"` + RdmaCurrent []RdmaEntry `json:"rdma_current,omitempty"` +} + +type Stats struct { + CpuStats CpuStats `json:"cpu_stats,omitempty"` + CPUSetStats CPUSetStats `json:"cpuset_stats,omitempty"` + MemoryStats MemoryStats `json:"memory_stats,omitempty"` + PidsStats PidsStats `json:"pids_stats,omitempty"` + BlkioStats BlkioStats `json:"blkio_stats,omitempty"` + // the map is in the format "size of hugepage: stats of the hugepage" + HugetlbStats map[string]HugetlbStats `json:"hugetlb_stats,omitempty"` + RdmaStats RdmaStats `json:"rdma_stats,omitempty"` +} + +func NewStats() *Stats { + memoryStats := MemoryStats{Stats: make(map[string]uint64)} + hugetlbStats := make(map[string]HugetlbStats) + return &Stats{MemoryStats: memoryStats, HugetlbStats: hugetlbStats} +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/common.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/common.go new file mode 100644 index 000000000..c5b476e2c --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/common.go @@ -0,0 +1,564 @@ +package systemd + +import ( + "bufio" + "context" + "errors" + "fmt" + "math" + "os" + "regexp" + "strconv" + "strings" + "sync" + "time" + + systemdDbus "github.com/coreos/go-systemd/v22/dbus" + dbus "github.com/godbus/dbus/v5" + "github.com/sirupsen/logrus" + + cgroupdevices "github.com/opencontainers/runc/libcontainer/cgroups/devices" + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/devices" +) + +const ( + // Default kernel value for cpu quota period is 100000 us (100 ms), same for v1 and v2. + // v1: https://www.kernel.org/doc/html/latest/scheduler/sched-bwc.html and + // v2: https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html + defCPUQuotaPeriod = uint64(100000) +) + +var ( + versionOnce sync.Once + version int + + isRunningSystemdOnce sync.Once + isRunningSystemd bool +) + +// NOTE: This function comes from package github.com/coreos/go-systemd/util +// It was borrowed here to avoid a dependency on cgo. +// +// IsRunningSystemd checks whether the host was booted with systemd as its init +// system. This functions similarly to systemd's `sd_booted(3)`: internally, it +// checks whether /run/systemd/system/ exists and is a directory. +// http://www.freedesktop.org/software/systemd/man/sd_booted.html +func IsRunningSystemd() bool { + isRunningSystemdOnce.Do(func() { + fi, err := os.Lstat("/run/systemd/system") + isRunningSystemd = err == nil && fi.IsDir() + }) + return isRunningSystemd +} + +// systemd represents slice hierarchy using `-`, so we need to follow suit when +// generating the path of slice. Essentially, test-a-b.slice becomes +// /test.slice/test-a.slice/test-a-b.slice. +func ExpandSlice(slice string) (string, error) { + suffix := ".slice" + // Name has to end with ".slice", but can't be just ".slice". + if len(slice) < len(suffix) || !strings.HasSuffix(slice, suffix) { + return "", fmt.Errorf("invalid slice name: %s", slice) + } + + // Path-separators are not allowed. + if strings.Contains(slice, "/") { + return "", fmt.Errorf("invalid slice name: %s", slice) + } + + var path, prefix string + sliceName := strings.TrimSuffix(slice, suffix) + // if input was -.slice, we should just return root now + if sliceName == "-" { + return "/", nil + } + for _, component := range strings.Split(sliceName, "-") { + // test--a.slice isn't permitted, nor is -test.slice. + if component == "" { + return "", fmt.Errorf("invalid slice name: %s", slice) + } + + // Append the component to the path and to the prefix. + path += "/" + prefix + component + suffix + prefix += component + "-" + } + return path, nil +} + +func groupPrefix(ruleType devices.Type) (string, error) { + switch ruleType { + case devices.BlockDevice: + return "block-", nil + case devices.CharDevice: + return "char-", nil + default: + return "", fmt.Errorf("device type %v has no group prefix", ruleType) + } +} + +// findDeviceGroup tries to find the device group name (as listed in +// /proc/devices) with the type prefixed as required for DeviceAllow, for a +// given (type, major) combination. If more than one device group exists, an +// arbitrary one is chosen. +func findDeviceGroup(ruleType devices.Type, ruleMajor int64) (string, error) { + fh, err := os.Open("/proc/devices") + if err != nil { + return "", err + } + defer fh.Close() + + prefix, err := groupPrefix(ruleType) + if err != nil { + return "", err + } + + scanner := bufio.NewScanner(fh) + var currentType devices.Type + for scanner.Scan() { + // We need to strip spaces because the first number is column-aligned. + line := strings.TrimSpace(scanner.Text()) + + // Handle the "header" lines. + switch line { + case "Block devices:": + currentType = devices.BlockDevice + continue + case "Character devices:": + currentType = devices.CharDevice + continue + case "": + continue + } + + // Skip lines unrelated to our type. + if currentType != ruleType { + continue + } + + // Parse out the (major, name). + var ( + currMajor int64 + currName string + ) + if n, err := fmt.Sscanf(line, "%d %s", &currMajor, &currName); err != nil || n != 2 { + if err == nil { + err = errors.New("wrong number of fields") + } + return "", fmt.Errorf("scan /proc/devices line %q: %w", line, err) + } + + if currMajor == ruleMajor { + return prefix + currName, nil + } + } + if err := scanner.Err(); err != nil { + return "", fmt.Errorf("reading /proc/devices: %w", err) + } + // Couldn't find the device group. + return "", nil +} + +// DeviceAllow is the dbus type "a(ss)" which means we need a struct +// to represent it in Go. +type deviceAllowEntry struct { + Path string + Perms string +} + +func allowAllDevices() []systemdDbus.Property { + // Setting mode to auto and removing all DeviceAllow rules + // results in allowing access to all devices. + return []systemdDbus.Property{ + newProp("DevicePolicy", "auto"), + newProp("DeviceAllow", []deviceAllowEntry{}), + } +} + +// generateDeviceProperties takes the configured device rules and generates a +// corresponding set of systemd properties to configure the devices correctly. +func generateDeviceProperties(r *configs.Resources, sdVer int) ([]systemdDbus.Property, error) { + if r.SkipDevices { + return nil, nil + } + + properties := []systemdDbus.Property{ + // Always run in the strictest white-list mode. + newProp("DevicePolicy", "strict"), + // Empty the DeviceAllow array before filling it. + newProp("DeviceAllow", []deviceAllowEntry{}), + } + + // Figure out the set of rules. + configEmu := &cgroupdevices.Emulator{} + for _, rule := range r.Devices { + if err := configEmu.Apply(*rule); err != nil { + return nil, fmt.Errorf("unable to apply rule for systemd: %w", err) + } + } + // systemd doesn't support blacklists. So we log a warning, and tell + // systemd to act as a deny-all whitelist. This ruleset will be replaced + // with our normal fallback code. This may result in spurious errors, but + // the only other option is to error out here. + if configEmu.IsBlacklist() { + // However, if we're dealing with an allow-all rule then we can do it. + if configEmu.IsAllowAll() { + return allowAllDevices(), nil + } + logrus.Warn("systemd doesn't support blacklist device rules -- applying temporary deny-all rule") + return properties, nil + } + + // Now generate the set of rules we actually need to apply. Unlike the + // normal devices cgroup, in "strict" mode systemd defaults to a deny-all + // whitelist which is the default for devices.Emulator. + finalRules, err := configEmu.Rules() + if err != nil { + return nil, fmt.Errorf("unable to get simplified rules for systemd: %w", err) + } + var deviceAllowList []deviceAllowEntry + for _, rule := range finalRules { + if !rule.Allow { + // Should never happen. + return nil, fmt.Errorf("[internal error] cannot add deny rule to systemd DeviceAllow list: %v", *rule) + } + switch rule.Type { + case devices.BlockDevice, devices.CharDevice: + default: + // Should never happen. + return nil, fmt.Errorf("invalid device type for DeviceAllow: %v", rule.Type) + } + + entry := deviceAllowEntry{ + Perms: string(rule.Permissions), + } + + // systemd has a fairly odd (though understandable) syntax here, and + // because of the OCI configuration format we have to do quite a bit of + // trickery to convert things: + // + // * Concrete rules with non-wildcard major/minor numbers have to use + // /dev/{block,char}/MAJOR:minor paths. Before v240, systemd uses + // stat(2) on such paths to look up device properties, meaning we + // cannot add whitelist rules for devices that don't exist. Since v240, + // device properties are parsed from the path string. + // + // However, path globbing is not support for path-based rules so we + // need to handle wildcards in some other manner. + // + // * Wildcard-minor rules have to specify a "device group name" (the + // second column in /proc/devices). + // + // * Wildcard (major and minor) rules can just specify a glob with the + // type ("char-*" or "block-*"). + // + // The only type of rule we can't handle is wildcard-major rules, and + // so we'll give a warning in that case (note that the fallback code + // will insert any rules systemd couldn't handle). What amazing fun. + + if rule.Major == devices.Wildcard { + // "_ *:n _" rules aren't supported by systemd. + if rule.Minor != devices.Wildcard { + logrus.Warnf("systemd doesn't support '*:n' device rules -- temporarily ignoring rule: %v", *rule) + continue + } + + // "_ *:* _" rules just wildcard everything. + prefix, err := groupPrefix(rule.Type) + if err != nil { + return nil, err + } + entry.Path = prefix + "*" + } else if rule.Minor == devices.Wildcard { + // "_ n:* _" rules require a device group from /proc/devices. + group, err := findDeviceGroup(rule.Type, rule.Major) + if err != nil { + return nil, fmt.Errorf("unable to find device '%v/%d': %w", rule.Type, rule.Major, err) + } + if group == "" { + // Couldn't find a group. + logrus.Warnf("could not find device group for '%v/%d' in /proc/devices -- temporarily ignoring rule: %v", rule.Type, rule.Major, *rule) + continue + } + entry.Path = group + } else { + // "_ n:m _" rules are just a path in /dev/{block,char}/. + switch rule.Type { + case devices.BlockDevice: + entry.Path = fmt.Sprintf("/dev/block/%d:%d", rule.Major, rule.Minor) + case devices.CharDevice: + entry.Path = fmt.Sprintf("/dev/char/%d:%d", rule.Major, rule.Minor) + } + if sdVer < 240 { + // Old systemd versions use stat(2) on path to find out device major:minor + // numbers and type. If the path doesn't exist, it will not add the rule, + // emitting a warning instead. + // Since all of this logic is best-effort anyway (we manually set these + // rules separately to systemd) we can safely skip entries that don't + // have a corresponding path. + if _, err := os.Stat(entry.Path); err != nil { + continue + } + } + } + deviceAllowList = append(deviceAllowList, entry) + } + + properties = append(properties, newProp("DeviceAllow", deviceAllowList)) + return properties, nil +} + +func newProp(name string, units interface{}) systemdDbus.Property { + return systemdDbus.Property{ + Name: name, + Value: dbus.MakeVariant(units), + } +} + +func getUnitName(c *configs.Cgroup) string { + // by default, we create a scope unless the user explicitly asks for a slice. + if !strings.HasSuffix(c.Name, ".slice") { + return c.ScopePrefix + "-" + c.Name + ".scope" + } + return c.Name +} + +// This code should be in sync with getUnitName. +func getUnitType(unitName string) string { + if strings.HasSuffix(unitName, ".slice") { + return "Slice" + } + return "Scope" +} + +// isDbusError returns true if the error is a specific dbus error. +func isDbusError(err error, name string) bool { + if err != nil { + var derr dbus.Error + if errors.As(err, &derr) { + return strings.Contains(derr.Name, name) + } + } + return false +} + +// isUnitExists returns true if the error is that a systemd unit already exists. +func isUnitExists(err error) bool { + return isDbusError(err, "org.freedesktop.systemd1.UnitExists") +} + +func startUnit(cm *dbusConnManager, unitName string, properties []systemdDbus.Property, ignoreExist bool) error { + statusChan := make(chan string, 1) + retry := true + +retry: + err := cm.retryOnDisconnect(func(c *systemdDbus.Conn) error { + _, err := c.StartTransientUnitContext(context.TODO(), unitName, "replace", properties, statusChan) + return err + }) + if err != nil { + if !isUnitExists(err) { + return err + } + if ignoreExist { + // TODO: remove this hack. + // This is kubelet making sure a slice exists (see + // https://github.com/opencontainers/runc/pull/1124). + return nil + } + if retry { + // In case a unit with the same name exists, this may + // be a leftover failed unit. Reset it, so systemd can + // remove it, and retry once. + err = resetFailedUnit(cm, unitName) + if err != nil { + logrus.Warnf("unable to reset failed unit: %v", err) + } + retry = false + goto retry + } + return err + } + + timeout := time.NewTimer(30 * time.Second) + defer timeout.Stop() + + select { + case s := <-statusChan: + close(statusChan) + // Please refer to https://pkg.go.dev/github.com/coreos/go-systemd/v22/dbus#Conn.StartUnit + if s != "done" { + _ = resetFailedUnit(cm, unitName) + return fmt.Errorf("error creating systemd unit `%s`: got `%s`", unitName, s) + } + case <-timeout.C: + _ = resetFailedUnit(cm, unitName) + return errors.New("Timeout waiting for systemd to create " + unitName) + } + + return nil +} + +func stopUnit(cm *dbusConnManager, unitName string) error { + statusChan := make(chan string, 1) + err := cm.retryOnDisconnect(func(c *systemdDbus.Conn) error { + _, err := c.StopUnitContext(context.TODO(), unitName, "replace", statusChan) + return err + }) + if err == nil { + timeout := time.NewTimer(30 * time.Second) + defer timeout.Stop() + + select { + case s := <-statusChan: + close(statusChan) + // Please refer to https://godoc.org/github.com/coreos/go-systemd/v22/dbus#Conn.StartUnit + if s != "done" { + logrus.Warnf("error removing unit `%s`: got `%s`. Continuing...", unitName, s) + } + case <-timeout.C: + return errors.New("Timed out while waiting for systemd to remove " + unitName) + } + } + + // In case of a failed unit, let systemd remove it. + _ = resetFailedUnit(cm, unitName) + + return nil +} + +func resetFailedUnit(cm *dbusConnManager, name string) error { + return cm.retryOnDisconnect(func(c *systemdDbus.Conn) error { + return c.ResetFailedUnitContext(context.TODO(), name) + }) +} + +func getUnitTypeProperty(cm *dbusConnManager, unitName string, unitType string, propertyName string) (*systemdDbus.Property, error) { + var prop *systemdDbus.Property + err := cm.retryOnDisconnect(func(c *systemdDbus.Conn) (Err error) { + prop, Err = c.GetUnitTypePropertyContext(context.TODO(), unitName, unitType, propertyName) + return Err + }) + return prop, err +} + +func setUnitProperties(cm *dbusConnManager, name string, properties ...systemdDbus.Property) error { + return cm.retryOnDisconnect(func(c *systemdDbus.Conn) error { + return c.SetUnitPropertiesContext(context.TODO(), name, true, properties...) + }) +} + +func getManagerProperty(cm *dbusConnManager, name string) (string, error) { + str := "" + err := cm.retryOnDisconnect(func(c *systemdDbus.Conn) error { + var err error + str, err = c.GetManagerProperty(name) + return err + }) + if err != nil { + return "", err + } + return strconv.Unquote(str) +} + +func systemdVersion(cm *dbusConnManager) int { + versionOnce.Do(func() { + version = -1 + verStr, err := getManagerProperty(cm, "Version") + if err == nil { + version, err = systemdVersionAtoi(verStr) + } + + if err != nil { + logrus.WithError(err).Error("unable to get systemd version") + } + }) + + return version +} + +func systemdVersionAtoi(verStr string) (int, error) { + // verStr should be of the form: + // "v245.4-1.fc32", "245", "v245-1.fc32", "245-1.fc32" (without quotes). + // The result for all of the above should be 245. + // Thus, we unconditionally remove the "v" prefix + // and then match on the first integer we can grab. + re := regexp.MustCompile(`v?([0-9]+)`) + matches := re.FindStringSubmatch(verStr) + if len(matches) < 2 { + return 0, fmt.Errorf("can't parse version %s: incorrect number of matches %v", verStr, matches) + } + ver, err := strconv.Atoi(matches[1]) + if err != nil { + return -1, fmt.Errorf("can't parse version: %w", err) + } + return ver, nil +} + +func addCpuQuota(cm *dbusConnManager, properties *[]systemdDbus.Property, quota int64, period uint64) { + if period != 0 { + // systemd only supports CPUQuotaPeriodUSec since v242 + sdVer := systemdVersion(cm) + if sdVer >= 242 { + *properties = append(*properties, + newProp("CPUQuotaPeriodUSec", period)) + } else { + logrus.Debugf("systemd v%d is too old to support CPUQuotaPeriodSec "+ + " (setting will still be applied to cgroupfs)", sdVer) + } + } + if quota != 0 || period != 0 { + // corresponds to USEC_INFINITY in systemd + cpuQuotaPerSecUSec := uint64(math.MaxUint64) + if quota > 0 { + if period == 0 { + // assume the default + period = defCPUQuotaPeriod + } + // systemd converts CPUQuotaPerSecUSec (microseconds per CPU second) to CPUQuota + // (integer percentage of CPU) internally. This means that if a fractional percent of + // CPU is indicated by Resources.CpuQuota, we need to round up to the nearest + // 10ms (1% of a second) such that child cgroups can set the cpu.cfs_quota_us they expect. + cpuQuotaPerSecUSec = uint64(quota*1000000) / period + if cpuQuotaPerSecUSec%10000 != 0 { + cpuQuotaPerSecUSec = ((cpuQuotaPerSecUSec / 10000) + 1) * 10000 + } + } + *properties = append(*properties, + newProp("CPUQuotaPerSecUSec", cpuQuotaPerSecUSec)) + } +} + +func addCpuset(cm *dbusConnManager, props *[]systemdDbus.Property, cpus, mems string) error { + if cpus == "" && mems == "" { + return nil + } + + // systemd only supports AllowedCPUs/AllowedMemoryNodes since v244 + sdVer := systemdVersion(cm) + if sdVer < 244 { + logrus.Debugf("systemd v%d is too old to support AllowedCPUs/AllowedMemoryNodes"+ + " (settings will still be applied to cgroupfs)", sdVer) + return nil + } + + if cpus != "" { + bits, err := RangeToBits(cpus) + if err != nil { + return fmt.Errorf("resources.CPU.Cpus=%q conversion error: %w", + cpus, err) + } + *props = append(*props, + newProp("AllowedCPUs", bits)) + } + if mems != "" { + bits, err := RangeToBits(mems) + if err != nil { + return fmt.Errorf("resources.CPU.Mems=%q conversion error: %w", + mems, err) + } + *props = append(*props, + newProp("AllowedMemoryNodes", bits)) + } + return nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/cpuset.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/cpuset.go new file mode 100644 index 000000000..dd474cf1b --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/cpuset.go @@ -0,0 +1,60 @@ +package systemd + +import ( + "errors" + "math/big" + "strconv" + "strings" +) + +// RangeToBits converts a text representation of a CPU mask (as written to +// or read from cgroups' cpuset.* files, e.g. "1,3-5") to a slice of bytes +// with the corresponding bits set (as consumed by systemd over dbus as +// AllowedCPUs/AllowedMemoryNodes unit property value). +func RangeToBits(str string) ([]byte, error) { + bits := new(big.Int) + + for _, r := range strings.Split(str, ",") { + // allow extra spaces around + r = strings.TrimSpace(r) + // allow empty elements (extra commas) + if r == "" { + continue + } + ranges := strings.SplitN(r, "-", 2) + if len(ranges) > 1 { + start, err := strconv.ParseUint(ranges[0], 10, 32) + if err != nil { + return nil, err + } + end, err := strconv.ParseUint(ranges[1], 10, 32) + if err != nil { + return nil, err + } + if start > end { + return nil, errors.New("invalid range: " + r) + } + for i := start; i <= end; i++ { + bits.SetBit(bits, int(i), 1) + } + } else { + val, err := strconv.ParseUint(ranges[0], 10, 32) + if err != nil { + return nil, err + } + bits.SetBit(bits, int(val), 1) + } + } + + ret := bits.Bytes() + if len(ret) == 0 { + // do not allow empty values + return nil, errors.New("empty value") + } + + // fit cpuset parsing order in systemd + for l, r := 0, len(ret)-1; l < r; l, r = l+1, r-1 { + ret[l], ret[r] = ret[r], ret[l] + } + return ret, nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/dbus.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/dbus.go new file mode 100644 index 000000000..bb87ae83a --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/dbus.go @@ -0,0 +1,102 @@ +package systemd + +import ( + "context" + "errors" + "fmt" + "sync" + + systemdDbus "github.com/coreos/go-systemd/v22/dbus" + dbus "github.com/godbus/dbus/v5" +) + +var ( + dbusC *systemdDbus.Conn + dbusMu sync.RWMutex + dbusInited bool + dbusRootless bool +) + +type dbusConnManager struct{} + +// newDbusConnManager initializes systemd dbus connection manager. +func newDbusConnManager(rootless bool) *dbusConnManager { + dbusMu.Lock() + defer dbusMu.Unlock() + if dbusInited && rootless != dbusRootless { + panic("can't have both root and rootless dbus") + } + dbusInited = true + dbusRootless = rootless + return &dbusConnManager{} +} + +// getConnection lazily initializes and returns systemd dbus connection. +func (d *dbusConnManager) getConnection() (*systemdDbus.Conn, error) { + // In the case where dbusC != nil + // Use the read lock the first time to ensure + // that Conn can be acquired at the same time. + dbusMu.RLock() + if conn := dbusC; conn != nil { + dbusMu.RUnlock() + return conn, nil + } + dbusMu.RUnlock() + + // In the case where dbusC == nil + // Use write lock to ensure that only one + // will be created + dbusMu.Lock() + defer dbusMu.Unlock() + if conn := dbusC; conn != nil { + return conn, nil + } + + conn, err := d.newConnection() + if err != nil { + // When dbus-user-session is not installed, we can't detect whether we should try to connect to user dbus or system dbus, so d.dbusRootless is set to false. + // This may fail with a cryptic error "read unix @->/run/systemd/private: read: connection reset by peer: unknown." + // https://github.com/moby/moby/issues/42793 + return nil, fmt.Errorf("failed to connect to dbus (hint: for rootless containers, maybe you need to install dbus-user-session package, see https://github.com/opencontainers/runc/blob/master/docs/cgroup-v2.md): %w", err) + } + dbusC = conn + return conn, nil +} + +func (d *dbusConnManager) newConnection() (*systemdDbus.Conn, error) { + if dbusRootless { + return newUserSystemdDbus() + } + return systemdDbus.NewWithContext(context.TODO()) +} + +// resetConnection resets the connection to its initial state +// (so it can be reconnected if necessary). +func (d *dbusConnManager) resetConnection(conn *systemdDbus.Conn) { + dbusMu.Lock() + defer dbusMu.Unlock() + if dbusC != nil && dbusC == conn { + dbusC.Close() + dbusC = nil + } +} + +// retryOnDisconnect calls op, and if the error it returns is about closed dbus +// connection, the connection is re-established and the op is retried. This helps +// with the situation when dbus is restarted and we have a stale connection. +func (d *dbusConnManager) retryOnDisconnect(op func(*systemdDbus.Conn) error) error { + for { + conn, err := d.getConnection() + if err != nil { + return err + } + err = op(conn) + if err == nil { + return nil + } + if !errors.Is(err, dbus.ErrClosed) { + return err + } + d.resetConnection(conn) + } +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/user.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/user.go new file mode 100644 index 000000000..0f50f76ee --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/user.go @@ -0,0 +1,106 @@ +package systemd + +import ( + "bufio" + "bytes" + "errors" + "fmt" + "os" + "os/exec" + "path/filepath" + "strconv" + "strings" + + systemdDbus "github.com/coreos/go-systemd/v22/dbus" + dbus "github.com/godbus/dbus/v5" + + "github.com/opencontainers/runc/libcontainer/userns" +) + +// newUserSystemdDbus creates a connection for systemd user-instance. +func newUserSystemdDbus() (*systemdDbus.Conn, error) { + addr, err := DetectUserDbusSessionBusAddress() + if err != nil { + return nil, err + } + uid, err := DetectUID() + if err != nil { + return nil, err + } + + return systemdDbus.NewConnection(func() (*dbus.Conn, error) { + conn, err := dbus.Dial(addr) + if err != nil { + return nil, fmt.Errorf("error while dialing %q: %w", addr, err) + } + methods := []dbus.Auth{dbus.AuthExternal(strconv.Itoa(uid))} + err = conn.Auth(methods) + if err != nil { + conn.Close() + return nil, fmt.Errorf("error while authenticating connection (address=%q, UID=%d): %w", addr, uid, err) + } + if err = conn.Hello(); err != nil { + conn.Close() + return nil, fmt.Errorf("error while sending Hello message (address=%q, UID=%d): %w", addr, uid, err) + } + return conn, nil + }) +} + +// DetectUID detects UID from the OwnerUID field of `busctl --user status` +// if running in userNS. The value corresponds to sd_bus_creds_get_owner_uid(3) . +// +// Otherwise returns os.Getuid() . +func DetectUID() (int, error) { + if !userns.RunningInUserNS() { + return os.Getuid(), nil + } + b, err := exec.Command("busctl", "--user", "--no-pager", "status").CombinedOutput() + if err != nil { + return -1, fmt.Errorf("could not execute `busctl --user --no-pager status` (output: %q): %w", string(b), err) + } + scanner := bufio.NewScanner(bytes.NewReader(b)) + for scanner.Scan() { + s := strings.TrimSpace(scanner.Text()) + if strings.HasPrefix(s, "OwnerUID=") { + uidStr := strings.TrimPrefix(s, "OwnerUID=") + i, err := strconv.Atoi(uidStr) + if err != nil { + return -1, fmt.Errorf("could not detect the OwnerUID: %w", err) + } + return i, nil + } + } + if err := scanner.Err(); err != nil { + return -1, err + } + return -1, errors.New("could not detect the OwnerUID") +} + +// DetectUserDbusSessionBusAddress returns $DBUS_SESSION_BUS_ADDRESS if set. +// Otherwise returns "unix:path=$XDG_RUNTIME_DIR/bus" if $XDG_RUNTIME_DIR/bus exists. +// Otherwise parses the value from `systemctl --user show-environment` . +func DetectUserDbusSessionBusAddress() (string, error) { + if env := os.Getenv("DBUS_SESSION_BUS_ADDRESS"); env != "" { + return env, nil + } + if xdr := os.Getenv("XDG_RUNTIME_DIR"); xdr != "" { + busPath := filepath.Join(xdr, "bus") + if _, err := os.Stat(busPath); err == nil { + busAddress := "unix:path=" + busPath + return busAddress, nil + } + } + b, err := exec.Command("systemctl", "--user", "--no-pager", "show-environment").CombinedOutput() + if err != nil { + return "", fmt.Errorf("could not execute `systemctl --user --no-pager show-environment` (output=%q): %w", string(b), err) + } + scanner := bufio.NewScanner(bytes.NewReader(b)) + for scanner.Scan() { + s := strings.TrimSpace(scanner.Text()) + if strings.HasPrefix(s, "DBUS_SESSION_BUS_ADDRESS=") { + return strings.TrimPrefix(s, "DBUS_SESSION_BUS_ADDRESS="), nil + } + } + return "", errors.New("could not detect DBUS_SESSION_BUS_ADDRESS from `systemctl --user --no-pager show-environment`. Make sure you have installed the dbus-user-session or dbus-daemon package and then run: `systemctl --user start dbus`") +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/v1.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/v1.go new file mode 100644 index 000000000..a574552da --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/v1.go @@ -0,0 +1,480 @@ +package systemd + +import ( + "errors" + "os" + "path/filepath" + "reflect" + "strings" + "sync" + + systemdDbus "github.com/coreos/go-systemd/v22/dbus" + "github.com/godbus/dbus/v5" + "github.com/sirupsen/logrus" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fs" + "github.com/opencontainers/runc/libcontainer/configs" +) + +type legacyManager struct { + mu sync.Mutex + cgroups *configs.Cgroup + paths map[string]string + dbus *dbusConnManager +} + +func NewLegacyManager(cg *configs.Cgroup, paths map[string]string) (cgroups.Manager, error) { + if cg.Rootless { + return nil, errors.New("cannot use rootless systemd cgroups manager on cgroup v1") + } + if cg.Resources != nil && cg.Resources.Unified != nil { + return nil, cgroups.ErrV1NoUnified + } + if paths == nil { + var err error + paths, err = initPaths(cg) + if err != nil { + return nil, err + } + } + return &legacyManager{ + cgroups: cg, + paths: paths, + dbus: newDbusConnManager(false), + }, nil +} + +type subsystem interface { + // Name returns the name of the subsystem. + Name() string + // Returns the stats, as 'stats', corresponding to the cgroup under 'path'. + GetStats(path string, stats *cgroups.Stats) error + // Set sets cgroup resource limits. + Set(path string, r *configs.Resources) error +} + +var errSubsystemDoesNotExist = errors.New("cgroup: subsystem does not exist") + +var legacySubsystems = []subsystem{ + &fs.CpusetGroup{}, + &fs.DevicesGroup{}, + &fs.MemoryGroup{}, + &fs.CpuGroup{}, + &fs.CpuacctGroup{}, + &fs.PidsGroup{}, + &fs.BlkioGroup{}, + &fs.HugetlbGroup{}, + &fs.PerfEventGroup{}, + &fs.FreezerGroup{}, + &fs.NetPrioGroup{}, + &fs.NetClsGroup{}, + &fs.NameGroup{GroupName: "name=systemd"}, + &fs.RdmaGroup{}, + &fs.NameGroup{GroupName: "misc"}, +} + +func genV1ResourcesProperties(r *configs.Resources, cm *dbusConnManager) ([]systemdDbus.Property, error) { + var properties []systemdDbus.Property + + deviceProperties, err := generateDeviceProperties(r, systemdVersion(cm)) + if err != nil { + return nil, err + } + properties = append(properties, deviceProperties...) + + if r.Memory != 0 { + properties = append(properties, + newProp("MemoryLimit", uint64(r.Memory))) + } + + if r.CpuShares != 0 { + properties = append(properties, + newProp("CPUShares", r.CpuShares)) + } + + addCpuQuota(cm, &properties, r.CpuQuota, r.CpuPeriod) + + if r.BlkioWeight != 0 { + properties = append(properties, + newProp("BlockIOWeight", uint64(r.BlkioWeight))) + } + + if r.PidsLimit > 0 || r.PidsLimit == -1 { + properties = append(properties, + newProp("TasksMax", uint64(r.PidsLimit))) + } + + err = addCpuset(cm, &properties, r.CpusetCpus, r.CpusetMems) + if err != nil { + return nil, err + } + + return properties, nil +} + +// initPaths figures out and returns paths to cgroups. +func initPaths(c *configs.Cgroup) (map[string]string, error) { + slice := "system.slice" + if c.Parent != "" { + var err error + slice, err = ExpandSlice(c.Parent) + if err != nil { + return nil, err + } + } + + unit := getUnitName(c) + + paths := make(map[string]string) + for _, s := range legacySubsystems { + subsystemPath, err := getSubsystemPath(slice, unit, s.Name()) + if err != nil { + // Even if it's `not found` error, we'll return err + // because devices cgroup is hard requirement for + // container security. + if s.Name() == "devices" { + return nil, err + } + // Don't fail if a cgroup hierarchy was not found, just skip this subsystem + if cgroups.IsNotFound(err) { + continue + } + return nil, err + } + paths[s.Name()] = subsystemPath + } + + // If systemd is using cgroups-hybrid mode then add the slice path of + // this container to the paths so the following process executed with + // "runc exec" joins that cgroup as well. + if cgroups.IsCgroup2HybridMode() { + // "" means cgroup-hybrid path + cgroupsHybridPath, err := getSubsystemPath(slice, unit, "") + if err != nil && cgroups.IsNotFound(err) { + return nil, err + } + paths[""] = cgroupsHybridPath + } + + return paths, nil +} + +func (m *legacyManager) Apply(pid int) error { + var ( + c = m.cgroups + unitName = getUnitName(c) + slice = "system.slice" + properties []systemdDbus.Property + ) + + m.mu.Lock() + defer m.mu.Unlock() + + if c.Parent != "" { + slice = c.Parent + } + + properties = append(properties, systemdDbus.PropDescription("libcontainer container "+c.Name)) + + if strings.HasSuffix(unitName, ".slice") { + // If we create a slice, the parent is defined via a Wants=. + properties = append(properties, systemdDbus.PropWants(slice)) + } else { + // Otherwise it's a scope, which we put into a Slice=. + properties = append(properties, systemdDbus.PropSlice(slice)) + // Assume scopes always support delegation (supported since systemd v218). + properties = append(properties, newProp("Delegate", true)) + } + + // only add pid if its valid, -1 is used w/ general slice creation. + if pid != -1 { + properties = append(properties, newProp("PIDs", []uint32{uint32(pid)})) + } + + // Always enable accounting, this gets us the same behaviour as the fs implementation, + // plus the kernel has some problems with joining the memory cgroup at a later time. + properties = append(properties, + newProp("MemoryAccounting", true), + newProp("CPUAccounting", true), + newProp("BlockIOAccounting", true), + newProp("TasksAccounting", true), + ) + + // Assume DefaultDependencies= will always work (the check for it was previously broken.) + properties = append(properties, + newProp("DefaultDependencies", false)) + + properties = append(properties, c.SystemdProps...) + + if err := startUnit(m.dbus, unitName, properties, pid == -1); err != nil { + return err + } + + if err := m.joinCgroups(pid); err != nil { + return err + } + + return nil +} + +func (m *legacyManager) Destroy() error { + m.mu.Lock() + defer m.mu.Unlock() + + stopErr := stopUnit(m.dbus, getUnitName(m.cgroups)) + + // Both on success and on error, cleanup all the cgroups + // we are aware of, as some of them were created directly + // by Apply() and are not managed by systemd. + if err := cgroups.RemovePaths(m.paths); err != nil && stopErr == nil { + return err + } + + return stopErr +} + +func (m *legacyManager) Path(subsys string) string { + m.mu.Lock() + defer m.mu.Unlock() + return m.paths[subsys] +} + +func (m *legacyManager) joinCgroups(pid int) error { + for _, sys := range legacySubsystems { + name := sys.Name() + switch name { + case "name=systemd": + // let systemd handle this + case "cpuset": + if path, ok := m.paths[name]; ok { + s := &fs.CpusetGroup{} + if err := s.ApplyDir(path, m.cgroups.Resources, pid); err != nil { + return err + } + } + default: + if path, ok := m.paths[name]; ok { + if err := os.MkdirAll(path, 0o755); err != nil { + return err + } + if err := cgroups.WriteCgroupProc(path, pid); err != nil { + return err + } + } + } + } + + return nil +} + +func getSubsystemPath(slice, unit, subsystem string) (string, error) { + mountpoint, err := cgroups.FindCgroupMountpoint("", subsystem) + if err != nil { + return "", err + } + + return filepath.Join(mountpoint, slice, unit), nil +} + +func (m *legacyManager) Freeze(state configs.FreezerState) error { + err := m.doFreeze(state) + if err == nil { + m.cgroups.Resources.Freezer = state + } + return err +} + +// doFreeze is the same as Freeze but without +// changing the m.cgroups.Resources.Frozen field. +func (m *legacyManager) doFreeze(state configs.FreezerState) error { + path, ok := m.paths["freezer"] + if !ok { + return errSubsystemDoesNotExist + } + freezer := &fs.FreezerGroup{} + resources := &configs.Resources{Freezer: state} + return freezer.Set(path, resources) +} + +func (m *legacyManager) GetPids() ([]int, error) { + path, ok := m.paths["devices"] + if !ok { + return nil, errSubsystemDoesNotExist + } + return cgroups.GetPids(path) +} + +func (m *legacyManager) GetAllPids() ([]int, error) { + path, ok := m.paths["devices"] + if !ok { + return nil, errSubsystemDoesNotExist + } + return cgroups.GetAllPids(path) +} + +func (m *legacyManager) GetStats() (*cgroups.Stats, error) { + m.mu.Lock() + defer m.mu.Unlock() + stats := cgroups.NewStats() + for _, sys := range legacySubsystems { + path := m.paths[sys.Name()] + if path == "" { + continue + } + if err := sys.GetStats(path, stats); err != nil { + return nil, err + } + } + + return stats, nil +} + +// freezeBeforeSet answers whether there is a need to freeze the cgroup before +// applying its systemd unit properties, and thaw after, while avoiding +// unnecessary freezer state changes. +// +// The reason why we have to freeze is that systemd's application of device +// rules is done disruptively, resulting in spurious errors to common devices +// (unlike our fs driver, they will happily write deny-all rules to running +// containers). So we have to freeze the container to avoid the container get +// an occasional "permission denied" error. +func (m *legacyManager) freezeBeforeSet(unitName string, r *configs.Resources) (needsFreeze, needsThaw bool, err error) { + // Special case for SkipDevices, as used by Kubernetes to create pod + // cgroups with allow-all device policy). + if r.SkipDevices { + if r.SkipFreezeOnSet { + // Both needsFreeze and needsThaw are false. + return + } + + // No need to freeze if SkipDevices is set, and either + // (1) systemd unit does not (yet) exist, or + // (2) it has DevicePolicy=auto and empty DeviceAllow list. + // + // Interestingly, (1) and (2) are the same here because + // a non-existent unit returns default properties, + // and settings in (2) are the defaults. + // + // Do not return errors from getUnitTypeProperty, as they alone + // should not prevent Set from working. + + unitType := getUnitType(unitName) + + devPolicy, e := getUnitTypeProperty(m.dbus, unitName, unitType, "DevicePolicy") + if e == nil && devPolicy.Value == dbus.MakeVariant("auto") { + devAllow, e := getUnitTypeProperty(m.dbus, unitName, unitType, "DeviceAllow") + if e == nil { + if rv := reflect.ValueOf(devAllow.Value.Value()); rv.Kind() == reflect.Slice && rv.Len() == 0 { + needsFreeze = false + needsThaw = false + return + } + } + } + } + + needsFreeze = true + needsThaw = true + + // Check the current freezer state. + freezerState, err := m.GetFreezerState() + if err != nil { + return + } + if freezerState == configs.Frozen { + // Already frozen, and should stay frozen. + needsFreeze = false + needsThaw = false + } + + if r.Freezer == configs.Frozen { + // Will be frozen anyway -- no need to thaw. + needsThaw = false + } + return +} + +func (m *legacyManager) Set(r *configs.Resources) error { + if r == nil { + return nil + } + if r.Unified != nil { + return cgroups.ErrV1NoUnified + } + properties, err := genV1ResourcesProperties(r, m.dbus) + if err != nil { + return err + } + + unitName := getUnitName(m.cgroups) + needsFreeze, needsThaw, err := m.freezeBeforeSet(unitName, r) + if err != nil { + return err + } + + if needsFreeze { + if err := m.doFreeze(configs.Frozen); err != nil { + // If freezer cgroup isn't supported, we just warn about it. + logrus.Infof("freeze container before SetUnitProperties failed: %v", err) + // skip update the cgroup while frozen failed. #3803 + if !errors.Is(err, errSubsystemDoesNotExist) { + if needsThaw { + if thawErr := m.doFreeze(configs.Thawed); thawErr != nil { + logrus.Infof("thaw container after doFreeze failed: %v", thawErr) + } + } + return err + } + } + } + setErr := setUnitProperties(m.dbus, unitName, properties...) + if needsThaw { + if err := m.doFreeze(configs.Thawed); err != nil { + logrus.Infof("thaw container after SetUnitProperties failed: %v", err) + } + } + if setErr != nil { + return setErr + } + + for _, sys := range legacySubsystems { + // Get the subsystem path, but don't error out for not found cgroups. + path, ok := m.paths[sys.Name()] + if !ok { + continue + } + if err := sys.Set(path, r); err != nil { + return err + } + } + + return nil +} + +func (m *legacyManager) GetPaths() map[string]string { + m.mu.Lock() + defer m.mu.Unlock() + return m.paths +} + +func (m *legacyManager) GetCgroups() (*configs.Cgroup, error) { + return m.cgroups, nil +} + +func (m *legacyManager) GetFreezerState() (configs.FreezerState, error) { + path, ok := m.paths["freezer"] + if !ok { + return configs.Undefined, nil + } + freezer := &fs.FreezerGroup{} + return freezer.GetState(path) +} + +func (m *legacyManager) Exists() bool { + return cgroups.PathExists(m.Path("devices")) +} + +func (m *legacyManager) OOMKillCount() (uint64, error) { + return fs.OOMKillCount(m.Path("memory")) +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/v2.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/v2.go new file mode 100644 index 000000000..919e5632f --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/v2.go @@ -0,0 +1,472 @@ +package systemd + +import ( + "bufio" + "errors" + "fmt" + "math" + "os" + "path/filepath" + "strconv" + "strings" + "sync" + + systemdDbus "github.com/coreos/go-systemd/v22/dbus" + securejoin "github.com/cyphar/filepath-securejoin" + "github.com/sirupsen/logrus" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fs2" + "github.com/opencontainers/runc/libcontainer/configs" +) + +type unifiedManager struct { + mu sync.Mutex + cgroups *configs.Cgroup + // path is like "/sys/fs/cgroup/user.slice/user-1001.slice/session-1.scope" + path string + dbus *dbusConnManager + fsMgr cgroups.Manager +} + +func NewUnifiedManager(config *configs.Cgroup, path string) (cgroups.Manager, error) { + m := &unifiedManager{ + cgroups: config, + path: path, + dbus: newDbusConnManager(config.Rootless), + } + if err := m.initPath(); err != nil { + return nil, err + } + + fsMgr, err := fs2.NewManager(config, m.path) + if err != nil { + return nil, err + } + m.fsMgr = fsMgr + + return m, nil +} + +// unifiedResToSystemdProps tries to convert from Cgroup.Resources.Unified +// key/value map (where key is cgroupfs file name) to systemd unit properties. +// This is on a best-effort basis, so the properties that are not known +// (to this function and/or systemd) are ignored (but logged with "debug" +// log level). +// +// For the list of keys, see https://www.kernel.org/doc/Documentation/cgroup-v2.txt +// +// For the list of systemd unit properties, see systemd.resource-control(5). +func unifiedResToSystemdProps(cm *dbusConnManager, res map[string]string) (props []systemdDbus.Property, _ error) { + var err error + + for k, v := range res { + if strings.Contains(k, "/") { + return nil, fmt.Errorf("unified resource %q must be a file name (no slashes)", k) + } + sk := strings.SplitN(k, ".", 2) + if len(sk) != 2 { + return nil, fmt.Errorf("unified resource %q must be in the form CONTROLLER.PARAMETER", k) + } + // Kernel is quite forgiving to extra whitespace + // around the value, and so should we. + v = strings.TrimSpace(v) + // Please keep cases in alphabetical order. + switch k { + case "cpu.max": + // value: quota [period] + quota := int64(0) // 0 means "unlimited" for addCpuQuota, if period is set + period := defCPUQuotaPeriod + sv := strings.Fields(v) + if len(sv) < 1 || len(sv) > 2 { + return nil, fmt.Errorf("unified resource %q value invalid: %q", k, v) + } + // quota + if sv[0] != "max" { + quota, err = strconv.ParseInt(sv[0], 10, 64) + if err != nil { + return nil, fmt.Errorf("unified resource %q period value conversion error: %w", k, err) + } + } + // period + if len(sv) == 2 { + period, err = strconv.ParseUint(sv[1], 10, 64) + if err != nil { + return nil, fmt.Errorf("unified resource %q quota value conversion error: %w", k, err) + } + } + addCpuQuota(cm, &props, quota, period) + + case "cpu.weight": + num, err := strconv.ParseUint(v, 10, 64) + if err != nil { + return nil, fmt.Errorf("unified resource %q value conversion error: %w", k, err) + } + props = append(props, + newProp("CPUWeight", num)) + + case "cpuset.cpus", "cpuset.mems": + bits, err := RangeToBits(v) + if err != nil { + return nil, fmt.Errorf("unified resource %q=%q conversion error: %w", k, v, err) + } + m := map[string]string{ + "cpuset.cpus": "AllowedCPUs", + "cpuset.mems": "AllowedMemoryNodes", + } + // systemd only supports these properties since v244 + sdVer := systemdVersion(cm) + if sdVer >= 244 { + props = append(props, + newProp(m[k], bits)) + } else { + logrus.Debugf("systemd v%d is too old to support %s"+ + " (setting will still be applied to cgroupfs)", + sdVer, m[k]) + } + + case "memory.high", "memory.low", "memory.min", "memory.max", "memory.swap.max": + num := uint64(math.MaxUint64) + if v != "max" { + num, err = strconv.ParseUint(v, 10, 64) + if err != nil { + return nil, fmt.Errorf("unified resource %q value conversion error: %w", k, err) + } + } + m := map[string]string{ + "memory.high": "MemoryHigh", + "memory.low": "MemoryLow", + "memory.min": "MemoryMin", + "memory.max": "MemoryMax", + "memory.swap.max": "MemorySwapMax", + } + props = append(props, + newProp(m[k], num)) + + case "pids.max": + num := uint64(math.MaxUint64) + if v != "max" { + var err error + num, err = strconv.ParseUint(v, 10, 64) + if err != nil { + return nil, fmt.Errorf("unified resource %q value conversion error: %w", k, err) + } + } + props = append(props, + newProp("TasksMax", num)) + + case "memory.oom.group": + // Setting this to 1 is roughly equivalent to OOMPolicy=kill + // (as per systemd.service(5) and + // https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html), + // but it's not clear what to do if it is unset or set + // to 0 in runc update, as there are two other possible + // values for OOMPolicy (continue/stop). + fallthrough + + default: + // Ignore the unknown resource here -- will still be + // applied in Set which calls fs2.Set. + logrus.Debugf("don't know how to convert unified resource %q=%q to systemd unit property; skipping (will still be applied to cgroupfs)", k, v) + } + } + + return props, nil +} + +func genV2ResourcesProperties(r *configs.Resources, cm *dbusConnManager) ([]systemdDbus.Property, error) { + var properties []systemdDbus.Property + + // NOTE: This is of questionable correctness because we insert our own + // devices eBPF program later. Two programs with identical rules + // aren't the end of the world, but it is a bit concerning. However + // it's unclear if systemd removes all eBPF programs attached when + // doing SetUnitProperties... + deviceProperties, err := generateDeviceProperties(r, systemdVersion(cm)) + if err != nil { + return nil, err + } + properties = append(properties, deviceProperties...) + + if r.Memory != 0 { + properties = append(properties, + newProp("MemoryMax", uint64(r.Memory))) + } + if r.MemoryReservation != 0 { + properties = append(properties, + newProp("MemoryLow", uint64(r.MemoryReservation))) + } + + swap, err := cgroups.ConvertMemorySwapToCgroupV2Value(r.MemorySwap, r.Memory) + if err != nil { + return nil, err + } + if swap != 0 { + properties = append(properties, + newProp("MemorySwapMax", uint64(swap))) + } + + if r.CpuWeight != 0 { + properties = append(properties, + newProp("CPUWeight", r.CpuWeight)) + } + + addCpuQuota(cm, &properties, r.CpuQuota, r.CpuPeriod) + + if r.PidsLimit > 0 || r.PidsLimit == -1 { + properties = append(properties, + newProp("TasksMax", uint64(r.PidsLimit))) + } + + err = addCpuset(cm, &properties, r.CpusetCpus, r.CpusetMems) + if err != nil { + return nil, err + } + + // ignore r.KernelMemory + + // convert Resources.Unified map to systemd properties + if r.Unified != nil { + unifiedProps, err := unifiedResToSystemdProps(cm, r.Unified) + if err != nil { + return nil, err + } + properties = append(properties, unifiedProps...) + } + + return properties, nil +} + +func (m *unifiedManager) Apply(pid int) error { + var ( + c = m.cgroups + unitName = getUnitName(c) + properties []systemdDbus.Property + ) + + slice := "system.slice" + if m.cgroups.Rootless { + slice = "user.slice" + } + if c.Parent != "" { + slice = c.Parent + } + + properties = append(properties, systemdDbus.PropDescription("libcontainer container "+c.Name)) + + if strings.HasSuffix(unitName, ".slice") { + // If we create a slice, the parent is defined via a Wants=. + properties = append(properties, systemdDbus.PropWants(slice)) + } else { + // Otherwise it's a scope, which we put into a Slice=. + properties = append(properties, systemdDbus.PropSlice(slice)) + // Assume scopes always support delegation (supported since systemd v218). + properties = append(properties, newProp("Delegate", true)) + } + + // only add pid if its valid, -1 is used w/ general slice creation. + if pid != -1 { + properties = append(properties, newProp("PIDs", []uint32{uint32(pid)})) + } + + // Always enable accounting, this gets us the same behaviour as the fs implementation, + // plus the kernel has some problems with joining the memory cgroup at a later time. + properties = append(properties, + newProp("MemoryAccounting", true), + newProp("CPUAccounting", true), + newProp("IOAccounting", true), + newProp("TasksAccounting", true), + ) + + // Assume DefaultDependencies= will always work (the check for it was previously broken.) + properties = append(properties, + newProp("DefaultDependencies", false)) + + properties = append(properties, c.SystemdProps...) + + if err := startUnit(m.dbus, unitName, properties, pid == -1); err != nil { + return fmt.Errorf("unable to start unit %q (properties %+v): %w", unitName, properties, err) + } + + if err := fs2.CreateCgroupPath(m.path, m.cgroups); err != nil { + return err + } + + if c.OwnerUID != nil { + // The directory itself must be chowned. + err := os.Chown(m.path, *c.OwnerUID, -1) + if err != nil { + return err + } + + filesToChown, err := cgroupFilesToChown() + if err != nil { + return err + } + + for _, v := range filesToChown { + err := os.Chown(m.path+"/"+v, *c.OwnerUID, -1) + // Some files might not be present. + if err != nil && !errors.Is(err, os.ErrNotExist) { + return err + } + } + } + + return nil +} + +// The kernel exposes a list of files that should be chowned to the delegate +// uid in /sys/kernel/cgroup/delegate. If the file is not present +// (Linux < 4.15), use the initial values mentioned in cgroups(7). +func cgroupFilesToChown() ([]string, error) { + const cgroupDelegateFile = "/sys/kernel/cgroup/delegate" + + f, err := os.Open(cgroupDelegateFile) + if err != nil { + return []string{"cgroup.procs", "cgroup.subtree_control", "cgroup.threads"}, nil + } + defer f.Close() + + filesToChown := []string{} + scanner := bufio.NewScanner(f) + for scanner.Scan() { + filesToChown = append(filesToChown, scanner.Text()) + } + if err := scanner.Err(); err != nil { + return nil, fmt.Errorf("error reading %s: %w", cgroupDelegateFile, err) + } + + return filesToChown, nil +} + +func (m *unifiedManager) Destroy() error { + m.mu.Lock() + defer m.mu.Unlock() + + unitName := getUnitName(m.cgroups) + if err := stopUnit(m.dbus, unitName); err != nil { + return err + } + + // systemd 239 do not remove sub-cgroups. + err := m.fsMgr.Destroy() + // fsMgr.Destroy has handled ErrNotExist + if err != nil { + return err + } + + return nil +} + +func (m *unifiedManager) Path(_ string) string { + return m.path +} + +// getSliceFull value is used in initPath. +// The value is incompatible with systemdDbus.PropSlice. +func (m *unifiedManager) getSliceFull() (string, error) { + c := m.cgroups + slice := "system.slice" + if c.Rootless { + slice = "user.slice" + } + if c.Parent != "" { + var err error + slice, err = ExpandSlice(c.Parent) + if err != nil { + return "", err + } + } + + if c.Rootless { + // managerCG is typically "/user.slice/user-${uid}.slice/user@${uid}.service". + managerCG, err := getManagerProperty(m.dbus, "ControlGroup") + if err != nil { + return "", err + } + slice = filepath.Join(managerCG, slice) + } + + // an example of the final slice in rootless: "/user.slice/user-1001.slice/user@1001.service/user.slice" + // NOTE: systemdDbus.PropSlice requires the "/user.slice/user-1001.slice/user@1001.service/" prefix NOT to be specified. + return slice, nil +} + +func (m *unifiedManager) initPath() error { + if m.path != "" { + return nil + } + + sliceFull, err := m.getSliceFull() + if err != nil { + return err + } + + c := m.cgroups + path := filepath.Join(sliceFull, getUnitName(c)) + path, err = securejoin.SecureJoin(fs2.UnifiedMountpoint, path) + if err != nil { + return err + } + + // an example of the final path in rootless: + // "/sys/fs/cgroup/user.slice/user-1001.slice/user@1001.service/user.slice/libpod-132ff0d72245e6f13a3bbc6cdc5376886897b60ac59eaa8dea1df7ab959cbf1c.scope" + m.path = path + + return nil +} + +func (m *unifiedManager) Freeze(state configs.FreezerState) error { + return m.fsMgr.Freeze(state) +} + +func (m *unifiedManager) GetPids() ([]int, error) { + return cgroups.GetPids(m.path) +} + +func (m *unifiedManager) GetAllPids() ([]int, error) { + return cgroups.GetAllPids(m.path) +} + +func (m *unifiedManager) GetStats() (*cgroups.Stats, error) { + return m.fsMgr.GetStats() +} + +func (m *unifiedManager) Set(r *configs.Resources) error { + if r == nil { + return nil + } + properties, err := genV2ResourcesProperties(r, m.dbus) + if err != nil { + return err + } + + if err := setUnitProperties(m.dbus, getUnitName(m.cgroups), properties...); err != nil { + return fmt.Errorf("unable to set unit properties: %w", err) + } + + return m.fsMgr.Set(r) +} + +func (m *unifiedManager) GetPaths() map[string]string { + paths := make(map[string]string, 1) + paths[""] = m.path + return paths +} + +func (m *unifiedManager) GetCgroups() (*configs.Cgroup, error) { + return m.cgroups, nil +} + +func (m *unifiedManager) GetFreezerState() (configs.FreezerState, error) { + return m.fsMgr.GetFreezerState() +} + +func (m *unifiedManager) Exists() bool { + return cgroups.PathExists(m.path) +} + +func (m *unifiedManager) OOMKillCount() (uint64, error) { + return m.fsMgr.OOMKillCount() +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/utils.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/utils.go new file mode 100644 index 000000000..fc4ae44a4 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/utils.go @@ -0,0 +1,469 @@ +package cgroups + +import ( + "bufio" + "errors" + "fmt" + "io" + "os" + "path/filepath" + "strconv" + "strings" + "sync" + "time" + + "github.com/opencontainers/runc/libcontainer/userns" + "github.com/sirupsen/logrus" + "golang.org/x/sys/unix" +) + +const ( + CgroupProcesses = "cgroup.procs" + unifiedMountpoint = "/sys/fs/cgroup" + hybridMountpoint = "/sys/fs/cgroup/unified" +) + +var ( + isUnifiedOnce sync.Once + isUnified bool + isHybridOnce sync.Once + isHybrid bool +) + +// IsCgroup2UnifiedMode returns whether we are running in cgroup v2 unified mode. +func IsCgroup2UnifiedMode() bool { + isUnifiedOnce.Do(func() { + var st unix.Statfs_t + err := unix.Statfs(unifiedMountpoint, &st) + if err != nil { + if os.IsNotExist(err) && userns.RunningInUserNS() { + // ignore the "not found" error if running in userns + logrus.WithError(err).Debugf("%s missing, assuming cgroup v1", unifiedMountpoint) + isUnified = false + return + } + panic(fmt.Sprintf("cannot statfs cgroup root: %s", err)) + } + isUnified = st.Type == unix.CGROUP2_SUPER_MAGIC + }) + return isUnified +} + +// IsCgroup2HybridMode returns whether we are running in cgroup v2 hybrid mode. +func IsCgroup2HybridMode() bool { + isHybridOnce.Do(func() { + var st unix.Statfs_t + err := unix.Statfs(hybridMountpoint, &st) + if err != nil { + isHybrid = false + if !os.IsNotExist(err) { + // Report unexpected errors. + logrus.WithError(err).Debugf("statfs(%q) failed", hybridMountpoint) + } + return + } + isHybrid = st.Type == unix.CGROUP2_SUPER_MAGIC + }) + return isHybrid +} + +type Mount struct { + Mountpoint string + Root string + Subsystems []string +} + +// GetCgroupMounts returns the mounts for the cgroup subsystems. +// all indicates whether to return just the first instance or all the mounts. +// This function should not be used from cgroupv2 code, as in this case +// all the controllers are available under the constant unifiedMountpoint. +func GetCgroupMounts(all bool) ([]Mount, error) { + if IsCgroup2UnifiedMode() { + // TODO: remove cgroupv2 case once all external users are converted + availableControllers, err := GetAllSubsystems() + if err != nil { + return nil, err + } + m := Mount{ + Mountpoint: unifiedMountpoint, + Root: unifiedMountpoint, + Subsystems: availableControllers, + } + return []Mount{m}, nil + } + + return getCgroupMountsV1(all) +} + +// GetAllSubsystems returns all the cgroup subsystems supported by the kernel +func GetAllSubsystems() ([]string, error) { + // /proc/cgroups is meaningless for v2 + // https://github.com/torvalds/linux/blob/v5.3/Documentation/admin-guide/cgroup-v2.rst#deprecated-v1-core-features + if IsCgroup2UnifiedMode() { + // "pseudo" controllers do not appear in /sys/fs/cgroup/cgroup.controllers. + // - devices: implemented in kernel 4.15 + // - freezer: implemented in kernel 5.2 + // We assume these are always available, as it is hard to detect availability. + pseudo := []string{"devices", "freezer"} + data, err := ReadFile("/sys/fs/cgroup", "cgroup.controllers") + if err != nil { + return nil, err + } + subsystems := append(pseudo, strings.Fields(data)...) + return subsystems, nil + } + f, err := os.Open("/proc/cgroups") + if err != nil { + return nil, err + } + defer f.Close() + + subsystems := []string{} + + s := bufio.NewScanner(f) + for s.Scan() { + text := s.Text() + if text[0] != '#' { + parts := strings.Fields(text) + if len(parts) >= 4 && parts[3] != "0" { + subsystems = append(subsystems, parts[0]) + } + } + } + if err := s.Err(); err != nil { + return nil, err + } + return subsystems, nil +} + +func readProcsFile(dir string) ([]int, error) { + f, err := OpenFile(dir, CgroupProcesses, os.O_RDONLY) + if err != nil { + return nil, err + } + defer f.Close() + + var ( + s = bufio.NewScanner(f) + out = []int{} + ) + + for s.Scan() { + if t := s.Text(); t != "" { + pid, err := strconv.Atoi(t) + if err != nil { + return nil, err + } + out = append(out, pid) + } + } + return out, s.Err() +} + +// ParseCgroupFile parses the given cgroup file, typically /proc/self/cgroup +// or /proc//cgroup, into a map of subsystems to cgroup paths, e.g. +// +// "cpu": "/user.slice/user-1000.slice" +// "pids": "/user.slice/user-1000.slice" +// +// etc. +// +// Note that for cgroup v2 unified hierarchy, there are no per-controller +// cgroup paths, so the resulting map will have a single element where the key +// is empty string ("") and the value is the cgroup path the is in. +func ParseCgroupFile(path string) (map[string]string, error) { + f, err := os.Open(path) + if err != nil { + return nil, err + } + defer f.Close() + + return parseCgroupFromReader(f) +} + +// helper function for ParseCgroupFile to make testing easier +func parseCgroupFromReader(r io.Reader) (map[string]string, error) { + s := bufio.NewScanner(r) + cgroups := make(map[string]string) + + for s.Scan() { + text := s.Text() + // from cgroups(7): + // /proc/[pid]/cgroup + // ... + // For each cgroup hierarchy ... there is one entry + // containing three colon-separated fields of the form: + // hierarchy-ID:subsystem-list:cgroup-path + parts := strings.SplitN(text, ":", 3) + if len(parts) < 3 { + return nil, fmt.Errorf("invalid cgroup entry: must contain at least two colons: %v", text) + } + + for _, subs := range strings.Split(parts[1], ",") { + cgroups[subs] = parts[2] + } + } + if err := s.Err(); err != nil { + return nil, err + } + + return cgroups, nil +} + +func PathExists(path string) bool { + if _, err := os.Stat(path); err != nil { + return false + } + return true +} + +func EnterPid(cgroupPaths map[string]string, pid int) error { + for _, path := range cgroupPaths { + if PathExists(path) { + if err := WriteCgroupProc(path, pid); err != nil { + return err + } + } + } + return nil +} + +func rmdir(path string) error { + err := unix.Rmdir(path) + if err == nil || err == unix.ENOENT { //nolint:errorlint // unix errors are bare + return nil + } + return &os.PathError{Op: "rmdir", Path: path, Err: err} +} + +// RemovePath aims to remove cgroup path. It does so recursively, +// by removing any subdirectories (sub-cgroups) first. +func RemovePath(path string) error { + // try the fast path first + if err := rmdir(path); err == nil { + return nil + } + + infos, err := os.ReadDir(path) + if err != nil { + if os.IsNotExist(err) { + err = nil + } + return err + } + for _, info := range infos { + if info.IsDir() { + // We should remove subcgroups dir first + if err = RemovePath(filepath.Join(path, info.Name())); err != nil { + break + } + } + } + if err == nil { + err = rmdir(path) + } + return err +} + +// RemovePaths iterates over the provided paths removing them. +// We trying to remove all paths five times with increasing delay between tries. +// If after all there are not removed cgroups - appropriate error will be +// returned. +func RemovePaths(paths map[string]string) (err error) { + const retries = 5 + delay := 10 * time.Millisecond + for i := 0; i < retries; i++ { + if i != 0 { + time.Sleep(delay) + delay *= 2 + } + for s, p := range paths { + if err := RemovePath(p); err != nil { + // do not log intermediate iterations + switch i { + case 0: + logrus.WithError(err).Warnf("Failed to remove cgroup (will retry)") + case retries - 1: + logrus.WithError(err).Error("Failed to remove cgroup") + } + } + _, err := os.Stat(p) + // We need this strange way of checking cgroups existence because + // RemoveAll almost always returns error, even on already removed + // cgroups + if os.IsNotExist(err) { + delete(paths, s) + } + } + if len(paths) == 0 { + //nolint:ineffassign,staticcheck // done to help garbage collecting: opencontainers/runc#2506 + paths = make(map[string]string) + return nil + } + } + return fmt.Errorf("Failed to remove paths: %v", paths) +} + +var ( + hugePageSizes []string + initHPSOnce sync.Once +) + +func HugePageSizes() []string { + initHPSOnce.Do(func() { + dir, err := os.OpenFile("/sys/kernel/mm/hugepages", unix.O_DIRECTORY|unix.O_RDONLY, 0) + if err != nil { + return + } + files, err := dir.Readdirnames(0) + dir.Close() + if err != nil { + return + } + + hugePageSizes, err = getHugePageSizeFromFilenames(files) + if err != nil { + logrus.Warn("HugePageSizes: ", err) + } + }) + + return hugePageSizes +} + +func getHugePageSizeFromFilenames(fileNames []string) ([]string, error) { + pageSizes := make([]string, 0, len(fileNames)) + var warn error + + for _, file := range fileNames { + // example: hugepages-1048576kB + val := strings.TrimPrefix(file, "hugepages-") + if len(val) == len(file) { + // Unexpected file name: no prefix found, ignore it. + continue + } + // The suffix is always "kB" (as of Linux 5.13). If we find + // something else, produce an error but keep going. + eLen := len(val) - 2 + val = strings.TrimSuffix(val, "kB") + if len(val) != eLen { + // Highly unlikely. + if warn == nil { + warn = errors.New(file + `: invalid suffix (expected "kB")`) + } + continue + } + size, err := strconv.Atoi(val) + if err != nil { + // Highly unlikely. + if warn == nil { + warn = fmt.Errorf("%s: %w", file, err) + } + continue + } + // Model after https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/mm/hugetlb_cgroup.c?id=eff48ddeab782e35e58ccc8853f7386bbae9dec4#n574 + // but in our case the size is in KB already. + if size >= (1 << 20) { + val = strconv.Itoa(size>>20) + "GB" + } else if size >= (1 << 10) { + val = strconv.Itoa(size>>10) + "MB" + } else { + val += "KB" + } + pageSizes = append(pageSizes, val) + } + + return pageSizes, warn +} + +// GetPids returns all pids, that were added to cgroup at path. +func GetPids(dir string) ([]int, error) { + return readProcsFile(dir) +} + +// WriteCgroupProc writes the specified pid into the cgroup's cgroup.procs file +func WriteCgroupProc(dir string, pid int) error { + // Normally dir should not be empty, one case is that cgroup subsystem + // is not mounted, we will get empty dir, and we want it fail here. + if dir == "" { + return fmt.Errorf("no such directory for %s", CgroupProcesses) + } + + // Dont attach any pid to the cgroup if -1 is specified as a pid + if pid == -1 { + return nil + } + + file, err := OpenFile(dir, CgroupProcesses, os.O_WRONLY) + if err != nil { + return fmt.Errorf("failed to write %v: %w", pid, err) + } + defer file.Close() + + for i := 0; i < 5; i++ { + _, err = file.WriteString(strconv.Itoa(pid)) + if err == nil { + return nil + } + + // EINVAL might mean that the task being added to cgroup.procs is in state + // TASK_NEW. We should attempt to do so again. + if errors.Is(err, unix.EINVAL) { + time.Sleep(30 * time.Millisecond) + continue + } + + return fmt.Errorf("failed to write %v: %w", pid, err) + } + return err +} + +// Since the OCI spec is designed for cgroup v1, in some cases +// there is need to convert from the cgroup v1 configuration to cgroup v2 +// the formula for cpuShares is y = (1 + ((x - 2) * 9999) / 262142) +// convert from [2-262144] to [1-10000] +// 262144 comes from Linux kernel definition "#define MAX_SHARES (1UL << 18)" +func ConvertCPUSharesToCgroupV2Value(cpuShares uint64) uint64 { + if cpuShares == 0 { + return 0 + } + return (1 + ((cpuShares-2)*9999)/262142) +} + +// ConvertMemorySwapToCgroupV2Value converts MemorySwap value from OCI spec +// for use by cgroup v2 drivers. A conversion is needed since Resources.MemorySwap +// is defined as memory+swap combined, while in cgroup v2 swap is a separate value. +func ConvertMemorySwapToCgroupV2Value(memorySwap, memory int64) (int64, error) { + // for compatibility with cgroup1 controller, set swap to unlimited in + // case the memory is set to unlimited, and swap is not explicitly set, + // treating the request as "set both memory and swap to unlimited". + if memory == -1 && memorySwap == 0 { + return -1, nil + } + if memorySwap == -1 || memorySwap == 0 { + // -1 is "max", 0 is "unset", so treat as is + return memorySwap, nil + } + // sanity checks + if memory == 0 || memory == -1 { + return 0, errors.New("unable to set swap limit without memory limit") + } + if memory < 0 { + return 0, fmt.Errorf("invalid memory value: %d", memory) + } + if memorySwap < memory { + return 0, errors.New("memory+swap limit should be >= memory limit") + } + + return memorySwap - memory, nil +} + +// Since the OCI spec is designed for cgroup v1, in some cases +// there is need to convert from the cgroup v1 configuration to cgroup v2 +// the formula for BlkIOWeight to IOWeight is y = (1 + (x - 10) * 9999 / 990) +// convert linearly from [10-1000] to [1-10000] +func ConvertBlkIOToIOWeightValue(blkIoWeight uint16) uint64 { + if blkIoWeight == 0 { + return 0 + } + return 1 + (uint64(blkIoWeight)-10)*9999/990 +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/v1_utils.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/v1_utils.go new file mode 100644 index 000000000..47c75f22b --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/v1_utils.go @@ -0,0 +1,290 @@ +package cgroups + +import ( + "errors" + "fmt" + "os" + "path/filepath" + "strings" + "sync" + "syscall" + + securejoin "github.com/cyphar/filepath-securejoin" + "github.com/moby/sys/mountinfo" + "golang.org/x/sys/unix" +) + +// Code in this source file are specific to cgroup v1, +// and must not be used from any cgroup v2 code. + +const ( + CgroupNamePrefix = "name=" + defaultPrefix = "/sys/fs/cgroup" +) + +var ( + errUnified = errors.New("not implemented for cgroup v2 unified hierarchy") + ErrV1NoUnified = errors.New("invalid configuration: cannot use unified on cgroup v1") + + readMountinfoOnce sync.Once + readMountinfoErr error + cgroupMountinfo []*mountinfo.Info +) + +type NotFoundError struct { + Subsystem string +} + +func (e *NotFoundError) Error() string { + return fmt.Sprintf("mountpoint for %s not found", e.Subsystem) +} + +func NewNotFoundError(sub string) error { + return &NotFoundError{ + Subsystem: sub, + } +} + +func IsNotFound(err error) bool { + var nfErr *NotFoundError + return errors.As(err, &nfErr) +} + +func tryDefaultPath(cgroupPath, subsystem string) string { + if !strings.HasPrefix(defaultPrefix, cgroupPath) { + return "" + } + + // remove possible prefix + subsystem = strings.TrimPrefix(subsystem, CgroupNamePrefix) + + // Make sure we're still under defaultPrefix, and resolve + // a possible symlink (like cpu -> cpu,cpuacct). + path, err := securejoin.SecureJoin(defaultPrefix, subsystem) + if err != nil { + return "" + } + + // (1) path should be a directory. + st, err := os.Lstat(path) + if err != nil || !st.IsDir() { + return "" + } + + // (2) path should be a mount point. + pst, err := os.Lstat(filepath.Dir(path)) + if err != nil { + return "" + } + + if st.Sys().(*syscall.Stat_t).Dev == pst.Sys().(*syscall.Stat_t).Dev { + // parent dir has the same dev -- path is not a mount point + return "" + } + + // (3) path should have 'cgroup' fs type. + fst := unix.Statfs_t{} + err = unix.Statfs(path, &fst) + if err != nil || fst.Type != unix.CGROUP_SUPER_MAGIC { + return "" + } + + return path +} + +// readCgroupMountinfo returns a list of cgroup v1 mounts (i.e. the ones +// with fstype of "cgroup") for the current running process. +// +// The results are cached (to avoid re-reading mountinfo which is relatively +// expensive), so it is assumed that cgroup mounts are not being changed. +func readCgroupMountinfo() ([]*mountinfo.Info, error) { + readMountinfoOnce.Do(func() { + cgroupMountinfo, readMountinfoErr = mountinfo.GetMounts( + mountinfo.FSTypeFilter("cgroup"), + ) + }) + + return cgroupMountinfo, readMountinfoErr +} + +// https://www.kernel.org/doc/Documentation/cgroup-v1/cgroups.txt +func FindCgroupMountpoint(cgroupPath, subsystem string) (string, error) { + if IsCgroup2UnifiedMode() { + return "", errUnified + } + + // If subsystem is empty, we look for the cgroupv2 hybrid path. + if len(subsystem) == 0 { + return hybridMountpoint, nil + } + + // Avoid parsing mountinfo by trying the default path first, if possible. + if path := tryDefaultPath(cgroupPath, subsystem); path != "" { + return path, nil + } + + mnt, _, err := FindCgroupMountpointAndRoot(cgroupPath, subsystem) + return mnt, err +} + +func FindCgroupMountpointAndRoot(cgroupPath, subsystem string) (string, string, error) { + if IsCgroup2UnifiedMode() { + return "", "", errUnified + } + + mi, err := readCgroupMountinfo() + if err != nil { + return "", "", err + } + + return findCgroupMountpointAndRootFromMI(mi, cgroupPath, subsystem) +} + +func findCgroupMountpointAndRootFromMI(mounts []*mountinfo.Info, cgroupPath, subsystem string) (string, string, error) { + for _, mi := range mounts { + if strings.HasPrefix(mi.Mountpoint, cgroupPath) { + for _, opt := range strings.Split(mi.VFSOptions, ",") { + if opt == subsystem { + return mi.Mountpoint, mi.Root, nil + } + } + } + } + + return "", "", NewNotFoundError(subsystem) +} + +func (m Mount) GetOwnCgroup(cgroups map[string]string) (string, error) { + if len(m.Subsystems) == 0 { + return "", errors.New("no subsystem for mount") + } + + return getControllerPath(m.Subsystems[0], cgroups) +} + +func getCgroupMountsHelper(ss map[string]bool, mounts []*mountinfo.Info, all bool) ([]Mount, error) { + res := make([]Mount, 0, len(ss)) + numFound := 0 + for _, mi := range mounts { + m := Mount{ + Mountpoint: mi.Mountpoint, + Root: mi.Root, + } + for _, opt := range strings.Split(mi.VFSOptions, ",") { + seen, known := ss[opt] + if !known || (!all && seen) { + continue + } + ss[opt] = true + opt = strings.TrimPrefix(opt, CgroupNamePrefix) + m.Subsystems = append(m.Subsystems, opt) + numFound++ + } + if len(m.Subsystems) > 0 || all { + res = append(res, m) + } + if !all && numFound >= len(ss) { + break + } + } + return res, nil +} + +func getCgroupMountsV1(all bool) ([]Mount, error) { + mi, err := readCgroupMountinfo() + if err != nil { + return nil, err + } + + allSubsystems, err := ParseCgroupFile("/proc/self/cgroup") + if err != nil { + return nil, err + } + + allMap := make(map[string]bool) + for s := range allSubsystems { + allMap[s] = false + } + + return getCgroupMountsHelper(allMap, mi, all) +} + +// GetOwnCgroup returns the relative path to the cgroup docker is running in. +func GetOwnCgroup(subsystem string) (string, error) { + if IsCgroup2UnifiedMode() { + return "", errUnified + } + cgroups, err := ParseCgroupFile("/proc/self/cgroup") + if err != nil { + return "", err + } + + return getControllerPath(subsystem, cgroups) +} + +func GetOwnCgroupPath(subsystem string) (string, error) { + cgroup, err := GetOwnCgroup(subsystem) + if err != nil { + return "", err + } + + // If subsystem is empty, we look for the cgroupv2 hybrid path. + if len(subsystem) == 0 { + return hybridMountpoint, nil + } + + return getCgroupPathHelper(subsystem, cgroup) +} + +func GetInitCgroup(subsystem string) (string, error) { + if IsCgroup2UnifiedMode() { + return "", errUnified + } + cgroups, err := ParseCgroupFile("/proc/1/cgroup") + if err != nil { + return "", err + } + + return getControllerPath(subsystem, cgroups) +} + +func GetInitCgroupPath(subsystem string) (string, error) { + cgroup, err := GetInitCgroup(subsystem) + if err != nil { + return "", err + } + + return getCgroupPathHelper(subsystem, cgroup) +} + +func getCgroupPathHelper(subsystem, cgroup string) (string, error) { + mnt, root, err := FindCgroupMountpointAndRoot("", subsystem) + if err != nil { + return "", err + } + + // This is needed for nested containers, because in /proc/self/cgroup we + // see paths from host, which don't exist in container. + relCgroup, err := filepath.Rel(root, cgroup) + if err != nil { + return "", err + } + + return filepath.Join(mnt, relCgroup), nil +} + +func getControllerPath(subsystem string, cgroups map[string]string) (string, error) { + if IsCgroup2UnifiedMode() { + return "", errUnified + } + + if p, ok := cgroups[subsystem]; ok { + return p, nil + } + + if p, ok := cgroups[CgroupNamePrefix+subsystem]; ok { + return p, nil + } + + return "", NewNotFoundError(subsystem) +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/blkio_device.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/blkio_device.go new file mode 100644 index 000000000..fa195bf90 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/blkio_device.go @@ -0,0 +1,66 @@ +package configs + +import "fmt" + +// blockIODevice holds major:minor format supported in blkio cgroup +type blockIODevice struct { + // Major is the device's major number + Major int64 `json:"major"` + // Minor is the device's minor number + Minor int64 `json:"minor"` +} + +// WeightDevice struct holds a `major:minor weight`|`major:minor leaf_weight` pair +type WeightDevice struct { + blockIODevice + // Weight is the bandwidth rate for the device, range is from 10 to 1000 + Weight uint16 `json:"weight"` + // LeafWeight is the bandwidth rate for the device while competing with the cgroup's child cgroups, range is from 10 to 1000, cfq scheduler only + LeafWeight uint16 `json:"leafWeight"` +} + +// NewWeightDevice returns a configured WeightDevice pointer +func NewWeightDevice(major, minor int64, weight, leafWeight uint16) *WeightDevice { + wd := &WeightDevice{} + wd.Major = major + wd.Minor = minor + wd.Weight = weight + wd.LeafWeight = leafWeight + return wd +} + +// WeightString formats the struct to be writable to the cgroup specific file +func (wd *WeightDevice) WeightString() string { + return fmt.Sprintf("%d:%d %d", wd.Major, wd.Minor, wd.Weight) +} + +// LeafWeightString formats the struct to be writable to the cgroup specific file +func (wd *WeightDevice) LeafWeightString() string { + return fmt.Sprintf("%d:%d %d", wd.Major, wd.Minor, wd.LeafWeight) +} + +// ThrottleDevice struct holds a `major:minor rate_per_second` pair +type ThrottleDevice struct { + blockIODevice + // Rate is the IO rate limit per cgroup per device + Rate uint64 `json:"rate"` +} + +// NewThrottleDevice returns a configured ThrottleDevice pointer +func NewThrottleDevice(major, minor int64, rate uint64) *ThrottleDevice { + td := &ThrottleDevice{} + td.Major = major + td.Minor = minor + td.Rate = rate + return td +} + +// String formats the struct to be writable to the cgroup specific file +func (td *ThrottleDevice) String() string { + return fmt.Sprintf("%d:%d %d", td.Major, td.Minor, td.Rate) +} + +// StringName formats the struct to be writable to the cgroup specific file +func (td *ThrottleDevice) StringName(name string) string { + return fmt.Sprintf("%d:%d %s=%d", td.Major, td.Minor, name, td.Rate) +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_linux.go new file mode 100644 index 000000000..2d4a89871 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_linux.go @@ -0,0 +1,158 @@ +package configs + +import ( + systemdDbus "github.com/coreos/go-systemd/v22/dbus" + "github.com/opencontainers/runc/libcontainer/devices" +) + +type FreezerState string + +const ( + Undefined FreezerState = "" + Frozen FreezerState = "FROZEN" + Thawed FreezerState = "THAWED" +) + +// Cgroup holds properties of a cgroup on Linux. +type Cgroup struct { + // Name specifies the name of the cgroup + Name string `json:"name,omitempty"` + + // Parent specifies the name of parent of cgroup or slice + Parent string `json:"parent,omitempty"` + + // Path specifies the path to cgroups that are created and/or joined by the container. + // The path is assumed to be relative to the host system cgroup mountpoint. + Path string `json:"path"` + + // ScopePrefix describes prefix for the scope name + ScopePrefix string `json:"scope_prefix"` + + // Resources contains various cgroups settings to apply + *Resources + + // Systemd tells if systemd should be used to manage cgroups. + Systemd bool + + // SystemdProps are any additional properties for systemd, + // derived from org.systemd.property.xxx annotations. + // Ignored unless systemd is used for managing cgroups. + SystemdProps []systemdDbus.Property `json:"-"` + + // Rootless tells if rootless cgroups should be used. + Rootless bool + + // The host UID that should own the cgroup, or nil to accept + // the default ownership. This should only be set when the + // cgroupfs is to be mounted read/write. + // Not all cgroup manager implementations support changing + // the ownership. + OwnerUID *int `json:"owner_uid,omitempty"` +} + +type Resources struct { + // Devices is the set of access rules for devices in the container. + Devices []*devices.Rule `json:"devices"` + + // Memory limit (in bytes) + Memory int64 `json:"memory"` + + // Memory reservation or soft_limit (in bytes) + MemoryReservation int64 `json:"memory_reservation"` + + // Total memory usage (memory + swap); set `-1` to enable unlimited swap + MemorySwap int64 `json:"memory_swap"` + + // CPU shares (relative weight vs. other containers) + CpuShares uint64 `json:"cpu_shares"` + + // CPU hardcap limit (in usecs). Allowed cpu time in a given period. + CpuQuota int64 `json:"cpu_quota"` + + // CPU period to be used for hardcapping (in usecs). 0 to use system default. + CpuPeriod uint64 `json:"cpu_period"` + + // How many time CPU will use in realtime scheduling (in usecs). + CpuRtRuntime int64 `json:"cpu_rt_quota"` + + // CPU period to be used for realtime scheduling (in usecs). + CpuRtPeriod uint64 `json:"cpu_rt_period"` + + // CPU to use + CpusetCpus string `json:"cpuset_cpus"` + + // MEM to use + CpusetMems string `json:"cpuset_mems"` + + // Process limit; set <= `0' to disable limit. + PidsLimit int64 `json:"pids_limit"` + + // Specifies per cgroup weight, range is from 10 to 1000. + BlkioWeight uint16 `json:"blkio_weight"` + + // Specifies tasks' weight in the given cgroup while competing with the cgroup's child cgroups, range is from 10 to 1000, cfq scheduler only + BlkioLeafWeight uint16 `json:"blkio_leaf_weight"` + + // Weight per cgroup per device, can override BlkioWeight. + BlkioWeightDevice []*WeightDevice `json:"blkio_weight_device"` + + // IO read rate limit per cgroup per device, bytes per second. + BlkioThrottleReadBpsDevice []*ThrottleDevice `json:"blkio_throttle_read_bps_device"` + + // IO write rate limit per cgroup per device, bytes per second. + BlkioThrottleWriteBpsDevice []*ThrottleDevice `json:"blkio_throttle_write_bps_device"` + + // IO read rate limit per cgroup per device, IO per second. + BlkioThrottleReadIOPSDevice []*ThrottleDevice `json:"blkio_throttle_read_iops_device"` + + // IO write rate limit per cgroup per device, IO per second. + BlkioThrottleWriteIOPSDevice []*ThrottleDevice `json:"blkio_throttle_write_iops_device"` + + // set the freeze value for the process + Freezer FreezerState `json:"freezer"` + + // Hugetlb limit (in bytes) + HugetlbLimit []*HugepageLimit `json:"hugetlb_limit"` + + // Whether to disable OOM Killer + OomKillDisable bool `json:"oom_kill_disable"` + + // Tuning swappiness behaviour per cgroup + MemorySwappiness *uint64 `json:"memory_swappiness"` + + // Set priority of network traffic for container + NetPrioIfpriomap []*IfPrioMap `json:"net_prio_ifpriomap"` + + // Set class identifier for container's network packets + NetClsClassid uint32 `json:"net_cls_classid_u"` + + // Rdma resource restriction configuration + Rdma map[string]LinuxRdma `json:"rdma"` + + // Used on cgroups v2: + + // CpuWeight sets a proportional bandwidth limit. + CpuWeight uint64 `json:"cpu_weight"` + + // Unified is cgroupv2-only key-value map. + Unified map[string]string `json:"unified"` + + // SkipDevices allows to skip configuring device permissions. + // Used by e.g. kubelet while creating a parent cgroup (kubepods) + // common for many containers, and by runc update. + // + // NOTE it is impossible to start a container which has this flag set. + SkipDevices bool `json:"-"` + + // SkipFreezeOnSet is a flag for cgroup manager to skip the cgroup + // freeze when setting resources. Only applicable to systemd legacy + // (i.e. cgroup v1) manager (which uses freeze by default to avoid + // spurious permission errors caused by systemd inability to update + // device rules in a non-disruptive manner). + // + // If not set, a few methods (such as looking into cgroup's + // devices.list and querying the systemd unit properties) are used + // during Set() to figure out whether the freeze is required. Those + // methods may be relatively slow, thus this flag. + SkipFreezeOnSet bool `json:"-"` +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_unsupported.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_unsupported.go new file mode 100644 index 000000000..7e383020f --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_unsupported.go @@ -0,0 +1,9 @@ +//go:build !linux +// +build !linux + +package configs + +// Cgroup holds properties of a cgroup on Linux +// TODO Windows: This can ultimately be entirely factored out on Windows as +// cgroups are a Unix-specific construct. +type Cgroup struct{} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/config.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/config.go new file mode 100644 index 000000000..c1b4a0041 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/config.go @@ -0,0 +1,414 @@ +package configs + +import ( + "bytes" + "encoding/json" + "fmt" + "os/exec" + "time" + + "github.com/sirupsen/logrus" + + "github.com/opencontainers/runc/libcontainer/devices" + "github.com/opencontainers/runtime-spec/specs-go" +) + +type Rlimit struct { + Type int `json:"type"` + Hard uint64 `json:"hard"` + Soft uint64 `json:"soft"` +} + +// IDMap represents UID/GID Mappings for User Namespaces. +type IDMap struct { + ContainerID int `json:"container_id"` + HostID int `json:"host_id"` + Size int `json:"size"` +} + +// Seccomp represents syscall restrictions +// By default, only the native architecture of the kernel is allowed to be used +// for syscalls. Additional architectures can be added by specifying them in +// Architectures. +type Seccomp struct { + DefaultAction Action `json:"default_action"` + Architectures []string `json:"architectures"` + Syscalls []*Syscall `json:"syscalls"` + DefaultErrnoRet *uint `json:"default_errno_ret"` + ListenerPath string `json:"listener_path,omitempty"` + ListenerMetadata string `json:"listener_metadata,omitempty"` +} + +// Action is taken upon rule match in Seccomp +type Action int + +const ( + Kill Action = iota + 1 + Errno + Trap + Allow + Trace + Log + Notify + KillThread + KillProcess +) + +// Operator is a comparison operator to be used when matching syscall arguments in Seccomp +type Operator int + +const ( + EqualTo Operator = iota + 1 + NotEqualTo + GreaterThan + GreaterThanOrEqualTo + LessThan + LessThanOrEqualTo + MaskEqualTo +) + +// Arg is a rule to match a specific syscall argument in Seccomp +type Arg struct { + Index uint `json:"index"` + Value uint64 `json:"value"` + ValueTwo uint64 `json:"value_two"` + Op Operator `json:"op"` +} + +// Syscall is a rule to match a syscall in Seccomp +type Syscall struct { + Name string `json:"name"` + Action Action `json:"action"` + ErrnoRet *uint `json:"errnoRet"` + Args []*Arg `json:"args"` +} + +// TODO Windows. Many of these fields should be factored out into those parts +// which are common across platforms, and those which are platform specific. + +// Config defines configuration options for executing a process inside a contained environment. +type Config struct { + // NoPivotRoot will use MS_MOVE and a chroot to jail the process into the container's rootfs + // This is a common option when the container is running in ramdisk + NoPivotRoot bool `json:"no_pivot_root"` + + // ParentDeathSignal specifies the signal that is sent to the container's process in the case + // that the parent process dies. + ParentDeathSignal int `json:"parent_death_signal"` + + // Path to a directory containing the container's root filesystem. + Rootfs string `json:"rootfs"` + + // Umask is the umask to use inside of the container. + Umask *uint32 `json:"umask"` + + // Readonlyfs will remount the container's rootfs as readonly where only externally mounted + // bind mounts are writtable. + Readonlyfs bool `json:"readonlyfs"` + + // Specifies the mount propagation flags to be applied to /. + RootPropagation int `json:"rootPropagation"` + + // Mounts specify additional source and destination paths that will be mounted inside the container's + // rootfs and mount namespace if specified + Mounts []*Mount `json:"mounts"` + + // The device nodes that should be automatically created within the container upon container start. Note, make sure that the node is marked as allowed in the cgroup as well! + Devices []*devices.Device `json:"devices"` + + MountLabel string `json:"mount_label"` + + // Hostname optionally sets the container's hostname if provided + Hostname string `json:"hostname"` + + // Namespaces specifies the container's namespaces that it should setup when cloning the init process + // If a namespace is not provided that namespace is shared from the container's parent process + Namespaces Namespaces `json:"namespaces"` + + // Capabilities specify the capabilities to keep when executing the process inside the container + // All capabilities not specified will be dropped from the processes capability mask + Capabilities *Capabilities `json:"capabilities"` + + // Networks specifies the container's network setup to be created + Networks []*Network `json:"networks"` + + // Routes can be specified to create entries in the route table as the container is started + Routes []*Route `json:"routes"` + + // Cgroups specifies specific cgroup settings for the various subsystems that the container is + // placed into to limit the resources the container has available + Cgroups *Cgroup `json:"cgroups"` + + // AppArmorProfile specifies the profile to apply to the process running in the container and is + // change at the time the process is execed + AppArmorProfile string `json:"apparmor_profile,omitempty"` + + // ProcessLabel specifies the label to apply to the process running in the container. It is + // commonly used by selinux + ProcessLabel string `json:"process_label,omitempty"` + + // Rlimits specifies the resource limits, such as max open files, to set in the container + // If Rlimits are not set, the container will inherit rlimits from the parent process + Rlimits []Rlimit `json:"rlimits,omitempty"` + + // OomScoreAdj specifies the adjustment to be made by the kernel when calculating oom scores + // for a process. Valid values are between the range [-1000, '1000'], where processes with + // higher scores are preferred for being killed. If it is unset then we don't touch the current + // value. + // More information about kernel oom score calculation here: https://lwn.net/Articles/317814/ + OomScoreAdj *int `json:"oom_score_adj,omitempty"` + + // UidMappings is an array of User ID mappings for User Namespaces + UidMappings []IDMap `json:"uid_mappings"` + + // GidMappings is an array of Group ID mappings for User Namespaces + GidMappings []IDMap `json:"gid_mappings"` + + // MaskPaths specifies paths within the container's rootfs to mask over with a bind + // mount pointing to /dev/null as to prevent reads of the file. + MaskPaths []string `json:"mask_paths"` + + // ReadonlyPaths specifies paths within the container's rootfs to remount as read-only + // so that these files prevent any writes. + ReadonlyPaths []string `json:"readonly_paths"` + + // Sysctl is a map of properties and their values. It is the equivalent of using + // sysctl -w my.property.name value in Linux. + Sysctl map[string]string `json:"sysctl"` + + // Seccomp allows actions to be taken whenever a syscall is made within the container. + // A number of rules are given, each having an action to be taken if a syscall matches it. + // A default action to be taken if no rules match is also given. + Seccomp *Seccomp `json:"seccomp"` + + // NoNewPrivileges controls whether processes in the container can gain additional privileges. + NoNewPrivileges bool `json:"no_new_privileges,omitempty"` + + // Hooks are a collection of actions to perform at various container lifecycle events. + // CommandHooks are serialized to JSON, but other hooks are not. + Hooks Hooks + + // Version is the version of opencontainer specification that is supported. + Version string `json:"version"` + + // Labels are user defined metadata that is stored in the config and populated on the state + Labels []string `json:"labels"` + + // NoNewKeyring will not allocated a new session keyring for the container. It will use the + // callers keyring in this case. + NoNewKeyring bool `json:"no_new_keyring"` + + // IntelRdt specifies settings for Intel RDT group that the container is placed into + // to limit the resources (e.g., L3 cache, memory bandwidth) the container has available + IntelRdt *IntelRdt `json:"intel_rdt,omitempty"` + + // RootlessEUID is set when the runc was launched with non-zero EUID. + // Note that RootlessEUID is set to false when launched with EUID=0 in userns. + // When RootlessEUID is set, runc creates a new userns for the container. + // (config.json needs to contain userns settings) + RootlessEUID bool `json:"rootless_euid,omitempty"` + + // RootlessCgroups is set when unlikely to have the full access to cgroups. + // When RootlessCgroups is set, cgroups errors are ignored. + RootlessCgroups bool `json:"rootless_cgroups,omitempty"` +} + +type ( + HookName string + HookList []Hook + Hooks map[HookName]HookList +) + +const ( + // Prestart commands are executed after the container namespaces are created, + // but before the user supplied command is executed from init. + // Note: This hook is now deprecated + // Prestart commands are called in the Runtime namespace. + Prestart HookName = "prestart" + + // CreateRuntime commands MUST be called as part of the create operation after + // the runtime environment has been created but before the pivot_root has been executed. + // CreateRuntime is called immediately after the deprecated Prestart hook. + // CreateRuntime commands are called in the Runtime Namespace. + CreateRuntime HookName = "createRuntime" + + // CreateContainer commands MUST be called as part of the create operation after + // the runtime environment has been created but before the pivot_root has been executed. + // CreateContainer commands are called in the Container namespace. + CreateContainer HookName = "createContainer" + + // StartContainer commands MUST be called as part of the start operation and before + // the container process is started. + // StartContainer commands are called in the Container namespace. + StartContainer HookName = "startContainer" + + // Poststart commands are executed after the container init process starts. + // Poststart commands are called in the Runtime Namespace. + Poststart HookName = "poststart" + + // Poststop commands are executed after the container init process exits. + // Poststop commands are called in the Runtime Namespace. + Poststop HookName = "poststop" +) + +// KnownHookNames returns the known hook names. +// Used by `runc features`. +func KnownHookNames() []string { + return []string{ + string(Prestart), // deprecated + string(CreateRuntime), + string(CreateContainer), + string(StartContainer), + string(Poststart), + string(Poststop), + } +} + +type Capabilities struct { + // Bounding is the set of capabilities checked by the kernel. + Bounding []string + // Effective is the set of capabilities checked by the kernel. + Effective []string + // Inheritable is the capabilities preserved across execve. + Inheritable []string + // Permitted is the limiting superset for effective capabilities. + Permitted []string + // Ambient is the ambient set of capabilities that are kept. + Ambient []string +} + +func (hooks HookList) RunHooks(state *specs.State) error { + for i, h := range hooks { + if err := h.Run(state); err != nil { + return fmt.Errorf("error running hook #%d: %w", i, err) + } + } + + return nil +} + +func (hooks *Hooks) UnmarshalJSON(b []byte) error { + var state map[HookName][]CommandHook + + if err := json.Unmarshal(b, &state); err != nil { + return err + } + + *hooks = Hooks{} + for n, commandHooks := range state { + if len(commandHooks) == 0 { + continue + } + + (*hooks)[n] = HookList{} + for _, h := range commandHooks { + (*hooks)[n] = append((*hooks)[n], h) + } + } + + return nil +} + +func (hooks *Hooks) MarshalJSON() ([]byte, error) { + serialize := func(hooks []Hook) (serializableHooks []CommandHook) { + for _, hook := range hooks { + switch chook := hook.(type) { + case CommandHook: + serializableHooks = append(serializableHooks, chook) + default: + logrus.Warnf("cannot serialize hook of type %T, skipping", hook) + } + } + + return serializableHooks + } + + return json.Marshal(map[string]interface{}{ + "prestart": serialize((*hooks)[Prestart]), + "createRuntime": serialize((*hooks)[CreateRuntime]), + "createContainer": serialize((*hooks)[CreateContainer]), + "startContainer": serialize((*hooks)[StartContainer]), + "poststart": serialize((*hooks)[Poststart]), + "poststop": serialize((*hooks)[Poststop]), + }) +} + +type Hook interface { + // Run executes the hook with the provided state. + Run(*specs.State) error +} + +// NewFunctionHook will call the provided function when the hook is run. +func NewFunctionHook(f func(*specs.State) error) FuncHook { + return FuncHook{ + run: f, + } +} + +type FuncHook struct { + run func(*specs.State) error +} + +func (f FuncHook) Run(s *specs.State) error { + return f.run(s) +} + +type Command struct { + Path string `json:"path"` + Args []string `json:"args"` + Env []string `json:"env"` + Dir string `json:"dir"` + Timeout *time.Duration `json:"timeout"` +} + +// NewCommandHook will execute the provided command when the hook is run. +func NewCommandHook(cmd Command) CommandHook { + return CommandHook{ + Command: cmd, + } +} + +type CommandHook struct { + Command +} + +func (c Command) Run(s *specs.State) error { + b, err := json.Marshal(s) + if err != nil { + return err + } + var stdout, stderr bytes.Buffer + cmd := exec.Cmd{ + Path: c.Path, + Args: c.Args, + Env: c.Env, + Stdin: bytes.NewReader(b), + Stdout: &stdout, + Stderr: &stderr, + } + if err := cmd.Start(); err != nil { + return err + } + errC := make(chan error, 1) + go func() { + err := cmd.Wait() + if err != nil { + err = fmt.Errorf("error running hook: %w, stdout: %s, stderr: %s", err, stdout.String(), stderr.String()) + } + errC <- err + }() + var timerCh <-chan time.Time + if c.Timeout != nil { + timer := time.NewTimer(*c.Timeout) + defer timer.Stop() + timerCh = timer.C + } + select { + case err := <-errC: + return err + case <-timerCh: + _ = cmd.Process.Kill() + <-errC + return fmt.Errorf("hook ran past specified timeout of %.1fs", c.Timeout.Seconds()) + } +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/config_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/config_linux.go new file mode 100644 index 000000000..8c02848b7 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/config_linux.go @@ -0,0 +1,68 @@ +package configs + +import "errors" + +var ( + errNoUIDMap = errors.New("User namespaces enabled, but no uid mappings found.") + errNoUserMap = errors.New("User namespaces enabled, but no user mapping found.") + errNoGIDMap = errors.New("User namespaces enabled, but no gid mappings found.") + errNoGroupMap = errors.New("User namespaces enabled, but no group mapping found.") +) + +// HostUID gets the translated uid for the process on host which could be +// different when user namespaces are enabled. +func (c Config) HostUID(containerId int) (int, error) { + if c.Namespaces.Contains(NEWUSER) { + if c.UidMappings == nil { + return -1, errNoUIDMap + } + id, found := c.hostIDFromMapping(containerId, c.UidMappings) + if !found { + return -1, errNoUserMap + } + return id, nil + } + // Return unchanged id. + return containerId, nil +} + +// HostRootUID gets the root uid for the process on host which could be non-zero +// when user namespaces are enabled. +func (c Config) HostRootUID() (int, error) { + return c.HostUID(0) +} + +// HostGID gets the translated gid for the process on host which could be +// different when user namespaces are enabled. +func (c Config) HostGID(containerId int) (int, error) { + if c.Namespaces.Contains(NEWUSER) { + if c.GidMappings == nil { + return -1, errNoGIDMap + } + id, found := c.hostIDFromMapping(containerId, c.GidMappings) + if !found { + return -1, errNoGroupMap + } + return id, nil + } + // Return unchanged id. + return containerId, nil +} + +// HostRootGID gets the root gid for the process on host which could be non-zero +// when user namespaces are enabled. +func (c Config) HostRootGID() (int, error) { + return c.HostGID(0) +} + +// Utility function that gets a host ID for a container ID from user namespace map +// if that ID is present in the map. +func (c Config) hostIDFromMapping(containerID int, uMap []IDMap) (int, bool) { + for _, m := range uMap { + if (containerID >= m.ContainerID) && (containerID <= (m.ContainerID + m.Size - 1)) { + hostID := m.HostID + (containerID - m.ContainerID) + return hostID, true + } + } + return -1, false +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/configs_fuzzer.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/configs_fuzzer.go new file mode 100644 index 000000000..bce829e29 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/configs_fuzzer.go @@ -0,0 +1,10 @@ +//go:build gofuzz +// +build gofuzz + +package configs + +func FuzzUnmarshalJSON(data []byte) int { + hooks := Hooks{} + _ = hooks.UnmarshalJSON(data) + return 1 +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/hugepage_limit.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/hugepage_limit.go new file mode 100644 index 000000000..d30216380 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/hugepage_limit.go @@ -0,0 +1,9 @@ +package configs + +type HugepageLimit struct { + // which type of hugepage to limit. + Pagesize string `json:"page_size"` + + // usage limit for hugepage. + Limit uint64 `json:"limit"` +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/intelrdt.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/intelrdt.go new file mode 100644 index 000000000..f8d951ab8 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/intelrdt.go @@ -0,0 +1,16 @@ +package configs + +type IntelRdt struct { + // The identity for RDT Class of Service + ClosID string `json:"closID,omitempty"` + + // The schema for L3 cache id and capacity bitmask (CBM) + // Format: "L3:=;=;..." + L3CacheSchema string `json:"l3_cache_schema,omitempty"` + + // The schema of memory bandwidth per L3 cache id + // Format: "MB:=bandwidth0;=bandwidth1;..." + // The unit of memory bandwidth is specified in "percentages" by + // default, and in "MBps" if MBA Software Controller is enabled. + MemBwSchema string `json:"memBwSchema,omitempty"` +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/interface_priority_map.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/interface_priority_map.go new file mode 100644 index 000000000..9a0395eaf --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/interface_priority_map.go @@ -0,0 +1,14 @@ +package configs + +import ( + "fmt" +) + +type IfPrioMap struct { + Interface string `json:"interface"` + Priority int64 `json:"priority"` +} + +func (i *IfPrioMap) CgroupString() string { + return fmt.Sprintf("%s %d", i.Interface, i.Priority) +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/mount.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/mount.go new file mode 100644 index 000000000..784c61820 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/mount.go @@ -0,0 +1,48 @@ +package configs + +import "golang.org/x/sys/unix" + +const ( + // EXT_COPYUP is a directive to copy up the contents of a directory when + // a tmpfs is mounted over it. + EXT_COPYUP = 1 << iota //nolint:golint // ignore "don't use ALL_CAPS" warning +) + +type Mount struct { + // Source path for the mount. + Source string `json:"source"` + + // Destination path for the mount inside the container. + Destination string `json:"destination"` + + // Device the mount is for. + Device string `json:"device"` + + // Mount flags. + Flags int `json:"flags"` + + // Propagation Flags + PropagationFlags []int `json:"propagation_flags"` + + // Mount data applied to the mount. + Data string `json:"data"` + + // Relabel source if set, "z" indicates shared, "Z" indicates unshared. + Relabel string `json:"relabel"` + + // RecAttr represents mount properties to be applied recursively (AT_RECURSIVE), see mount_setattr(2). + RecAttr *unix.MountAttr `json:"rec_attr"` + + // Extensions are additional flags that are specific to runc. + Extensions int `json:"extensions"` + + // Optional Command to be run before Source is mounted. + PremountCmds []Command `json:"premount_cmds"` + + // Optional Command to be run after Source is mounted. + PostmountCmds []Command `json:"postmount_cmds"` +} + +func (m *Mount) IsBind() bool { + return m.Flags&unix.MS_BIND != 0 +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces.go new file mode 100644 index 000000000..a3329a31a --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces.go @@ -0,0 +1,5 @@ +package configs + +type NamespaceType string + +type Namespaces []Namespace diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_linux.go new file mode 100644 index 000000000..d52d6fcd1 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_linux.go @@ -0,0 +1,126 @@ +package configs + +import ( + "fmt" + "os" + "sync" +) + +const ( + NEWNET NamespaceType = "NEWNET" + NEWPID NamespaceType = "NEWPID" + NEWNS NamespaceType = "NEWNS" + NEWUTS NamespaceType = "NEWUTS" + NEWIPC NamespaceType = "NEWIPC" + NEWUSER NamespaceType = "NEWUSER" + NEWCGROUP NamespaceType = "NEWCGROUP" +) + +var ( + nsLock sync.Mutex + supportedNamespaces = make(map[NamespaceType]bool) +) + +// NsName converts the namespace type to its filename +func NsName(ns NamespaceType) string { + switch ns { + case NEWNET: + return "net" + case NEWNS: + return "mnt" + case NEWPID: + return "pid" + case NEWIPC: + return "ipc" + case NEWUSER: + return "user" + case NEWUTS: + return "uts" + case NEWCGROUP: + return "cgroup" + } + return "" +} + +// IsNamespaceSupported returns whether a namespace is available or +// not +func IsNamespaceSupported(ns NamespaceType) bool { + nsLock.Lock() + defer nsLock.Unlock() + supported, ok := supportedNamespaces[ns] + if ok { + return supported + } + nsFile := NsName(ns) + // if the namespace type is unknown, just return false + if nsFile == "" { + return false + } + _, err := os.Stat("/proc/self/ns/" + nsFile) + // a namespace is supported if it exists and we have permissions to read it + supported = err == nil + supportedNamespaces[ns] = supported + return supported +} + +func NamespaceTypes() []NamespaceType { + return []NamespaceType{ + NEWUSER, // Keep user NS always first, don't move it. + NEWIPC, + NEWUTS, + NEWNET, + NEWPID, + NEWNS, + NEWCGROUP, + } +} + +// Namespace defines configuration for each namespace. It specifies an +// alternate path that is able to be joined via setns. +type Namespace struct { + Type NamespaceType `json:"type"` + Path string `json:"path"` +} + +func (n *Namespace) GetPath(pid int) string { + return fmt.Sprintf("/proc/%d/ns/%s", pid, NsName(n.Type)) +} + +func (n *Namespaces) Remove(t NamespaceType) bool { + i := n.index(t) + if i == -1 { + return false + } + *n = append((*n)[:i], (*n)[i+1:]...) + return true +} + +func (n *Namespaces) Add(t NamespaceType, path string) { + i := n.index(t) + if i == -1 { + *n = append(*n, Namespace{Type: t, Path: path}) + return + } + (*n)[i].Path = path +} + +func (n *Namespaces) index(t NamespaceType) int { + for i, ns := range *n { + if ns.Type == t { + return i + } + } + return -1 +} + +func (n *Namespaces) Contains(t NamespaceType) bool { + return n.index(t) != -1 +} + +func (n *Namespaces) PathOf(t NamespaceType) string { + i := n.index(t) + if i == -1 { + return "" + } + return (*n)[i].Path +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_syscall.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_syscall.go new file mode 100644 index 000000000..0516dba8d --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_syscall.go @@ -0,0 +1,33 @@ +//go:build linux +// +build linux + +package configs + +import "golang.org/x/sys/unix" + +func (n *Namespace) Syscall() int { + return namespaceInfo[n.Type] +} + +var namespaceInfo = map[NamespaceType]int{ + NEWNET: unix.CLONE_NEWNET, + NEWNS: unix.CLONE_NEWNS, + NEWUSER: unix.CLONE_NEWUSER, + NEWIPC: unix.CLONE_NEWIPC, + NEWUTS: unix.CLONE_NEWUTS, + NEWPID: unix.CLONE_NEWPID, + NEWCGROUP: unix.CLONE_NEWCGROUP, +} + +// CloneFlags parses the container's Namespaces options to set the correct +// flags on clone, unshare. This function returns flags only for new namespaces. +func (n *Namespaces) CloneFlags() uintptr { + var flag int + for _, v := range *n { + if v.Path != "" { + continue + } + flag |= namespaceInfo[v.Type] + } + return uintptr(flag) +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_syscall_unsupported.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_syscall_unsupported.go new file mode 100644 index 000000000..fbb0d4907 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_syscall_unsupported.go @@ -0,0 +1,14 @@ +//go:build !linux && !windows +// +build !linux,!windows + +package configs + +func (n *Namespace) Syscall() int { + panic("No namespace syscall support") +} + +// CloneFlags parses the container's Namespaces options to set the correct +// flags on clone, unshare. This function returns flags only for new namespaces. +func (n *Namespaces) CloneFlags() uintptr { + panic("No namespace syscall support") +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_unsupported.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_unsupported.go new file mode 100644 index 000000000..946db30a5 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_unsupported.go @@ -0,0 +1,8 @@ +//go:build !linux +// +build !linux + +package configs + +// Namespace defines configuration for each namespace. It specifies an +// alternate path that is able to be joined via setns. +type Namespace struct{} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/network.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/network.go new file mode 100644 index 000000000..c44c3ea71 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/network.go @@ -0,0 +1,75 @@ +package configs + +// Network defines configuration for a container's networking stack +// +// The network configuration can be omitted from a container causing the +// container to be setup with the host's networking stack +type Network struct { + // Type sets the networks type, commonly veth and loopback + Type string `json:"type"` + + // Name of the network interface + Name string `json:"name"` + + // The bridge to use. + Bridge string `json:"bridge"` + + // MacAddress contains the MAC address to set on the network interface + MacAddress string `json:"mac_address"` + + // Address contains the IPv4 and mask to set on the network interface + Address string `json:"address"` + + // Gateway sets the gateway address that is used as the default for the interface + Gateway string `json:"gateway"` + + // IPv6Address contains the IPv6 and mask to set on the network interface + IPv6Address string `json:"ipv6_address"` + + // IPv6Gateway sets the ipv6 gateway address that is used as the default for the interface + IPv6Gateway string `json:"ipv6_gateway"` + + // Mtu sets the mtu value for the interface and will be mirrored on both the host and + // container's interfaces if a pair is created, specifically in the case of type veth + // Note: This does not apply to loopback interfaces. + Mtu int `json:"mtu"` + + // TxQueueLen sets the tx_queuelen value for the interface and will be mirrored on both the host and + // container's interfaces if a pair is created, specifically in the case of type veth + // Note: This does not apply to loopback interfaces. + TxQueueLen int `json:"txqueuelen"` + + // HostInterfaceName is a unique name of a veth pair that resides on in the host interface of the + // container. + HostInterfaceName string `json:"host_interface_name"` + + // HairpinMode specifies if hairpin NAT should be enabled on the virtual interface + // bridge port in the case of type veth + // Note: This is unsupported on some systems. + // Note: This does not apply to loopback interfaces. + HairpinMode bool `json:"hairpin_mode"` +} + +// Route defines a routing table entry. +// +// Routes can be specified to create entries in the routing table as the container +// is started. +// +// All of destination, source, and gateway should be either IPv4 or IPv6. +// One of the three options must be present, and omitted entries will use their +// IP family default for the route table. For IPv4 for example, setting the +// gateway to 1.2.3.4 and the interface to eth0 will set up a standard +// destination of 0.0.0.0(or *) when viewed in the route table. +type Route struct { + // Destination specifies the destination IP address and mask in the CIDR form. + Destination string `json:"destination"` + + // Source specifies the source IP address and mask in the CIDR form. + Source string `json:"source"` + + // Gateway specifies the gateway IP address. + Gateway string `json:"gateway"` + + // InterfaceName specifies the device to set this route up for, for example eth0. + InterfaceName string `json:"interface_name"` +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/rdma.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/rdma.go new file mode 100644 index 000000000..c69f2c802 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/rdma.go @@ -0,0 +1,9 @@ +package configs + +// LinuxRdma for Linux cgroup 'rdma' resource management (Linux 4.11) +type LinuxRdma struct { + // Maximum number of HCA handles that can be opened. Default is "no limit". + HcaHandles *uint32 `json:"hca_handles,omitempty"` + // Maximum number of HCA objects that can be created. Default is "no limit". + HcaObjects *uint32 `json:"hca_objects,omitempty"` +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/userns/userns.go b/vendor/github.com/opencontainers/runc/libcontainer/userns/userns.go new file mode 100644 index 000000000..f6cb98e5e --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/userns/userns.go @@ -0,0 +1,5 @@ +package userns + +// RunningInUserNS detects whether we are currently running in a user namespace. +// Originally copied from github.com/lxc/lxd/shared/util.go +var RunningInUserNS = runningInUserNS diff --git a/vendor/github.com/opencontainers/runc/libcontainer/userns/userns_fuzzer.go b/vendor/github.com/opencontainers/runc/libcontainer/userns/userns_fuzzer.go new file mode 100644 index 000000000..1e00ab8b5 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/userns/userns_fuzzer.go @@ -0,0 +1,16 @@ +//go:build gofuzz +// +build gofuzz + +package userns + +import ( + "strings" + + "github.com/opencontainers/runc/libcontainer/user" +) + +func FuzzUIDMap(data []byte) int { + uidmap, _ := user.ParseIDMap(strings.NewReader(string(data))) + _ = uidMapInUserNS(uidmap) + return 1 +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/userns/userns_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/userns/userns_linux.go new file mode 100644 index 000000000..724e6df01 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/userns/userns_linux.go @@ -0,0 +1,37 @@ +package userns + +import ( + "sync" + + "github.com/opencontainers/runc/libcontainer/user" +) + +var ( + inUserNS bool + nsOnce sync.Once +) + +// runningInUserNS detects whether we are currently running in a user namespace. +// Originally copied from github.com/lxc/lxd/shared/util.go +func runningInUserNS() bool { + nsOnce.Do(func() { + uidmap, err := user.CurrentProcessUIDMap() + if err != nil { + // This kernel-provided file only exists if user namespaces are supported + return + } + inUserNS = uidMapInUserNS(uidmap) + }) + return inUserNS +} + +func uidMapInUserNS(uidmap []user.IDMap) bool { + /* + * We assume we are in the initial user namespace if we have a full + * range - 4294967295 uids starting at uid 0. + */ + if len(uidmap) == 1 && uidmap[0].ID == 0 && uidmap[0].ParentID == 0 && uidmap[0].Count == 4294967295 { + return false + } + return true +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/userns/userns_unsupported.go b/vendor/github.com/opencontainers/runc/libcontainer/userns/userns_unsupported.go new file mode 100644 index 000000000..f35c13a10 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/userns/userns_unsupported.go @@ -0,0 +1,18 @@ +//go:build !linux +// +build !linux + +package userns + +import "github.com/opencontainers/runc/libcontainer/user" + +// runningInUserNS is a stub for non-Linux systems +// Always returns false +func runningInUserNS() bool { + return false +} + +// uidMapInUserNS is a stub for non-Linux systems +// Always returns false +func uidMapInUserNS(uidmap []user.IDMap) bool { + return false +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/utils/cmsg.go b/vendor/github.com/opencontainers/runc/libcontainer/utils/cmsg.go new file mode 100644 index 000000000..7ef9da21f --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/utils/cmsg.go @@ -0,0 +1,96 @@ +package utils + +/* + * Copyright 2016, 2017 SUSE LLC + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import ( + "fmt" + "os" + + "golang.org/x/sys/unix" +) + +// MaxSendfdLen is the maximum length of the name of a file descriptor being +// sent using SendFd. The name of the file handle returned by RecvFd will never +// be larger than this value. +const MaxNameLen = 4096 + +// oobSpace is the size of the oob slice required to store a single FD. Note +// that unix.UnixRights appears to make the assumption that fd is always int32, +// so sizeof(fd) = 4. +var oobSpace = unix.CmsgSpace(4) + +// RecvFd waits for a file descriptor to be sent over the given AF_UNIX +// socket. The file name of the remote file descriptor will be recreated +// locally (it is sent as non-auxiliary data in the same payload). +func RecvFd(socket *os.File) (*os.File, error) { + // For some reason, unix.Recvmsg uses the length rather than the capacity + // when passing the msg_controllen and other attributes to recvmsg. So we + // have to actually set the length. + name := make([]byte, MaxNameLen) + oob := make([]byte, oobSpace) + + sockfd := socket.Fd() + n, oobn, _, _, err := unix.Recvmsg(int(sockfd), name, oob, 0) + if err != nil { + return nil, err + } + + if n >= MaxNameLen || oobn != oobSpace { + return nil, fmt.Errorf("recvfd: incorrect number of bytes read (n=%d oobn=%d)", n, oobn) + } + + // Truncate. + name = name[:n] + oob = oob[:oobn] + + scms, err := unix.ParseSocketControlMessage(oob) + if err != nil { + return nil, err + } + if len(scms) != 1 { + return nil, fmt.Errorf("recvfd: number of SCMs is not 1: %d", len(scms)) + } + scm := scms[0] + + fds, err := unix.ParseUnixRights(&scm) + if err != nil { + return nil, err + } + if len(fds) != 1 { + return nil, fmt.Errorf("recvfd: number of fds is not 1: %d", len(fds)) + } + fd := uintptr(fds[0]) + + return os.NewFile(fd, string(name)), nil +} + +// SendFd sends a file descriptor over the given AF_UNIX socket. In +// addition, the file.Name() of the given file will also be sent as +// non-auxiliary data in the same payload (allowing to send contextual +// information for a file descriptor). +func SendFd(socket *os.File, name string, fd uintptr) error { + if len(name) >= MaxNameLen { + return fmt.Errorf("sendfd: filename too long: %s", name) + } + return SendFds(socket, []byte(name), int(fd)) +} + +// SendFds sends a list of files descriptor and msg over the given AF_UNIX socket. +func SendFds(socket *os.File, msg []byte, fds ...int) error { + oob := unix.UnixRights(fds...) + return unix.Sendmsg(int(socket.Fd()), msg, oob, nil, 0) +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/utils/utils.go b/vendor/github.com/opencontainers/runc/libcontainer/utils/utils.go new file mode 100644 index 000000000..6b9fc3435 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/utils/utils.go @@ -0,0 +1,167 @@ +package utils + +import ( + "encoding/binary" + "encoding/json" + "fmt" + "io" + "os" + "path/filepath" + "strconv" + "strings" + "unsafe" + + securejoin "github.com/cyphar/filepath-securejoin" + "golang.org/x/sys/unix" +) + +const ( + exitSignalOffset = 128 +) + +// NativeEndian is the native byte order of the host system. +var NativeEndian binary.ByteOrder + +func init() { + // Copied from . + i := uint32(1) + b := (*[4]byte)(unsafe.Pointer(&i)) + if b[0] == 1 { + NativeEndian = binary.LittleEndian + } else { + NativeEndian = binary.BigEndian + } +} + +// ExitStatus returns the correct exit status for a process based on if it +// was signaled or exited cleanly +func ExitStatus(status unix.WaitStatus) int { + if status.Signaled() { + return exitSignalOffset + int(status.Signal()) + } + return status.ExitStatus() +} + +// WriteJSON writes the provided struct v to w using standard json marshaling +func WriteJSON(w io.Writer, v interface{}) error { + data, err := json.Marshal(v) + if err != nil { + return err + } + _, err = w.Write(data) + return err +} + +// CleanPath makes a path safe for use with filepath.Join. This is done by not +// only cleaning the path, but also (if the path is relative) adding a leading +// '/' and cleaning it (then removing the leading '/'). This ensures that a +// path resulting from prepending another path will always resolve to lexically +// be a subdirectory of the prefixed path. This is all done lexically, so paths +// that include symlinks won't be safe as a result of using CleanPath. +func CleanPath(path string) string { + // Deal with empty strings nicely. + if path == "" { + return "" + } + + // Ensure that all paths are cleaned (especially problematic ones like + // "/../../../../../" which can cause lots of issues). + path = filepath.Clean(path) + + // If the path isn't absolute, we need to do more processing to fix paths + // such as "../../../..//some/path". We also shouldn't convert absolute + // paths to relative ones. + if !filepath.IsAbs(path) { + path = filepath.Clean(string(os.PathSeparator) + path) + // This can't fail, as (by definition) all paths are relative to root. + path, _ = filepath.Rel(string(os.PathSeparator), path) + } + + // Clean the path again for good measure. + return filepath.Clean(path) +} + +// stripRoot returns the passed path, stripping the root path if it was +// (lexicially) inside it. Note that both passed paths will always be treated +// as absolute, and the returned path will also always be absolute. In +// addition, the paths are cleaned before stripping the root. +func stripRoot(root, path string) string { + // Make the paths clean and absolute. + root, path = CleanPath("/"+root), CleanPath("/"+path) + switch { + case path == root: + path = "/" + case root == "/": + // do nothing + case strings.HasPrefix(path, root+"/"): + path = strings.TrimPrefix(path, root+"/") + } + return CleanPath("/" + path) +} + +// WithProcfd runs the passed closure with a procfd path (/proc/self/fd/...) +// corresponding to the unsafePath resolved within the root. Before passing the +// fd, this path is verified to have been inside the root -- so operating on it +// through the passed fdpath should be safe. Do not access this path through +// the original path strings, and do not attempt to use the pathname outside of +// the passed closure (the file handle will be freed once the closure returns). +func WithProcfd(root, unsafePath string, fn func(procfd string) error) error { + // Remove the root then forcefully resolve inside the root. + unsafePath = stripRoot(root, unsafePath) + path, err := securejoin.SecureJoin(root, unsafePath) + if err != nil { + return fmt.Errorf("resolving path inside rootfs failed: %w", err) + } + + // Open the target path. + fh, err := os.OpenFile(path, unix.O_PATH|unix.O_CLOEXEC, 0) + if err != nil { + return fmt.Errorf("open o_path procfd: %w", err) + } + defer fh.Close() + + // Double-check the path is the one we expected. + procfd := "/proc/self/fd/" + strconv.Itoa(int(fh.Fd())) + if realpath, err := os.Readlink(procfd); err != nil { + return fmt.Errorf("procfd verification failed: %w", err) + } else if realpath != path { + return fmt.Errorf("possibly malicious path detected -- refusing to operate on %s", realpath) + } + + // Run the closure. + return fn(procfd) +} + +// SearchLabels searches a list of key-value pairs for the provided key and +// returns the corresponding value. The pairs must be separated with '='. +func SearchLabels(labels []string, query string) string { + for _, l := range labels { + parts := strings.SplitN(l, "=", 2) + if len(parts) < 2 { + continue + } + if parts[0] == query { + return parts[1] + } + } + return "" +} + +// Annotations returns the bundle path and user defined annotations from the +// libcontainer state. We need to remove the bundle because that is a label +// added by libcontainer. +func Annotations(labels []string) (bundle string, userAnnotations map[string]string) { + userAnnotations = make(map[string]string) + for _, l := range labels { + parts := strings.SplitN(l, "=", 2) + if len(parts) < 2 { + continue + } + if parts[0] == "bundle" { + bundle = parts[1] + } else { + userAnnotations[parts[0]] = parts[1] + } + } + return +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/utils/utils_unix.go b/vendor/github.com/opencontainers/runc/libcontainer/utils/utils_unix.go new file mode 100644 index 000000000..220d0b439 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/utils/utils_unix.go @@ -0,0 +1,69 @@ +//go:build !windows +// +build !windows + +package utils + +import ( + "fmt" + "os" + "strconv" + + "golang.org/x/sys/unix" +) + +// EnsureProcHandle returns whether or not the given file handle is on procfs. +func EnsureProcHandle(fh *os.File) error { + var buf unix.Statfs_t + if err := unix.Fstatfs(int(fh.Fd()), &buf); err != nil { + return fmt.Errorf("ensure %s is on procfs: %w", fh.Name(), err) + } + if buf.Type != unix.PROC_SUPER_MAGIC { + return fmt.Errorf("%s is not on procfs", fh.Name()) + } + return nil +} + +// CloseExecFrom applies O_CLOEXEC to all file descriptors currently open for +// the process (except for those below the given fd value). +func CloseExecFrom(minFd int) error { + fdDir, err := os.Open("/proc/self/fd") + if err != nil { + return err + } + defer fdDir.Close() + + if err := EnsureProcHandle(fdDir); err != nil { + return err + } + + fdList, err := fdDir.Readdirnames(-1) + if err != nil { + return err + } + for _, fdStr := range fdList { + fd, err := strconv.Atoi(fdStr) + // Ignore non-numeric file names. + if err != nil { + continue + } + // Ignore descriptors lower than our specified minimum. + if fd < minFd { + continue + } + // Intentionally ignore errors from unix.CloseOnExec -- the cases where + // this might fail are basically file descriptors that have already + // been closed (including and especially the one that was created when + // os.ReadDir did the "opendir" syscall). + unix.CloseOnExec(fd) + } + return nil +} + +// NewSockPair returns a new unix socket pair +func NewSockPair(name string) (parent *os.File, child *os.File, err error) { + fds, err := unix.Socketpair(unix.AF_LOCAL, unix.SOCK_STREAM|unix.SOCK_CLOEXEC, 0) + if err != nil { + return nil, nil, err + } + return os.NewFile(uintptr(fds[1]), name+"-p"), os.NewFile(uintptr(fds[0]), name+"-c"), nil +} diff --git a/vendor/modules.txt b/vendor/modules.txt index 609e7e1ed..93d8c4a47 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -350,8 +350,19 @@ github.com/opencontainers/image-spec/specs-go github.com/opencontainers/image-spec/specs-go/v1 # github.com/opencontainers/runc v1.1.8 ## explicit; go 1.17 +github.com/opencontainers/runc/libcontainer/cgroups +github.com/opencontainers/runc/libcontainer/cgroups/devices +github.com/opencontainers/runc/libcontainer/cgroups/ebpf +github.com/opencontainers/runc/libcontainer/cgroups/ebpf/devicefilter +github.com/opencontainers/runc/libcontainer/cgroups/fs +github.com/opencontainers/runc/libcontainer/cgroups/fs2 +github.com/opencontainers/runc/libcontainer/cgroups/fscommon +github.com/opencontainers/runc/libcontainer/cgroups/systemd +github.com/opencontainers/runc/libcontainer/configs github.com/opencontainers/runc/libcontainer/devices github.com/opencontainers/runc/libcontainer/user +github.com/opencontainers/runc/libcontainer/userns +github.com/opencontainers/runc/libcontainer/utils # github.com/opencontainers/runtime-spec v1.1.0 ## explicit github.com/opencontainers/runtime-spec/specs-go