Revert "Merge pull request 101888 from kolyshkin/update-runc-rc94"

This reverts commit b1b06fe0a4, reversing
changes made to 382a33986b.
This commit is contained in:
Jordan Liggitt
2021-05-18 09:12:04 -04:00
parent 7ccd90e7d7
commit 4b45d0d921
336 changed files with 5393 additions and 17166 deletions

View File

@@ -7,44 +7,37 @@ import (
)
type Manager interface {
// Apply creates a cgroup, if not yet created, and adds a process
// with the specified pid into that cgroup. A special value of -1
// can be used to merely create a cgroup.
// Applies cgroup configuration to the process with the specified pid
Apply(pid int) error
// GetPids returns the PIDs of all processes inside the cgroup.
// Returns the PIDs inside the cgroup set
GetPids() ([]int, error)
// GetAllPids returns the PIDs of all processes inside the cgroup
// and all its sub-cgroups.
// Returns the PIDs inside the cgroup set & all sub-cgroups
GetAllPids() ([]int, error)
// GetStats returns cgroups statistics.
// Returns statistics for the cgroup set
GetStats() (*Stats, error)
// Freeze sets the freezer cgroup to the specified state.
// Toggles the freezer cgroup according to the specified state
Freeze(state configs.FreezerState) error
// Destroy removes cgroup.
// Destroys the cgroup set
Destroy() error
// Path returns a cgroup path to the specified controller/subsystem.
// For cgroupv2, the argument is unused and can be empty.
Path(string) string
// Set sets cgroup resources parameters/limits. If the argument is nil,
// the resources specified during Manager creation (or the previous call
// to Set) are used.
Set(r *configs.Resources) error
// Sets the cgroup as configured.
Set(container *configs.Config) error
// GetPaths returns cgroup path(s) to save in a state file in order to
// restore later.
// GetPaths returns cgroup path(s) to save in a state file in order to restore later.
//
// For cgroup v1, a key is cgroup subsystem name, and the value is the
// path to the cgroup for this subsystem.
// For cgroup v1, a key is cgroup subsystem name, and the value is the path
// to the cgroup for this subsystem.
//
// For cgroup v2 unified hierarchy, a key is "", and the value is the
// unified path.
// For cgroup v2 unified hierarchy, a key is "", and the value is the unified path.
GetPaths() map[string]string
// GetCgroups returns the cgroup data as configured.
@@ -53,9 +46,6 @@ type Manager interface {
// GetFreezerState retrieves the current FreezerState of the cgroup.
GetFreezerState() (configs.FreezerState, error)
// Exists returns whether the cgroup path exists or not.
// Whether the cgroup path exists or not
Exists() bool
// OOMKillCount reports OOM kill count for the cgroup.
OOMKillCount() (uint64, error)
}

View File

@@ -127,10 +127,10 @@ func (p *program) appendDevice(dev *devices.Rule) error {
}
if hasAccess {
p.insts = append(p.insts,
// if (R3 & bpfAccess != R3 /* use R1 as a temp var */) goto next
// if (R3 & bpfAccess == 0 /* use R1 as a temp var */) goto next
asm.Mov.Reg32(asm.R1, asm.R3),
asm.And.Imm32(asm.R1, bpfAccess),
asm.JNE.Reg(asm.R1, asm.R3, nextBlockSym),
asm.JEq.Imm(asm.R1, 0, nextBlockSym),
)
}
if hasMajor {

View File

@@ -3,7 +3,6 @@ package ebpf
import (
"github.com/cilium/ebpf"
"github.com/cilium/ebpf/asm"
"github.com/cilium/ebpf/link"
"github.com/pkg/errors"
"golang.org/x/sys/unix"
)
@@ -33,23 +32,12 @@ func LoadAttachCgroupDeviceFilter(insts asm.Instructions, license string, dirFD
if err != nil {
return nilCloser, err
}
err = link.RawAttachProgram(link.RawAttachProgramOptions{
Target: dirFD,
Program: prog,
Attach: ebpf.AttachCGroupDevice,
Flags: unix.BPF_F_ALLOW_MULTI,
})
if err != nil {
if err := prog.Attach(dirFD, ebpf.AttachCGroupDevice, unix.BPF_F_ALLOW_MULTI); err != nil {
return nilCloser, errors.Wrap(err, "failed to call BPF_PROG_ATTACH (BPF_CGROUP_DEVICE, BPF_F_ALLOW_MULTI)")
}
closer := func() error {
err = link.RawDetachProgram(link.RawDetachProgramOptions{
Target: dirFD,
Program: prog,
Attach: ebpf.AttachCGroupDevice,
})
if err != nil {
return errors.Wrap(err, "failed to call BPF_PROG_DETACH (BPF_CGROUP_DEVICE)")
if err := prog.Detach(dirFD, ebpf.AttachCGroupDevice, unix.BPF_F_ALLOW_MULTI); err != nil {
return errors.Wrap(err, "failed to call BPF_PROG_DETACH (BPF_CGROUP_DEVICE, BPF_F_ALLOW_MULTI)")
}
return nil
}

View File

@@ -25,19 +25,19 @@ func (s *BlkioGroup) Apply(path string, d *cgroupData) error {
return join(path, d.pid)
}
func (s *BlkioGroup) Set(path string, r *configs.Resources) error {
if r.BlkioWeight != 0 {
if err := fscommon.WriteFile(path, "blkio.weight", strconv.FormatUint(uint64(r.BlkioWeight), 10)); err != nil {
func (s *BlkioGroup) Set(path string, cgroup *configs.Cgroup) error {
if cgroup.Resources.BlkioWeight != 0 {
if err := fscommon.WriteFile(path, "blkio.weight", strconv.FormatUint(uint64(cgroup.Resources.BlkioWeight), 10)); err != nil {
return err
}
}
if r.BlkioLeafWeight != 0 {
if err := fscommon.WriteFile(path, "blkio.leaf_weight", strconv.FormatUint(uint64(r.BlkioLeafWeight), 10)); err != nil {
if cgroup.Resources.BlkioLeafWeight != 0 {
if err := fscommon.WriteFile(path, "blkio.leaf_weight", strconv.FormatUint(uint64(cgroup.Resources.BlkioLeafWeight), 10)); err != nil {
return err
}
}
for _, wd := range r.BlkioWeightDevice {
for _, wd := range cgroup.Resources.BlkioWeightDevice {
if err := fscommon.WriteFile(path, "blkio.weight_device", wd.WeightString()); err != nil {
return err
}
@@ -45,22 +45,22 @@ func (s *BlkioGroup) Set(path string, r *configs.Resources) error {
return err
}
}
for _, td := range r.BlkioThrottleReadBpsDevice {
for _, td := range cgroup.Resources.BlkioThrottleReadBpsDevice {
if err := fscommon.WriteFile(path, "blkio.throttle.read_bps_device", td.String()); err != nil {
return err
}
}
for _, td := range r.BlkioThrottleWriteBpsDevice {
for _, td := range cgroup.Resources.BlkioThrottleWriteBpsDevice {
if err := fscommon.WriteFile(path, "blkio.throttle.write_bps_device", td.String()); err != nil {
return err
}
}
for _, td := range r.BlkioThrottleReadIOPSDevice {
for _, td := range cgroup.Resources.BlkioThrottleReadIOPSDevice {
if err := fscommon.WriteFile(path, "blkio.throttle.read_iops_device", td.String()); err != nil {
return err
}
}
for _, td := range r.BlkioThrottleWriteIOPSDevice {
for _, td := range cgroup.Resources.BlkioThrottleWriteIOPSDevice {
if err := fscommon.WriteFile(path, "blkio.throttle.write_iops_device", td.String()); err != nil {
return err
}

View File

@@ -32,7 +32,7 @@ func (s *CpuGroup) Apply(path string, d *cgroupData) error {
// We should set the real-Time group scheduling settings before moving
// in the process because if the process is already in SCHED_RR mode
// and no RT bandwidth is set, adding it will fail.
if err := s.SetRtSched(path, d.config.Resources); err != nil {
if err := s.SetRtSched(path, d.config); err != nil {
return err
}
// Since we are not using join(), we need to place the pid
@@ -40,23 +40,23 @@ func (s *CpuGroup) Apply(path string, d *cgroupData) error {
return cgroups.WriteCgroupProc(path, d.pid)
}
func (s *CpuGroup) SetRtSched(path string, r *configs.Resources) error {
if r.CpuRtPeriod != 0 {
if err := fscommon.WriteFile(path, "cpu.rt_period_us", strconv.FormatUint(r.CpuRtPeriod, 10)); err != nil {
func (s *CpuGroup) SetRtSched(path string, cgroup *configs.Cgroup) error {
if cgroup.Resources.CpuRtPeriod != 0 {
if err := fscommon.WriteFile(path, "cpu.rt_period_us", strconv.FormatUint(cgroup.Resources.CpuRtPeriod, 10)); err != nil {
return err
}
}
if r.CpuRtRuntime != 0 {
if err := fscommon.WriteFile(path, "cpu.rt_runtime_us", strconv.FormatInt(r.CpuRtRuntime, 10)); err != nil {
if cgroup.Resources.CpuRtRuntime != 0 {
if err := fscommon.WriteFile(path, "cpu.rt_runtime_us", strconv.FormatInt(cgroup.Resources.CpuRtRuntime, 10)); err != nil {
return err
}
}
return nil
}
func (s *CpuGroup) Set(path string, r *configs.Resources) error {
if r.CpuShares != 0 {
shares := r.CpuShares
func (s *CpuGroup) Set(path string, cgroup *configs.Cgroup) error {
if cgroup.Resources.CpuShares != 0 {
shares := cgroup.Resources.CpuShares
if err := fscommon.WriteFile(path, "cpu.shares", strconv.FormatUint(shares, 10)); err != nil {
return err
}
@@ -72,17 +72,17 @@ func (s *CpuGroup) Set(path string, r *configs.Resources) error {
return fmt.Errorf("the minimum allowed cpu-shares is %d", sharesRead)
}
}
if r.CpuPeriod != 0 {
if err := fscommon.WriteFile(path, "cpu.cfs_period_us", strconv.FormatUint(r.CpuPeriod, 10)); err != nil {
if cgroup.Resources.CpuPeriod != 0 {
if err := fscommon.WriteFile(path, "cpu.cfs_period_us", strconv.FormatUint(cgroup.Resources.CpuPeriod, 10)); err != nil {
return err
}
}
if r.CpuQuota != 0 {
if err := fscommon.WriteFile(path, "cpu.cfs_quota_us", strconv.FormatInt(r.CpuQuota, 10)); err != nil {
if cgroup.Resources.CpuQuota != 0 {
if err := fscommon.WriteFile(path, "cpu.cfs_quota_us", strconv.FormatInt(cgroup.Resources.CpuQuota, 10)); err != nil {
return err
}
}
return s.SetRtSched(path, r)
return s.SetRtSched(path, cgroup)
}
func (s *CpuGroup) GetStats(path string, stats *cgroups.Stats) error {
@@ -97,7 +97,7 @@ func (s *CpuGroup) GetStats(path string, stats *cgroups.Stats) error {
sc := bufio.NewScanner(f)
for sc.Scan() {
t, v, err := fscommon.ParseKeyValue(sc.Text())
t, v, err := fscommon.GetCgroupParamKeyValue(sc.Text())
if err != nil {
return err
}

View File

@@ -43,7 +43,7 @@ func (s *CpuacctGroup) Apply(path string, d *cgroupData) error {
return join(path, d.pid)
}
func (s *CpuacctGroup) Set(_ string, _ *configs.Resources) error {
func (s *CpuacctGroup) Set(path string, cgroup *configs.Cgroup) error {
return nil
}

View File

@@ -24,17 +24,17 @@ func (s *CpusetGroup) Name() string {
}
func (s *CpusetGroup) Apply(path string, d *cgroupData) error {
return s.ApplyDir(path, d.config.Resources, d.pid)
return s.ApplyDir(path, d.config, d.pid)
}
func (s *CpusetGroup) Set(path string, r *configs.Resources) error {
if r.CpusetCpus != "" {
if err := fscommon.WriteFile(path, "cpuset.cpus", r.CpusetCpus); err != nil {
func (s *CpusetGroup) Set(path string, cgroup *configs.Cgroup) error {
if cgroup.Resources.CpusetCpus != "" {
if err := fscommon.WriteFile(path, "cpuset.cpus", cgroup.Resources.CpusetCpus); err != nil {
return err
}
}
if r.CpusetMems != "" {
if err := fscommon.WriteFile(path, "cpuset.mems", r.CpusetMems); err != nil {
if cgroup.Resources.CpusetMems != "" {
if err := fscommon.WriteFile(path, "cpuset.mems", cgroup.Resources.CpusetMems); err != nil {
return err
}
}
@@ -144,7 +144,7 @@ func (s *CpusetGroup) GetStats(path string, stats *cgroups.Stats) error {
return nil
}
func (s *CpusetGroup) ApplyDir(dir string, r *configs.Resources, pid int) error {
func (s *CpusetGroup) ApplyDir(dir string, cgroup *configs.Cgroup, pid int) error {
// This might happen if we have no cpuset cgroup mounted.
// Just do nothing and don't fail.
if dir == "" {
@@ -166,7 +166,7 @@ func (s *CpusetGroup) ApplyDir(dir string, r *configs.Resources, pid int) error
// specified configs, otherwise, inherit from parent. This makes
// cpuset configs work correctly with 'cpuset.cpu_exclusive', and
// keep backward compatibility.
if err := s.ensureCpusAndMems(dir, r); err != nil {
if err := s.ensureCpusAndMems(dir, cgroup); err != nil {
return err
}
@@ -241,8 +241,8 @@ func isEmptyCpuset(str string) bool {
return str == "" || str == "\n"
}
func (s *CpusetGroup) ensureCpusAndMems(path string, r *configs.Resources) error {
if err := s.Set(path, r); err != nil {
func (s *CpusetGroup) ensureCpusAndMems(path string, cgroup *configs.Cgroup) error {
if err := s.Set(path, cgroup); err != nil {
return err
}
return cpusetCopyIfNeeded(path, filepath.Dir(path))

View File

@@ -12,7 +12,7 @@ import (
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/devices"
"github.com/opencontainers/runc/libcontainer/userns"
"github.com/opencontainers/runc/libcontainer/system"
)
type DevicesGroup struct {
@@ -54,8 +54,8 @@ func buildEmulator(rules []*devices.Rule) (*cgroupdevices.Emulator, error) {
return emu, nil
}
func (s *DevicesGroup) Set(path string, r *configs.Resources) error {
if userns.RunningInUserNS() || r.SkipDevices {
func (s *DevicesGroup) Set(path string, cgroup *configs.Cgroup) error {
if system.RunningInUserNS() || cgroup.SkipDevices {
return nil
}
@@ -65,7 +65,7 @@ func (s *DevicesGroup) Set(path string, r *configs.Resources) error {
if err != nil {
return err
}
target, err := buildEmulator(r.Devices)
target, err := buildEmulator(cgroup.Resources.Devices)
if err != nil {
return err
}

View File

@@ -12,7 +12,6 @@ import (
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
)
@@ -27,62 +26,29 @@ func (s *FreezerGroup) Apply(path string, d *cgroupData) error {
return join(path, d.pid)
}
func (s *FreezerGroup) Set(path string, r *configs.Resources) (Err error) {
switch r.Freezer {
func (s *FreezerGroup) Set(path string, cgroup *configs.Cgroup) error {
switch cgroup.Resources.Freezer {
case configs.Frozen:
defer func() {
if Err != nil {
// Freezing failed, and it is bad and dangerous
// to leave the cgroup in FROZEN or FREEZING
// state, so (try to) thaw it back.
_ = fscommon.WriteFile(path, "freezer.state", string(configs.Thawed))
}
}()
// As per older kernel docs (freezer-subsystem.txt before
// kernel commit ef9fe980c6fcc1821), if FREEZING is seen,
// userspace should either retry or thaw. While current
// kernel cgroup v1 docs no longer mention a need to retry,
// even a recent kernel (v5.4, Ubuntu 20.04) can't reliably
// freeze a cgroup v1 while new processes keep appearing in it
// the kernel (tested on v5.4, Ubuntu 20.04) can't reliably
// freeze a cgroup while new processes keep appearing in it
// (either via fork/clone or by writing new PIDs to
// cgroup.procs).
//
// The numbers below are empirically chosen to have a decent
// chance to succeed in various scenarios ("runc pause/unpause
// with parallel runc exec" and "bare freeze/unfreeze on a very
// slow system"), tested on RHEL7 and Ubuntu 20.04 kernels.
// The number of retries below is chosen to have a decent
// chance to succeed even in the worst case scenario (runc
// pause/unpause with parallel runc exec).
//
// Adding any amount of sleep in between retries did not
// increase the chances of successful freeze in "pause/unpause
// with parallel exec" reproducer. OTOH, adding an occasional
// sleep helped for the case where the system is extremely slow
// (CentOS 7 VM on GHA CI).
//
// Alas, this is still a game of chance, since the real fix
// belongs in the kernel (cgroup v2 does not have this bug).
// increase the chances of successful freeze.
for i := 0; i < 1000; i++ {
if i%50 == 49 {
// Occasional thaw and sleep improves
// the chances to succeed in freezing
// in case new processes keep appearing
// in the cgroup.
_ = fscommon.WriteFile(path, "freezer.state", string(configs.Thawed))
time.Sleep(10 * time.Millisecond)
}
if err := fscommon.WriteFile(path, "freezer.state", string(configs.Frozen)); err != nil {
return err
}
if i%25 == 24 {
// Occasional short sleep before reading
// the state back also improves the chances to
// succeed in freezing in case of a very slow
// system.
time.Sleep(10 * time.Microsecond)
}
state, err := fscommon.ReadFile(path, "freezer.state")
if err != nil {
return err
@@ -92,9 +58,6 @@ func (s *FreezerGroup) Set(path string, r *configs.Resources) (Err error) {
case "FREEZING":
continue
case string(configs.Frozen):
if i > 1 {
logrus.Debugf("frozen after %d retries", i)
}
return nil
default:
// should never happen
@@ -102,13 +65,16 @@ func (s *FreezerGroup) Set(path string, r *configs.Resources) (Err error) {
}
}
// Despite our best efforts, it got stuck in FREEZING.
// Leaving it in this state is bad and dangerous, so
// let's (try to) thaw it back and error out.
_ = fscommon.WriteFile(path, "freezer.state", string(configs.Thawed))
return errors.New("unable to freeze")
case configs.Thawed:
return fscommon.WriteFile(path, "freezer.state", string(configs.Thawed))
case configs.Undefined:
return nil
default:
return fmt.Errorf("Invalid argument '%s' to freezer.state", string(r.Freezer))
return fmt.Errorf("Invalid argument '%s' to freezer.state", string(cgroup.Resources.Freezer))
}
}

View File

@@ -9,7 +9,6 @@ import (
"sync"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"github.com/opencontainers/runc/libcontainer/configs"
libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils"
"github.com/pkg/errors"
@@ -44,8 +43,8 @@ type subsystem interface {
GetStats(path string, stats *cgroups.Stats) error
// Creates and joins the cgroup represented by 'cgroupData'.
Apply(path string, c *cgroupData) error
// Set sets the cgroup resources.
Set(path string, r *configs.Resources) error
// Set the cgroup represented by cgroup.
Set(path string, cgroup *configs.Cgroup) error
}
type manager struct {
@@ -274,8 +273,8 @@ func (m *manager) GetStats() (*cgroups.Stats, error) {
return stats, nil
}
func (m *manager) Set(r *configs.Resources) error {
if r == nil {
func (m *manager) Set(container *configs.Config) error {
if container.Cgroups == nil {
return nil
}
@@ -284,7 +283,7 @@ func (m *manager) Set(r *configs.Resources) error {
if m.cgroups != nil && m.cgroups.Paths != nil {
return nil
}
if r.Unified != nil {
if container.Cgroups.Resources.Unified != nil {
return cgroups.ErrV1NoUnified
}
@@ -292,11 +291,11 @@ func (m *manager) Set(r *configs.Resources) error {
defer m.mu.Unlock()
for _, sys := range subsystems {
path := m.paths[sys.Name()]
if err := sys.Set(path, r); err != nil {
if err := sys.Set(path, container.Cgroups); err != nil {
if m.rootless && sys.Name() == "devices" {
continue
}
// When m.rootless is true, errors from the device subsystem are ignored because it is really not expected to work.
// When m.Rootless is true, errors from the device subsystem are ignored because it is really not expected to work.
// However, errors from other subsystems are not ignored.
// see @test "runc create (rootless + limits + no cgrouppath + no permission) fails with informative error"
if path == "" {
@@ -322,7 +321,7 @@ func (m *manager) Freeze(state configs.FreezerState) error {
prevState := m.cgroups.Resources.Freezer
m.cgroups.Resources.Freezer = state
freezer := &FreezerGroup{}
if err := freezer.Set(path, m.cgroups.Resources); err != nil {
if err := freezer.Set(path, m.cgroups); err != nil {
m.cgroups.Resources.Freezer = prevState
return err
}
@@ -422,17 +421,3 @@ func (m *manager) GetFreezerState() (configs.FreezerState, error) {
func (m *manager) Exists() bool {
return cgroups.PathExists(m.Path("devices"))
}
func OOMKillCount(path string) (uint64, error) {
return fscommon.GetValueByKey(path, "memory.oom_control", "oom_kill")
}
func (m *manager) OOMKillCount() (uint64, error) {
c, err := OOMKillCount(m.Path("memory"))
// Ignore ENOENT when rootless as it couldn't create cgroup.
if err != nil && m.rootless && os.IsNotExist(err) {
err = nil
}
return c, err
}

View File

@@ -22,8 +22,8 @@ func (s *HugetlbGroup) Apply(path string, d *cgroupData) error {
return join(path, d.pid)
}
func (s *HugetlbGroup) Set(path string, r *configs.Resources) error {
for _, hugetlb := range r.HugetlbLimit {
func (s *HugetlbGroup) Set(path string, cgroup *configs.Cgroup) error {
for _, hugetlb := range cgroup.Resources.HugetlbLimit {
if err := fscommon.WriteFile(path, "hugetlb."+hugetlb.Pagesize+".limit_in_bytes", strconv.FormatUint(hugetlb.Limit, 10)); err != nil {
return err
}

View File

@@ -0,0 +1,56 @@
// +build linux,!nokmem
package fs
import (
"errors"
"fmt"
"path/filepath"
"strconv"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"golang.org/x/sys/unix"
)
const cgroupKernelMemoryLimit = "memory.kmem.limit_in_bytes"
// EnableKernelMemoryAccounting switches on kernel memory accounting for the
// cgroup directory at path by setting a kmem limit of 1 and then a limit of
// -1 (which appears to reset it to unlimited).
//
// It returns nil without doing anything when the kmem limit file is absent
// under path, and otherwise returns the first error reported by
// setKernelMemory.
func EnableKernelMemoryAccounting(path string) error {
	// This function is invoked automatically for all memory limits, so a
	// kernel built without kmem support is silently tolerated rather than
	// treated as an error.
	if !cgroups.PathExists(filepath.Join(path, cgroupKernelMemoryLimit)) {
		return nil
	}

	// Kernel memory is not accounted at all until a limit has been set on
	// the cgroup, and a limit can no longer be set once the cgroup has
	// children or already contains tasks. So set a limit right away...
	if err := setKernelMemory(path, 1); err != nil {
		return err
	}
	// ...and then replace it with -1.
	return setKernelMemory(path, -1)
}
// setKernelMemory writes kernelMemoryLimit (formatted as a base-10 integer)
// to the kmem limit file of the cgroup directory at path.
//
// It returns an error when path is empty, when the limit file does not exist
// under path, or when the write fails. A write failing with EBUSY is reported
// with a more descriptive message; any other write error is returned as is.
func setKernelMemory(path string, kernelMemoryLimit int64) error {
	if path == "" {
		return fmt.Errorf("no such directory for %s", cgroupKernelMemoryLimit)
	}

	limitFile := filepath.Join(path, cgroupKernelMemoryLimit)
	if !cgroups.PathExists(limitFile) {
		// The caller explicitly asked for a kmem limit, so a kernel
		// without support for it *must* result in an error.
		return errors.New("kernel memory accounting not supported by this kernel")
	}

	value := strconv.FormatInt(kernelMemoryLimit, 10)
	err := fscommon.WriteFile(path, cgroupKernelMemoryLimit, value)
	switch {
	case err == nil:
		return nil
	case errors.Is(err, unix.EBUSY):
		// EBUSY is the errno returned on attempts to write to the kmem
		// limit file when the cgroup has children or once tasks have
		// been attached to it.
		return fmt.Errorf("failed to set %s, because either tasks have already joined this cgroup or it has children", cgroupKernelMemoryLimit)
	default:
		return err
	}
}

View File

@@ -0,0 +1,15 @@
// +build linux,nokmem
package fs
import (
"errors"
)
// EnableKernelMemoryAccounting is the variant compiled when the nokmem build
// tag is set: it ignores path, performs no work, and always returns nil.
func EnableKernelMemoryAccounting(path string) error {
return nil
}
// setKernelMemory is the variant compiled when the nokmem build tag is set:
// it ignores both arguments and always returns an error stating that kernel
// memory accounting is disabled in this build.
func setKernelMemory(path string, kernelMemoryLimit int64) error {
return errors.New("kernel memory accounting disabled in this runc build")
}

View File

@@ -14,15 +14,11 @@ import (
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/pkg/errors"
"golang.org/x/sys/unix"
)
const (
cgroupMemorySwapLimit = "memory.memsw.limit_in_bytes"
cgroupMemoryLimit = "memory.limit_in_bytes"
cgroupMemoryUsage = "memory.usage_in_bytes"
cgroupMemoryMaxUsage = "memory.max_usage_in_bytes"
)
type MemoryGroup struct {
@@ -33,55 +29,48 @@ func (s *MemoryGroup) Name() string {
}
func (s *MemoryGroup) Apply(path string, d *cgroupData) (err error) {
if path == "" {
return nil
}
if memoryAssigned(d.config) {
if _, err := os.Stat(path); os.IsNotExist(err) {
if err := os.MkdirAll(path, 0755); err != nil {
return err
}
// Only enable kernel memory accounting when this cgroup
// is created by libcontainer; otherwise we might get an
// error when people use `cgroupsPath` to join an existing
// cgroup whose kernel memory is not initialized.
if err := EnableKernelMemoryAccounting(path); err != nil {
return err
}
}
}
defer func() {
if err != nil {
os.RemoveAll(path)
}
}()
// We need to join memory cgroup after set memory limits, because
// kmem.limit_in_bytes can only be set when the cgroup is empty.
return join(path, d.pid)
}
func setMemory(path string, val int64) error {
if val == 0 {
return nil
}
err := fscommon.WriteFile(path, cgroupMemoryLimit, strconv.FormatInt(val, 10))
if !errors.Is(err, unix.EBUSY) {
return err
}
// EBUSY means the kernel can't set new limit as it's too low
// (lower than the current usage). Return more specific error.
usage, err := fscommon.GetCgroupParamUint(path, cgroupMemoryUsage)
if err != nil {
return err
}
max, err := fscommon.GetCgroupParamUint(path, cgroupMemoryMaxUsage)
if err != nil {
return err
}
return errors.Errorf("unable to set memory limit to %d (current usage: %d, peak usage: %d)", val, usage, max)
}
func setSwap(path string, val int64) error {
if val == 0 {
return nil
}
return fscommon.WriteFile(path, cgroupMemorySwapLimit, strconv.FormatInt(val, 10))
}
func setMemoryAndSwap(path string, r *configs.Resources) error {
func setMemoryAndSwap(path string, cgroup *configs.Cgroup) error {
// If the memory update is set to -1 and the swap is not explicitly
// set, we should also set swap to -1, it means unlimited memory.
if r.Memory == -1 && r.MemorySwap == 0 {
if cgroup.Resources.Memory == -1 && cgroup.Resources.MemorySwap == 0 {
// Only set swap if it's enabled in kernel
if cgroups.PathExists(filepath.Join(path, cgroupMemorySwapLimit)) {
r.MemorySwap = -1
cgroup.Resources.MemorySwap = -1
}
}
// When memory and swap memory are both set, we need to handle the cases
// for updating container.
if r.Memory != 0 && r.MemorySwap != 0 {
curLimit, err := fscommon.GetCgroupParamUint(path, cgroupMemoryLimit)
if cgroup.Resources.Memory != 0 && cgroup.Resources.MemorySwap != 0 {
memoryUsage, err := getMemoryData(path, "")
if err != nil {
return err
}
@@ -89,53 +78,72 @@ func setMemoryAndSwap(path string, r *configs.Resources) error {
// When update memory limit, we should adapt the write sequence
// for memory and swap memory, so it won't fail because the new
// value and the old value don't fit kernel's validation.
if r.MemorySwap == -1 || curLimit < uint64(r.MemorySwap) {
if err := setSwap(path, r.MemorySwap); err != nil {
if cgroup.Resources.MemorySwap == -1 || memoryUsage.Limit < uint64(cgroup.Resources.MemorySwap) {
if err := fscommon.WriteFile(path, cgroupMemorySwapLimit, strconv.FormatInt(cgroup.Resources.MemorySwap, 10)); err != nil {
return err
}
if err := setMemory(path, r.Memory); err != nil {
if err := fscommon.WriteFile(path, cgroupMemoryLimit, strconv.FormatInt(cgroup.Resources.Memory, 10)); err != nil {
return err
}
} else {
if err := fscommon.WriteFile(path, cgroupMemoryLimit, strconv.FormatInt(cgroup.Resources.Memory, 10)); err != nil {
return err
}
if err := fscommon.WriteFile(path, cgroupMemorySwapLimit, strconv.FormatInt(cgroup.Resources.MemorySwap, 10)); err != nil {
return err
}
}
} else {
if cgroup.Resources.Memory != 0 {
if err := fscommon.WriteFile(path, cgroupMemoryLimit, strconv.FormatInt(cgroup.Resources.Memory, 10)); err != nil {
return err
}
}
if cgroup.Resources.MemorySwap != 0 {
if err := fscommon.WriteFile(path, cgroupMemorySwapLimit, strconv.FormatInt(cgroup.Resources.MemorySwap, 10)); err != nil {
return err
}
return nil
}
}
if err := setMemory(path, r.Memory); err != nil {
return err
}
if err := setSwap(path, r.MemorySwap); err != nil {
return err
}
return nil
}
func (s *MemoryGroup) Set(path string, r *configs.Resources) error {
if err := setMemoryAndSwap(path, r); err != nil {
func (s *MemoryGroup) Set(path string, cgroup *configs.Cgroup) error {
if err := setMemoryAndSwap(path, cgroup); err != nil {
return err
}
// ignore KernelMemory and KernelMemoryTCP
if r.MemoryReservation != 0 {
if err := fscommon.WriteFile(path, "memory.soft_limit_in_bytes", strconv.FormatInt(r.MemoryReservation, 10)); err != nil {
if cgroup.Resources.KernelMemory != 0 {
if err := setKernelMemory(path, cgroup.Resources.KernelMemory); err != nil {
return err
}
}
if r.OomKillDisable {
if cgroup.Resources.MemoryReservation != 0 {
if err := fscommon.WriteFile(path, "memory.soft_limit_in_bytes", strconv.FormatInt(cgroup.Resources.MemoryReservation, 10)); err != nil {
return err
}
}
if cgroup.Resources.KernelMemoryTCP != 0 {
if err := fscommon.WriteFile(path, "memory.kmem.tcp.limit_in_bytes", strconv.FormatInt(cgroup.Resources.KernelMemoryTCP, 10)); err != nil {
return err
}
}
if cgroup.Resources.OomKillDisable {
if err := fscommon.WriteFile(path, "memory.oom_control", "1"); err != nil {
return err
}
}
if r.MemorySwappiness == nil || int64(*r.MemorySwappiness) == -1 {
if cgroup.Resources.MemorySwappiness == nil || int64(*cgroup.Resources.MemorySwappiness) == -1 {
return nil
} else if *r.MemorySwappiness <= 100 {
if err := fscommon.WriteFile(path, "memory.swappiness", strconv.FormatUint(*r.MemorySwappiness, 10)); err != nil {
} else if *cgroup.Resources.MemorySwappiness <= 100 {
if err := fscommon.WriteFile(path, "memory.swappiness", strconv.FormatUint(*cgroup.Resources.MemorySwappiness, 10)); err != nil {
return err
}
} else {
return fmt.Errorf("invalid value:%d. valid memory swappiness range is 0-100", *r.MemorySwappiness)
return fmt.Errorf("invalid value:%d. valid memory swappiness range is 0-100", *cgroup.Resources.MemorySwappiness)
}
return nil
@@ -154,7 +162,7 @@ func (s *MemoryGroup) GetStats(path string, stats *cgroups.Stats) error {
sc := bufio.NewScanner(statsFile)
for sc.Scan() {
t, v, err := fscommon.ParseKeyValue(sc.Text())
t, v, err := fscommon.GetCgroupParamKeyValue(sc.Text())
if err != nil {
return fmt.Errorf("failed to parse memory.stat (%q) - %v", sc.Text(), err)
}
@@ -204,6 +212,8 @@ func memoryAssigned(cgroup *configs.Cgroup) bool {
return cgroup.Resources.Memory != 0 ||
cgroup.Resources.MemoryReservation != 0 ||
cgroup.Resources.MemorySwap > 0 ||
cgroup.Resources.KernelMemory > 0 ||
cgroup.Resources.KernelMemoryTCP > 0 ||
cgroup.Resources.OomKillDisable ||
(cgroup.Resources.MemorySwappiness != nil && int64(*cgroup.Resources.MemorySwappiness) != -1)
}
@@ -224,9 +234,7 @@ func getMemoryData(path, name string) (cgroups.MemoryData, error) {
value, err := fscommon.GetCgroupParamUint(path, usage)
if err != nil {
if name != "" && os.IsNotExist(err) {
// Ignore ENOENT as swap and kmem controllers
// are optional in the kernel.
if moduleName != "memory" && os.IsNotExist(err) {
return cgroups.MemoryData{}, nil
}
return cgroups.MemoryData{}, fmt.Errorf("failed to parse %s - %v", usage, err)
@@ -234,16 +242,25 @@ func getMemoryData(path, name string) (cgroups.MemoryData, error) {
memoryData.Usage = value
value, err = fscommon.GetCgroupParamUint(path, maxUsage)
if err != nil {
if moduleName != "memory" && os.IsNotExist(err) {
return cgroups.MemoryData{}, nil
}
return cgroups.MemoryData{}, fmt.Errorf("failed to parse %s - %v", maxUsage, err)
}
memoryData.MaxUsage = value
value, err = fscommon.GetCgroupParamUint(path, failcnt)
if err != nil {
if moduleName != "memory" && os.IsNotExist(err) {
return cgroups.MemoryData{}, nil
}
return cgroups.MemoryData{}, fmt.Errorf("failed to parse %s - %v", failcnt, err)
}
memoryData.Failcnt = value
value, err = fscommon.GetCgroupParamUint(path, limit)
if err != nil {
if moduleName != "memory" && os.IsNotExist(err) {
return cgroups.MemoryData{}, nil
}
return cgroups.MemoryData{}, fmt.Errorf("failed to parse %s - %v", limit, err)
}
memoryData.Limit = value

View File

@@ -24,7 +24,7 @@ func (s *NameGroup) Apply(path string, d *cgroupData) error {
return nil
}
func (s *NameGroup) Set(_ string, _ *configs.Resources) error {
func (s *NameGroup) Set(path string, cgroup *configs.Cgroup) error {
return nil
}

View File

@@ -21,9 +21,9 @@ func (s *NetClsGroup) Apply(path string, d *cgroupData) error {
return join(path, d.pid)
}
func (s *NetClsGroup) Set(path string, r *configs.Resources) error {
if r.NetClsClassid != 0 {
if err := fscommon.WriteFile(path, "net_cls.classid", strconv.FormatUint(uint64(r.NetClsClassid), 10)); err != nil {
func (s *NetClsGroup) Set(path string, cgroup *configs.Cgroup) error {
if cgroup.Resources.NetClsClassid != 0 {
if err := fscommon.WriteFile(path, "net_cls.classid", strconv.FormatUint(uint64(cgroup.Resources.NetClsClassid), 10)); err != nil {
return err
}
}

View File

@@ -19,8 +19,8 @@ func (s *NetPrioGroup) Apply(path string, d *cgroupData) error {
return join(path, d.pid)
}
func (s *NetPrioGroup) Set(path string, r *configs.Resources) error {
for _, prioMap := range r.NetPrioIfpriomap {
func (s *NetPrioGroup) Set(path string, cgroup *configs.Cgroup) error {
for _, prioMap := range cgroup.Resources.NetPrioIfpriomap {
if err := fscommon.WriteFile(path, "net_prio.ifpriomap", prioMap.CgroupString()); err != nil {
return err
}

View File

@@ -18,7 +18,7 @@ func (s *PerfEventGroup) Apply(path string, d *cgroupData) error {
return join(path, d.pid)
}
func (s *PerfEventGroup) Set(_ string, _ *configs.Resources) error {
func (s *PerfEventGroup) Set(path string, cgroup *configs.Cgroup) error {
return nil
}

View File

@@ -23,13 +23,13 @@ func (s *PidsGroup) Apply(path string, d *cgroupData) error {
return join(path, d.pid)
}
func (s *PidsGroup) Set(path string, r *configs.Resources) error {
if r.PidsLimit != 0 {
func (s *PidsGroup) Set(path string, cgroup *configs.Cgroup) error {
if cgroup.Resources.PidsLimit != 0 {
// "max" is the fallback value.
limit := "max"
if r.PidsLimit > 0 {
limit = strconv.FormatInt(r.PidsLimit, 10)
if cgroup.Resources.PidsLimit > 0 {
limit = strconv.FormatInt(cgroup.Resources.PidsLimit, 10)
}
if err := fscommon.WriteFile(path, "pids.max", limit); err != nil {

View File

@@ -12,14 +12,15 @@ import (
"github.com/opencontainers/runc/libcontainer/configs"
)
func isCpuSet(r *configs.Resources) bool {
return r.CpuWeight != 0 || r.CpuQuota != 0 || r.CpuPeriod != 0
func isCpuSet(cgroup *configs.Cgroup) bool {
return cgroup.Resources.CpuWeight != 0 || cgroup.Resources.CpuQuota != 0 || cgroup.Resources.CpuPeriod != 0
}
func setCpu(dirPath string, r *configs.Resources) error {
if !isCpuSet(r) {
func setCpu(dirPath string, cgroup *configs.Cgroup) error {
if !isCpuSet(cgroup) {
return nil
}
r := cgroup.Resources
// NOTE: .CpuShares is not used here. Conversion is the caller's responsibility.
if r.CpuWeight != 0 {
@@ -56,7 +57,7 @@ func statCpu(dirPath string, stats *cgroups.Stats) error {
sc := bufio.NewScanner(f)
for sc.Scan() {
t, v, err := fscommon.ParseKeyValue(sc.Text())
t, v, err := fscommon.GetCgroupParamKeyValue(sc.Text())
if err != nil {
return err
}
@@ -69,15 +70,6 @@ func statCpu(dirPath string, stats *cgroups.Stats) error {
case "system_usec":
stats.CpuStats.CpuUsage.UsageInKernelmode = v * 1000
case "nr_periods":
stats.CpuStats.ThrottlingData.Periods = v
case "nr_throttled":
stats.CpuStats.ThrottlingData.ThrottledPeriods = v
case "throttled_usec":
stats.CpuStats.ThrottlingData.ThrottledTime = v * 1000
}
}
return nil

View File

@@ -7,22 +7,22 @@ import (
"github.com/opencontainers/runc/libcontainer/configs"
)
func isCpusetSet(r *configs.Resources) bool {
return r.CpusetCpus != "" || r.CpusetMems != ""
func isCpusetSet(cgroup *configs.Cgroup) bool {
return cgroup.Resources.CpusetCpus != "" || cgroup.Resources.CpusetMems != ""
}
func setCpuset(dirPath string, r *configs.Resources) error {
if !isCpusetSet(r) {
func setCpuset(dirPath string, cgroup *configs.Cgroup) error {
if !isCpusetSet(cgroup) {
return nil
}
if r.CpusetCpus != "" {
if err := fscommon.WriteFile(dirPath, "cpuset.cpus", r.CpusetCpus); err != nil {
if cgroup.Resources.CpusetCpus != "" {
if err := fscommon.WriteFile(dirPath, "cpuset.cpus", cgroup.Resources.CpusetCpus); err != nil {
return err
}
}
if r.CpusetMems != "" {
if err := fscommon.WriteFile(dirPath, "cpuset.mems", r.CpusetMems); err != nil {
if cgroup.Resources.CpusetMems != "" {
if err := fscommon.WriteFile(dirPath, "cpuset.mems", cgroup.Resources.CpusetMems); err != nil {
return err
}
}

View File

@@ -10,7 +10,7 @@ import (
"github.com/opencontainers/runc/libcontainer/configs"
)
func supportedControllers() (string, error) {
func supportedControllers(cgroup *configs.Cgroup) (string, error) {
return fscommon.ReadFile(UnifiedMountpoint, "/cgroup.controllers")
}
@@ -18,13 +18,13 @@ func supportedControllers() (string, error) {
// based on (1) controllers available and (2) resources that are being set.
// We don't check "pseudo" controllers such as
// "freezer" and "devices".
func needAnyControllers(r *configs.Resources) (bool, error) {
if r == nil {
func needAnyControllers(cgroup *configs.Cgroup) (bool, error) {
if cgroup == nil {
return false, nil
}
// list of all available controllers
content, err := supportedControllers()
content, err := supportedControllers(cgroup)
if err != nil {
return false, err
}
@@ -39,22 +39,22 @@ func needAnyControllers(r *configs.Resources) (bool, error) {
return ok
}
if isPidsSet(r) && have("pids") {
if isPidsSet(cgroup) && have("pids") {
return true, nil
}
if isMemorySet(r) && have("memory") {
if isMemorySet(cgroup) && have("memory") {
return true, nil
}
if isIoSet(r) && have("io") {
if isIoSet(cgroup) && have("io") {
return true, nil
}
if isCpuSet(r) && have("cpu") {
if isCpuSet(cgroup) && have("cpu") {
return true, nil
}
if isCpusetSet(r) && have("cpuset") {
if isCpusetSet(cgroup) && have("cpuset") {
return true, nil
}
if isHugeTlbSet(r) && have("hugetlb") {
if isHugeTlbSet(cgroup) && have("hugetlb") {
return true, nil
}
@@ -64,8 +64,8 @@ func needAnyControllers(r *configs.Resources) (bool, error) {
// containsDomainController returns whether the current config contains domain controller or not.
// Refer to: http://man7.org/linux/man-pages/man7/cgroups.7.html
// As at Linux 4.19, the following controllers are threaded: cpu, perf_event, and pids.
func containsDomainController(r *configs.Resources) bool {
return isMemorySet(r) || isIoSet(r) || isCpuSet(r) || isHugeTlbSet(r)
func containsDomainController(cg *configs.Cgroup) bool {
return isMemorySet(cg) || isIoSet(cg) || isCpuSet(cg) || isHugeTlbSet(cg)
}
// CreateCgroupPath creates cgroupv2 path, enabling all the supported controllers.
@@ -74,7 +74,7 @@ func CreateCgroupPath(path string, c *configs.Cgroup) (Err error) {
return fmt.Errorf("invalid cgroup path %s", path)
}
content, err := supportedControllers()
content, err := supportedControllers(c)
if err != nil {
return err
}
@@ -115,7 +115,7 @@ func CreateCgroupPath(path string, c *configs.Cgroup) (Err error) {
// the controllers requested are thread-aware we can simply put the cgroup into
// threaded mode.
case "domain invalid":
if containsDomainController(c.Resources) {
if containsDomainController(c) {
return fmt.Errorf("cannot enter cgroupv2 %q with domain controllers -- it is in an invalid state", current)
} else {
// Not entirely correct (in theory we'd always want to be a domain --
@@ -129,7 +129,7 @@ func CreateCgroupPath(path string, c *configs.Cgroup) (Err error) {
case "domain threaded":
fallthrough
case "threaded":
if containsDomainController(c.Resources) {
if containsDomainController(c) {
return fmt.Errorf("cannot enter cgroupv2 %q with domain controllers -- it is in %s mode", current, cgType)
}
}

View File

@@ -7,8 +7,6 @@ import (
"github.com/opencontainers/runc/libcontainer/cgroups/ebpf/devicefilter"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/devices"
"github.com/opencontainers/runc/libcontainer/userns"
"github.com/pkg/errors"
"golang.org/x/sys/unix"
)
@@ -28,40 +26,26 @@ func isRWM(perms devices.Permissions) bool {
return r && w && m
}
// This is similar to the logic applied in crun for handling errors from bpf(2)
// <https://github.com/containers/crun/blob/0.17/src/libcrun/cgroup.c#L2438-L2470>.
func canSkipEBPFError(r *configs.Resources) bool {
// If we're running in a user namespace we can ignore eBPF rules because we
// usually cannot use bpf(2), as well as rootless containers usually don't
// have the necessary privileges to mknod(2) device inodes or access
// host-level instances (though ideally we would be blocking device access
// for rootless containers anyway).
if userns.RunningInUserNS() {
return true
}
// We cannot ignore an eBPF load error if any rule is a block rule or it
// doesn't permit all access modes.
//
// NOTE: This will sometimes trigger in cases where access modes are split
// between different rules but to handle this correctly would require
// using ".../libcontainer/cgroup/devices".Emulator.
for _, dev := range r.Devices {
if !dev.Allow || !isRWM(dev.Permissions) {
// the logic is from crun
// https://github.com/containers/crun/blob/0.10.2/src/libcrun/cgroup.c#L1644-L1652
func canSkipEBPFError(cgroup *configs.Cgroup) bool {
for _, dev := range cgroup.Resources.Devices {
if dev.Allow || !isRWM(dev.Permissions) {
return false
}
}
return true
}
func setDevices(dirPath string, r *configs.Resources) error {
if r.SkipDevices {
func setDevices(dirPath string, cgroup *configs.Cgroup) error {
if cgroup.SkipDevices {
return nil
}
// XXX: This is currently a white-list (but all callers pass a blacklist of
// devices). This is bad for a whole variety of reasons, but will need
// to be fixed with co-ordinated effort with downstreams.
insts, license, err := devicefilter.DeviceFilter(r.Devices)
devices := cgroup.Devices
insts, license, err := devicefilter.DeviceFilter(devices)
if err != nil {
return err
}
@@ -82,7 +66,7 @@ func setDevices(dirPath string, r *configs.Resources) error {
// programs. You could temporarily insert a deny-everything program
// but that would result in spurious failures during updates.
if _, err := ebpf.LoadAttachCgroupDeviceFilter(insts, license, dirFD); err != nil {
if !canSkipEBPFError(r) {
if !canSkipEBPFError(cgroup) {
return err
}
}

View File

@@ -75,7 +75,7 @@ func (m *manager) Apply(pid int) error {
// - "runc create (rootless + limits + no cgrouppath + no permission) fails with informative error"
if m.rootless {
if m.config.Path == "" {
if blNeed, nErr := needAnyControllers(m.config.Resources); nErr == nil && !blNeed {
if blNeed, nErr := needAnyControllers(m.config); nErr == nil && !blNeed {
return nil
}
return errors.Wrap(err, "rootless needs no limits + no cgrouppath when no permission is granted for cgroups")
@@ -103,27 +103,43 @@ func (m *manager) GetStats() (*cgroups.Stats, error) {
)
st := cgroups.NewStats()
if err := m.getControllers(); err != nil {
return st, err
}
// pids (since kernel 4.5)
if err := statPids(m.dirPath, st); err != nil {
errs = append(errs, err)
if _, ok := m.controllers["pids"]; ok {
if err := statPids(m.dirPath, st); err != nil {
errs = append(errs, err)
}
} else {
if err := statPidsWithoutController(m.dirPath, st); err != nil {
errs = append(errs, err)
}
}
// memory (since kernel 4.5)
if err := statMemory(m.dirPath, st); err != nil && !os.IsNotExist(err) {
errs = append(errs, err)
if _, ok := m.controllers["memory"]; ok {
if err := statMemory(m.dirPath, st); err != nil {
errs = append(errs, err)
}
}
// io (since kernel 4.5)
if err := statIo(m.dirPath, st); err != nil && !os.IsNotExist(err) {
errs = append(errs, err)
if _, ok := m.controllers["io"]; ok {
if err := statIo(m.dirPath, st); err != nil {
errs = append(errs, err)
}
}
// cpu (since kernel 4.15)
// Note cpu.stat is available even if the controller is not enabled.
if err := statCpu(m.dirPath, st); err != nil && !os.IsNotExist(err) {
errs = append(errs, err)
if _, ok := m.controllers["cpu"]; ok {
if err := statCpu(m.dirPath, st); err != nil {
errs = append(errs, err)
}
}
// hugetlb (since kernel 5.6)
if err := statHugeTlb(m.dirPath, st); err != nil && !os.IsNotExist(err) {
errs = append(errs, err)
if _, ok := m.controllers["hugetlb"]; ok {
if err := statHugeTlb(m.dirPath, st); err != nil {
errs = append(errs, err)
}
}
if len(errs) > 0 && !m.rootless {
return st, errors.Errorf("error while statting cgroup v2: %+v", errs)
@@ -147,50 +163,53 @@ func (m *manager) Path(_ string) string {
return m.dirPath
}
func (m *manager) Set(r *configs.Resources) error {
func (m *manager) Set(container *configs.Config) error {
if container == nil || container.Cgroups == nil {
return nil
}
if err := m.getControllers(); err != nil {
return err
}
// pids (since kernel 4.5)
if err := setPids(m.dirPath, r); err != nil {
if err := setPids(m.dirPath, container.Cgroups); err != nil {
return err
}
// memory (since kernel 4.5)
if err := setMemory(m.dirPath, r); err != nil {
if err := setMemory(m.dirPath, container.Cgroups); err != nil {
return err
}
// io (since kernel 4.5)
if err := setIo(m.dirPath, r); err != nil {
if err := setIo(m.dirPath, container.Cgroups); err != nil {
return err
}
// cpu (since kernel 4.15)
if err := setCpu(m.dirPath, r); err != nil {
if err := setCpu(m.dirPath, container.Cgroups); err != nil {
return err
}
// devices (since kernel 4.15, pseudo-controller)
//
// When m.rootless is true, errors from the device subsystem are ignored because it is really not expected to work.
// When m.Rootless is true, errors from the device subsystem are ignored because it is really not expected to work.
// However, errors from other subsystems are not ignored.
// see @test "runc create (rootless + limits + no cgrouppath + no permission) fails with informative error"
if err := setDevices(m.dirPath, r); err != nil && !m.rootless {
if err := setDevices(m.dirPath, container.Cgroups); err != nil && !m.rootless {
return err
}
// cpuset (since kernel 5.0)
if err := setCpuset(m.dirPath, r); err != nil {
if err := setCpuset(m.dirPath, container.Cgroups); err != nil {
return err
}
// hugetlb (since kernel 5.6)
if err := setHugeTlb(m.dirPath, r); err != nil {
if err := setHugeTlb(m.dirPath, container.Cgroups); err != nil {
return err
}
// freezer (since kernel 5.2, pseudo-controller)
if err := setFreezer(m.dirPath, r.Freezer); err != nil {
if err := setFreezer(m.dirPath, container.Cgroups.Freezer); err != nil {
return err
}
if err := m.setUnified(r.Unified); err != nil {
if err := m.setUnified(container.Cgroups.Unified); err != nil {
return err
}
m.config.Resources = r
m.config = container.Cgroups
return nil
}
@@ -238,16 +257,3 @@ func (m *manager) GetFreezerState() (configs.FreezerState, error) {
func (m *manager) Exists() bool {
return cgroups.PathExists(m.dirPath)
}
// OOMKillCount looks up the "oom_kill" entry of the "memory.events" file
// located in the cgroup directory path and returns whatever value and error
// fscommon.GetValueByKey reports for it.
func OOMKillCount(path string) (uint64, error) {
	const (
		eventsFile = "memory.events"
		killKey    = "oom_kill"
	)
	return fscommon.GetValueByKey(path, eventsFile, killKey)
}
// OOMKillCount returns the OOM kill counter for the manager's cgroup
// directory (m.dirPath).
//
// For a rootless manager a "file does not exist" error is not reported:
// the count obtained from the package-level OOMKillCount is returned with a
// nil error instead. Every other error is passed through unchanged.
func (m *manager) OOMKillCount() (uint64, error) {
	count, err := OOMKillCount(m.dirPath)
	if m.rootless && err != nil && os.IsNotExist(err) {
		// Missing file is tolerated in rootless mode.
		return count, nil
	}
	return count, err
}

View File

@@ -12,15 +12,15 @@ import (
"github.com/opencontainers/runc/libcontainer/configs"
)
func isHugeTlbSet(r *configs.Resources) bool {
return len(r.HugetlbLimit) > 0
func isHugeTlbSet(cgroup *configs.Cgroup) bool {
return len(cgroup.Resources.HugetlbLimit) > 0
}
func setHugeTlb(dirPath string, r *configs.Resources) error {
if !isHugeTlbSet(r) {
func setHugeTlb(dirPath string, cgroup *configs.Cgroup) error {
if !isHugeTlbSet(cgroup) {
return nil
}
for _, hugetlb := range r.HugetlbLimit {
for _, hugetlb := range cgroup.Resources.HugetlbLimit {
if err := fscommon.WriteFile(dirPath, "hugetlb."+hugetlb.Pagesize+".max", strconv.FormatUint(hugetlb.Limit, 10)); err != nil {
return err
}
@@ -44,10 +44,14 @@ func statHugeTlb(dirPath string, stats *cgroups.Stats) error {
hugetlbStats.Usage = value
fileName := "hugetlb." + pagesize + ".events"
value, err = fscommon.GetValueByKey(dirPath, fileName, "max")
contents, err := fscommon.ReadFile(dirPath, fileName)
if err != nil {
return errors.Wrap(err, "failed to read stats")
}
_, value, err = fscommon.GetCgroupParamKeyValue(contents)
if err != nil {
return errors.Wrap(err, "failed to parse "+fileName)
}
hugetlbStats.Failcnt = value
stats.HugetlbStats[pagesize] = hugetlbStats

View File

@@ -13,50 +13,42 @@ import (
"github.com/opencontainers/runc/libcontainer/configs"
)
func isIoSet(r *configs.Resources) bool {
return r.BlkioWeight != 0 ||
len(r.BlkioThrottleReadBpsDevice) > 0 ||
len(r.BlkioThrottleWriteBpsDevice) > 0 ||
len(r.BlkioThrottleReadIOPSDevice) > 0 ||
len(r.BlkioThrottleWriteIOPSDevice) > 0
func isIoSet(cgroup *configs.Cgroup) bool {
return cgroup.Resources.BlkioWeight != 0 ||
len(cgroup.Resources.BlkioThrottleReadBpsDevice) > 0 ||
len(cgroup.Resources.BlkioThrottleWriteBpsDevice) > 0 ||
len(cgroup.Resources.BlkioThrottleReadIOPSDevice) > 0 ||
len(cgroup.Resources.BlkioThrottleWriteIOPSDevice) > 0
}
func setIo(dirPath string, r *configs.Resources) error {
if !isIoSet(r) {
func setIo(dirPath string, cgroup *configs.Cgroup) error {
if !isIoSet(cgroup) {
return nil
}
if r.BlkioWeight != 0 {
if cgroup.Resources.BlkioWeight != 0 {
filename := "io.bfq.weight"
if err := fscommon.WriteFile(dirPath, filename,
strconv.FormatUint(uint64(r.BlkioWeight), 10)); err != nil {
// if io.bfq.weight does not exist, then bfq module is not loaded.
// Fallback to use io.weight with a conversion scheme
if !os.IsNotExist(err) {
return err
}
v := cgroups.ConvertBlkIOToIOWeightValue(r.BlkioWeight)
if err := fscommon.WriteFile(dirPath, "io.weight", strconv.FormatUint(v, 10)); err != nil {
return err
}
strconv.FormatUint(cgroups.ConvertBlkIOToCgroupV2Value(cgroup.Resources.BlkioWeight), 10)); err != nil {
return err
}
}
for _, td := range r.BlkioThrottleReadBpsDevice {
for _, td := range cgroup.Resources.BlkioThrottleReadBpsDevice {
if err := fscommon.WriteFile(dirPath, "io.max", td.StringName("rbps")); err != nil {
return err
}
}
for _, td := range r.BlkioThrottleWriteBpsDevice {
for _, td := range cgroup.Resources.BlkioThrottleWriteBpsDevice {
if err := fscommon.WriteFile(dirPath, "io.max", td.StringName("wbps")); err != nil {
return err
}
}
for _, td := range r.BlkioThrottleReadIOPSDevice {
for _, td := range cgroup.Resources.BlkioThrottleReadIOPSDevice {
if err := fscommon.WriteFile(dirPath, "io.max", td.StringName("riops")); err != nil {
return err
}
}
for _, td := range r.BlkioThrottleWriteIOPSDevice {
for _, td := range cgroup.Resources.BlkioThrottleWriteIOPSDevice {
if err := fscommon.WriteFile(dirPath, "io.max", td.StringName("wiops")); err != nil {
return err
}

View File

@@ -4,16 +4,13 @@ package fs2
import (
"bufio"
"math"
"os"
"strconv"
"strings"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/pkg/errors"
"golang.org/x/sys/unix"
)
// numToStr converts an int64 value to a string for writing to a
@@ -33,20 +30,21 @@ func numToStr(value int64) (ret string) {
return ret
}
func isMemorySet(r *configs.Resources) bool {
return r.MemoryReservation != 0 || r.Memory != 0 || r.MemorySwap != 0
func isMemorySet(cgroup *configs.Cgroup) bool {
return cgroup.Resources.MemoryReservation != 0 ||
cgroup.Resources.Memory != 0 || cgroup.Resources.MemorySwap != 0
}
func setMemory(dirPath string, r *configs.Resources) error {
if !isMemorySet(r) {
func setMemory(dirPath string, cgroup *configs.Cgroup) error {
if !isMemorySet(cgroup) {
return nil
}
swap, err := cgroups.ConvertMemorySwapToCgroupV2Value(r.MemorySwap, r.Memory)
swap, err := cgroups.ConvertMemorySwapToCgroupV2Value(cgroup.Resources.MemorySwap, cgroup.Resources.Memory)
if err != nil {
return err
}
swapStr := numToStr(swap)
if swapStr == "" && swap == 0 && r.MemorySwap > 0 {
if swapStr == "" && swap == 0 && cgroup.Resources.MemorySwap > 0 {
// memory and memorySwap set to the same value -- disable swap
swapStr = "0"
}
@@ -57,7 +55,7 @@ func setMemory(dirPath string, r *configs.Resources) error {
}
}
if val := numToStr(r.Memory); val != "" {
if val := numToStr(cgroup.Resources.Memory); val != "" {
if err := fscommon.WriteFile(dirPath, "memory.max", val); err != nil {
return err
}
@@ -65,7 +63,7 @@ func setMemory(dirPath string, r *configs.Resources) error {
// cgroup.Resources.KernelMemory is ignored
if val := numToStr(r.MemoryReservation); val != "" {
if val := numToStr(cgroup.Resources.MemoryReservation); val != "" {
if err := fscommon.WriteFile(dirPath, "memory.low", val); err != nil {
return err
}
@@ -84,24 +82,16 @@ func statMemory(dirPath string, stats *cgroups.Stats) error {
sc := bufio.NewScanner(statsFile)
for sc.Scan() {
t, v, err := fscommon.ParseKeyValue(sc.Text())
t, v, err := fscommon.GetCgroupParamKeyValue(sc.Text())
if err != nil {
return errors.Wrapf(err, "failed to parse memory.stat (%q)", sc.Text())
}
stats.MemoryStats.Stats[t] = v
}
stats.MemoryStats.Cache = stats.MemoryStats.Stats["file"]
// Unlike cgroup v1 which has memory.use_hierarchy binary knob,
// cgroup v2 is always hierarchical.
stats.MemoryStats.UseHierarchy = true
stats.MemoryStats.Cache = stats.MemoryStats.Stats["cache"]
memoryUsage, err := getMemoryDataV2(dirPath, "")
if err != nil {
if errors.Is(err, unix.ENOENT) && dirPath == UnifiedMountpoint {
// The root cgroup does not have memory.{current,max}
// so emulate those using data from /proc/meminfo.
return statsFromMeminfo(stats)
}
return err
}
stats.MemoryStats.Usage = memoryUsage
@@ -109,15 +99,9 @@ func statMemory(dirPath string, stats *cgroups.Stats) error {
if err != nil {
return err
}
// As cgroup v1 reports SwapUsage values as mem+swap combined,
// while in cgroup v2 swap values do not include memory,
// report combined mem+swap for v1 compatibility.
swapUsage.Usage += memoryUsage.Usage
if swapUsage.Limit != math.MaxUint64 {
swapUsage.Limit += memoryUsage.Limit
}
stats.MemoryStats.SwapUsage = swapUsage
stats.MemoryStats.UseHierarchy = true
return nil
}
@@ -133,10 +117,7 @@ func getMemoryDataV2(path, name string) (cgroups.MemoryData, error) {
value, err := fscommon.GetCgroupParamUint(path, usage)
if err != nil {
if name != "" && os.IsNotExist(err) {
// Ignore ENOENT as there's no swap accounting
// if kernel CONFIG_MEMCG_SWAP is not set or
// swapaccount=0 kernel boot parameter is given.
if moduleName != "memory" && os.IsNotExist(err) {
return cgroups.MemoryData{}, nil
}
return cgroups.MemoryData{}, errors.Wrapf(err, "failed to parse %s", usage)
@@ -145,69 +126,12 @@ func getMemoryDataV2(path, name string) (cgroups.MemoryData, error) {
value, err = fscommon.GetCgroupParamUint(path, limit)
if err != nil {
if moduleName != "memory" && os.IsNotExist(err) {
return cgroups.MemoryData{}, nil
}
return cgroups.MemoryData{}, errors.Wrapf(err, "failed to parse %s", limit)
}
memoryData.Limit = value
return memoryData, nil
}
// statsFromMeminfo fills the memory and swap usage fields of stats from
// /proc/meminfo.
//
// Only the MemTotal, MemFree, SwapTotal and SwapFree lines are read; scanning
// stops as soon as all four have been seen. The values have their " kB"
// suffix stripped and are multiplied by 1024 before being stored. Usage is
// computed as total minus free, and both limits are set to math.MaxUint64.
//
// It returns an error if the file cannot be opened, if one of the wanted
// values does not parse as an unsigned integer, or if scanning fails.
func statsFromMeminfo(stats *cgroups.Stats) error {
	meminfo, err := os.Open("/proc/meminfo")
	if err != nil {
		return err
	}
	defer meminfo.Close()

	// Destinations for the fields we care about, keyed by meminfo name.
	var swapFree, swapTotal, memTotal, memFree uint64
	wanted := map[string]*uint64{
		"SwapFree":  &swapFree,
		"SwapTotal": &swapTotal,
		"MemTotal":  &memTotal,
		"MemFree":   &memFree,
	}

	scanner := bufio.NewScanner(meminfo)
	// Stop reading once every wanted field has been collected.
	for seen := 0; seen < len(wanted) && scanner.Scan(); {
		line := scanner.Text()
		if strings.Count(line, ":") != 1 {
			// Should not happen: expect exactly one "key: value" separator.
			continue
		}
		colon := strings.IndexByte(line, ':')
		k := line[:colon]
		dst, interesting := wanted[k]
		if !interesting {
			// Unknown field -- not interested.
			continue
		}
		raw := strings.TrimSuffix(line[colon+1:], " kB")
		val, perr := strconv.ParseUint(strings.TrimSpace(raw), 10, 64)
		if perr != nil {
			return errors.Wrap(perr, "parsing /proc/meminfo "+k)
		}
		*dst = val
		seen++
	}
	if scanErr := scanner.Err(); scanErr != nil {
		return scanErr
	}

	mem := &stats.MemoryStats
	mem.SwapUsage.Usage = (swapTotal - swapFree) * 1024
	mem.SwapUsage.Limit = math.MaxUint64
	mem.Usage.Usage = (memTotal - memFree) * 1024
	mem.Usage.Limit = math.MaxUint64
	return nil
}

View File

@@ -3,7 +3,6 @@
package fs2
import (
"os"
"path/filepath"
"strings"
@@ -14,15 +13,15 @@ import (
"golang.org/x/sys/unix"
)
func isPidsSet(r *configs.Resources) bool {
return r.PidsLimit != 0
func isPidsSet(cgroup *configs.Cgroup) bool {
return cgroup.Resources.PidsLimit != 0
}
func setPids(dirPath string, r *configs.Resources) error {
if !isPidsSet(r) {
func setPids(dirPath string, cgroup *configs.Cgroup) error {
if !isPidsSet(cgroup) {
return nil
}
if val := numToStr(r.PidsLimit); val != "" {
if val := numToStr(cgroup.Resources.PidsLimit); val != "" {
if err := fscommon.WriteFile(dirPath, "pids.max", val); err != nil {
return err
}
@@ -31,7 +30,7 @@ func setPids(dirPath string, r *configs.Resources) error {
return nil
}
func statPidsFromCgroupProcs(dirPath string, stats *cgroups.Stats) error {
func statPidsWithoutController(dirPath string, stats *cgroups.Stats) error {
// if the controller is not enabled, let's read PIDS from cgroups.procs
// (or threads if cgroup.threads is enabled)
contents, err := fscommon.ReadFile(dirPath, "cgroup.procs")
@@ -41,8 +40,13 @@ func statPidsFromCgroupProcs(dirPath string, stats *cgroups.Stats) error {
if err != nil {
return err
}
pids := strings.Count(contents, "\n")
stats.PidsStats.Current = uint64(pids)
pids := make(map[string]string)
for _, i := range strings.Split(contents, "\n") {
if i != "" {
pids[i] = i
}
}
stats.PidsStats.Current = uint64(len(pids))
stats.PidsStats.Limit = 0
return nil
}
@@ -50,9 +54,6 @@ func statPidsFromCgroupProcs(dirPath string, stats *cgroups.Stats) error {
func statPids(dirPath string, stats *cgroups.Stats) error {
current, err := fscommon.GetCgroupParamUint(dirPath, "pids.current")
if err != nil {
if os.IsNotExist(err) {
return statPidsFromCgroupProcs(dirPath, stats)
}
return errors.Wrap(err, "failed to parse pids.current")
}

View File

@@ -5,6 +5,7 @@ import (
"strings"
"sync"
securejoin "github.com/cyphar/filepath-securejoin"
"github.com/pkg/errors"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
@@ -16,7 +17,7 @@ const (
)
var (
// TestMode is set to true by unit tests that need "fake" cgroupfs.
// Set to true by fs unit tests
TestMode bool
cgroupFd int = -1
@@ -70,12 +71,12 @@ func OpenFile(dir, file string, flags int) (*os.File, error) {
flags |= os.O_TRUNC | os.O_CREATE
mode = 0o600
}
if prepareOpenat2() != nil {
return openFallback(dir, file, flags, mode)
}
reldir := strings.TrimPrefix(dir, cgroupfsPrefix)
if len(reldir) == len(dir) { // non-standard path, old system?
return openFallback(dir, file, flags, mode)
return openWithSecureJoin(dir, file, flags, mode)
}
if prepareOpenat2() != nil {
return openWithSecureJoin(dir, file, flags, mode)
}
relname := reldir + "/" + file
@@ -92,29 +93,11 @@ func OpenFile(dir, file string, flags int) (*os.File, error) {
return os.NewFile(uintptr(fd), cgroupfsPrefix+relname), nil
}
var errNotCgroupfs = errors.New("not a cgroup file")
// openFallback is used when openat2(2) is not available. It checks the opened
// file is on cgroupfs, returning an error otherwise.
func openFallback(dir, file string, flags int, mode os.FileMode) (*os.File, error) {
path := dir + "/" + file
fd, err := os.OpenFile(path, flags, mode)
func openWithSecureJoin(dir, file string, flags int, mode os.FileMode) (*os.File, error) {
path, err := securejoin.SecureJoin(dir, file)
if err != nil {
return nil, err
}
if TestMode {
return fd, nil
}
// Check this is a cgroupfs file.
var st unix.Statfs_t
if err := unix.Fstatfs(int(fd.Fd()), &st); err != nil {
_ = fd.Close()
return nil, &os.PathError{Op: "statfs", Path: path, Err: err}
}
if st.Type != unix.CGROUP_SUPER_MAGIC && st.Type != unix.CGROUP2_SUPER_MAGIC {
_ = fd.Close()
return nil, &os.PathError{Op: "open", Path: path, Err: errNotCgroupfs}
}
return fd, nil
return os.OpenFile(path, flags, mode)
}

View File

@@ -35,42 +35,22 @@ func ParseUint(s string, base, bitSize int) (uint64, error) {
return value, nil
}
// ParseKeyValue splits a cgroup parameter line of the form "name value"
// (exactly one space) into its key and its value converted to uint64 with
// ParseUint. For example, "io_service_bytes 1234" yields
// ("io_service_bytes", 1234, nil).
//
// An error is returned when the line does not contain exactly one space, or
// when the value part cannot be converted.
func ParseKeyValue(t string) (string, uint64, error) {
	// Exactly one separator is accepted: no space at all, or a second space
	// after the first one, both make the line malformed.
	sep := strings.IndexByte(t, ' ')
	if sep < 0 || strings.IndexByte(t[sep+1:], ' ') >= 0 {
		return "", 0, fmt.Errorf("line %q is not in key value format", t)
	}

	key, raw := t[:sep], t[sep+1:]
	number, err := ParseUint(raw, 10, 64)
	if err != nil {
		return "", 0, fmt.Errorf("unable to convert to uint64: %v", err)
	}
	return key, number, nil
}
// GetValueByKey reads a key-value pairs from the specified cgroup file,
// and returns a value of the specified key. ParseUint is used for value
// conversion.
func GetValueByKey(path, file, key string) (uint64, error) {
content, err := ReadFile(path, file)
if err != nil {
return 0, err
}
lines := strings.Split(string(content), "\n")
for _, line := range lines {
arr := strings.Split(line, " ")
if len(arr) == 2 && arr[0] == key {
return ParseUint(arr[1], 10, 64)
// GetCgroupParamKeyValue parses a space-separated "name value" kind of cgroup
// parameter and returns its components. For example, "io_service_bytes 1234"
// will return as "io_service_bytes", 1234.
func GetCgroupParamKeyValue(t string) (string, uint64, error) {
parts := strings.Fields(t)
switch len(parts) {
case 2:
value, err := ParseUint(parts[1], 10, 64)
if err != nil {
return "", 0, fmt.Errorf("unable to convert to uint64: %v", err)
}
}
return 0, nil
return parts[0], value, nil
default:
return "", 0, ErrNotValidFormat
}
}
// GetCgroupParamUint reads a single uint64 value from the specified cgroup file.

View File

@@ -2,7 +2,6 @@ package systemd
import (
"bufio"
"context"
"fmt"
"math"
"os"
@@ -29,6 +28,10 @@ const (
)
var (
connOnce sync.Once
connDbus *systemdDbus.Conn
connErr error
versionOnce sync.Once
version int
@@ -288,6 +291,19 @@ func generateDeviceProperties(rules []*devices.Rule) ([]systemdDbus.Property, er
return properties, nil
}
// getDbusConnection lazily creates the package-wide systemd dbus connection
// and returns it together with the error from creating it.
//
// The connection is created at most once, guarded by connOnce: with rootless
// set it comes from NewUserSystemdDbus, otherwise from systemdDbus.New.
// Because of the once-guard, only the rootless value of the very first call
// has any effect; later calls return the cached connDbus/connErr regardless
// of the argument they pass.
func getDbusConnection(rootless bool) (*systemdDbus.Conn, error) {
	connOnce.Do(func() {
		if !rootless {
			connDbus, connErr = systemdDbus.New()
			return
		}
		connDbus, connErr = NewUserSystemdDbus()
	})
	return connDbus, connErr
}
func newProp(name string, units interface{}) systemdDbus.Property {
return systemdDbus.Property{
Name: name,
@@ -303,42 +319,32 @@ func getUnitName(c *configs.Cgroup) string {
return c.Name
}
// isDbusError returns true if the error is a specific dbus error.
func isDbusError(err error, name string) bool {
// isUnitExists returns true if the error is that a systemd unit already exists.
func isUnitExists(err error) bool {
if err != nil {
var derr *dbus.Error
if errors.As(err, &derr) {
return strings.Contains(derr.Name, name)
if dbusError, ok := err.(dbus.Error); ok {
return strings.Contains(dbusError.Name, "org.freedesktop.systemd1.UnitExists")
}
}
return false
}
// isUnitExists reports whether err is the dbus error systemd uses to signal
// that a unit already exists, as decided by isDbusError.
func isUnitExists(err error) bool {
	const unitExistsName = "org.freedesktop.systemd1.UnitExists"
	return isDbusError(err, unitExistsName)
}
func startUnit(cm *dbusConnManager, unitName string, properties []systemdDbus.Property) error {
func startUnit(dbusConnection *systemdDbus.Conn, unitName string, properties []systemdDbus.Property) error {
statusChan := make(chan string, 1)
err := cm.retryOnDisconnect(func(c *systemdDbus.Conn) error {
_, err := c.StartTransientUnitContext(context.TODO(), unitName, "replace", properties, statusChan)
return err
})
if err == nil {
if _, err := dbusConnection.StartTransientUnit(unitName, "replace", properties, statusChan); err == nil {
timeout := time.NewTimer(30 * time.Second)
defer timeout.Stop()
select {
case s := <-statusChan:
close(statusChan)
// Please refer to https://pkg.go.dev/github.com/coreos/go-systemd/v22/dbus#Conn.StartUnit
// Please refer to https://godoc.org/github.com/coreos/go-systemd/dbus#Conn.StartUnit
if s != "done" {
resetFailedUnit(cm, unitName)
dbusConnection.ResetFailedUnit(unitName)
return errors.Errorf("error creating systemd unit `%s`: got `%s`", unitName, s)
}
case <-timeout.C:
resetFailedUnit(cm, unitName)
dbusConnection.ResetFailedUnit(unitName)
return errors.New("Timeout waiting for systemd to create " + unitName)
}
} else if !isUnitExists(err) {
@@ -348,17 +354,13 @@ func startUnit(cm *dbusConnManager, unitName string, properties []systemdDbus.Pr
return nil
}
func stopUnit(cm *dbusConnManager, unitName string) error {
func stopUnit(dbusConnection *systemdDbus.Conn, unitName string) error {
statusChan := make(chan string, 1)
err := cm.retryOnDisconnect(func(c *systemdDbus.Conn) error {
_, err := c.StopUnitContext(context.TODO(), unitName, "replace", statusChan)
return err
})
if err == nil {
if _, err := dbusConnection.StopUnit(unitName, "replace", statusChan); err == nil {
select {
case s := <-statusChan:
close(statusChan)
// Please refer to https://godoc.org/github.com/coreos/go-systemd/v22/dbus#Conn.StartUnit
// Please refer to https://godoc.org/github.com/coreos/go-systemd/dbus#Conn.StartUnit
if s != "done" {
logrus.Warnf("error removing unit `%s`: got `%s`. Continuing...", unitName, s)
}
@@ -369,38 +371,10 @@ func stopUnit(cm *dbusConnManager, unitName string) error {
return nil
}
// resetFailedUnit asks systemd, through cm.retryOnDisconnect, to reset the
// failed state of the unit called name. It is best-effort: a failure is only
// logged as a warning and never returned to the caller.
func resetFailedUnit(cm *dbusConnManager, name string) {
	reset := func(conn *systemdDbus.Conn) error {
		return conn.ResetFailedUnitContext(context.TODO(), name)
	}
	if err := cm.retryOnDisconnect(reset); err != nil {
		logrus.Warnf("unable to reset failed unit: %v", err)
	}
}
// setUnitProperties applies properties to the systemd unit called name.
// The call is made with the runtime flag set to true and goes through cm, so
// it is retried when the dbus connection turns out to be closed.
func setUnitProperties(cm *dbusConnManager, name string, properties ...systemdDbus.Property) error {
	apply := func(conn *systemdDbus.Conn) error {
		return conn.SetUnitPropertiesContext(context.TODO(), name, true, properties...)
	}
	return cm.retryOnDisconnect(apply)
}
// getManagerProperty fetches the systemd manager property called name and
// returns it with the surrounding quotes removed via strconv.Unquote.
// The dbus call goes through cm, so it is retried on a closed connection.
// On any dbus error an empty string and that error are returned.
func getManagerProperty(cm *dbusConnManager, name string) (string, error) {
	var quoted string
	fetch := func(conn *systemdDbus.Conn) (err error) {
		quoted, err = conn.GetManagerProperty(name)
		return err
	}
	if err := cm.retryOnDisconnect(fetch); err != nil {
		return "", err
	}
	return strconv.Unquote(quoted)
}
func systemdVersion(cm *dbusConnManager) int {
func systemdVersion(conn *systemdDbus.Conn) int {
versionOnce.Do(func() {
version = -1
verStr, err := getManagerProperty(cm, "Version")
verStr, err := conn.GetManagerProperty("Version")
if err == nil {
version, err = systemdVersionAtoi(verStr)
}
@@ -415,11 +389,11 @@ func systemdVersion(cm *dbusConnManager) int {
func systemdVersionAtoi(verStr string) (int, error) {
// verStr should be of the form:
// "v245.4-1.fc32", "245", "v245-1.fc32", "245-1.fc32" (without quotes).
// The result for all of the above should be 245.
// Thus, we unconditionally remove the "v" prefix
// and then match on the first integer we can grab.
re := regexp.MustCompile(`v?([0-9]+)`)
// "v245.4-1.fc32", "245", "v245-1.fc32", "245-1.fc32"
// all the input strings include quotes, and the output int should be 245
// thus, we unconditionally remove the `"v`
// and then match on the first integer we can grab
re := regexp.MustCompile(`"?v?([0-9]+)`)
matches := re.FindStringSubmatch(verStr)
if len(matches) < 2 {
return 0, errors.Errorf("can't parse version %s: incorrect number of matches %v", verStr, matches)
@@ -428,10 +402,10 @@ func systemdVersionAtoi(verStr string) (int, error) {
return ver, errors.Wrapf(err, "can't parse version %s", verStr)
}
func addCpuQuota(cm *dbusConnManager, properties *[]systemdDbus.Property, quota int64, period uint64) {
func addCpuQuota(conn *systemdDbus.Conn, properties *[]systemdDbus.Property, quota int64, period uint64) {
if period != 0 {
// systemd only supports CPUQuotaPeriodUSec since v242
sdVer := systemdVersion(cm)
sdVer := systemdVersion(conn)
if sdVer >= 242 {
*properties = append(*properties,
newProp("CPUQuotaPeriodUSec", period))
@@ -462,13 +436,13 @@ func addCpuQuota(cm *dbusConnManager, properties *[]systemdDbus.Property, quota
}
}
func addCpuset(cm *dbusConnManager, props *[]systemdDbus.Property, cpus, mems string) error {
func addCpuset(conn *systemdDbus.Conn, props *[]systemdDbus.Property, cpus, mems string) error {
if cpus == "" && mems == "" {
return nil
}
// systemd only supports AllowedCPUs/AllowedMemoryNodes since v244
sdVer := systemdVersion(cm)
sdVer := systemdVersion(conn)
if sdVer < 244 {
logrus.Debugf("systemd v%d is too old to support AllowedCPUs/AllowedMemoryNodes"+
" (settings will still be applied to cgroupfs)", sdVer)

View File

@@ -1,96 +0,0 @@
// +build linux
package systemd
import (
"context"
"sync"
systemdDbus "github.com/coreos/go-systemd/v22/dbus"
dbus "github.com/godbus/dbus/v5"
)
// Package-level state shared by every dbusConnManager value.
var (
	// dbusC is the cached systemd dbus connection; it is nil until
	// getConnection creates one, and is set back to nil by resetConnection.
	dbusC *systemdDbus.Conn
	// dbusMu guards access to dbusC.
	dbusMu sync.RWMutex
	// dbusInited is consulted by newDbusConnManager before it accepts a
	// rootless value that differs from dbusRootless.
	dbusInited bool
	// dbusRootless selects which kind of connection newConnection opens:
	// a user-instance one when true, the default one otherwise.
	dbusRootless bool
)

// dbusConnManager is a stateless handle for the shared dbus connection;
// all of its state lives in the package-level variables above.
type dbusConnManager struct {
}
// newDbusConnManager initializes systemd dbus connection manager.
//
// The rootless setting is process-wide because the connection is shared by
// all managers. The first call records the requested mode; any later call
// that asks for the opposite mode panics, since a single cached connection
// cannot serve both the system and the user systemd instance.
func newDbusConnManager(rootless bool) *dbusConnManager {
	// dbusRootless is read by newConnection while dbusMu is held, so take
	// the same lock here to keep the write race-free.
	dbusMu.Lock()
	defer dbusMu.Unlock()
	if dbusInited && rootless != dbusRootless {
		panic("can't have both root and rootless dbus")
	}
	// Record that a mode has been chosen; without this the check above
	// could never trigger.
	dbusInited = true
	dbusRootless = rootless
	return &dbusConnManager{}
}
// getConnection returns the shared systemd dbus connection, creating it on
// first use. An error from creating the connection is returned as is, and
// nothing is cached in that case.
func (d *dbusConnManager) getConnection() (*systemdDbus.Conn, error) {
	// Fast path: an existing connection only needs the read lock, so
	// concurrent callers can pick it up at the same time.
	dbusMu.RLock()
	cached := dbusC
	dbusMu.RUnlock()
	if cached != nil {
		return cached, nil
	}

	// Slow path: take the write lock so that only one connection is ever
	// created, and re-check because another caller may have created it
	// between the two locks.
	dbusMu.Lock()
	defer dbusMu.Unlock()
	if dbusC == nil {
		fresh, err := d.newConnection()
		if err != nil {
			return nil, err
		}
		dbusC = fresh
	}
	return dbusC, nil
}
// newConnection opens a fresh systemd dbus connection: the default one from
// systemdDbus.NewWithContext, or a user-instance connection when the
// package is in rootless mode.
func (d *dbusConnManager) newConnection() (*systemdDbus.Conn, error) {
	if !dbusRootless {
		return systemdDbus.NewWithContext(context.TODO())
	}
	return newUserSystemdDbus()
}
// resetConnection closes and forgets the cached connection so that the next
// getConnection call creates a new one. It does nothing unless conn is the
// connection currently cached, which keeps a caller holding a stale
// connection from discarding a newer one.
func (d *dbusConnManager) resetConnection(conn *systemdDbus.Conn) {
	dbusMu.Lock()
	defer dbusMu.Unlock()
	if dbusC == nil || dbusC != conn {
		return
	}
	dbusC.Close()
	dbusC = nil
}
// errDbusConnClosed is the message of dbus.ErrClosed, used to recognise a
// closed-connection error returned by an operation.
var errDbusConnClosed = dbus.ErrClosed.Error()

// retryOnDisconnect runs op against the shared dbus connection. If op fails
// because the connection is closed (for example after dbus was restarted and
// the cached connection went stale), the connection is reset and op is run
// again on a new one. Any other result of op, including nil, is returned
// unchanged, as is a failure to obtain a connection.
func (d *dbusConnManager) retryOnDisconnect(op func(*systemdDbus.Conn) error) error {
	for {
		conn, err := d.getConnection()
		if err != nil {
			return err
		}
		if err = op(conn); isDbusError(err, errDbusConnClosed) {
			// Stale connection: drop it and try again.
			d.resetConnection(conn)
			continue
		}
		return err
	}
}

View File

@@ -13,12 +13,12 @@ import (
systemdDbus "github.com/coreos/go-systemd/v22/dbus"
dbus "github.com/godbus/dbus/v5"
"github.com/opencontainers/runc/libcontainer/userns"
"github.com/opencontainers/runc/libcontainer/system"
"github.com/pkg/errors"
)
// newUserSystemdDbus creates a connection for systemd user-instance.
func newUserSystemdDbus() (*systemdDbus.Conn, error) {
// NewUserSystemdDbus creates a connection for systemd user-instance.
func NewUserSystemdDbus() (*systemdDbus.Conn, error) {
addr, err := DetectUserDbusSessionBusAddress()
if err != nil {
return nil, err
@@ -52,7 +52,7 @@ func newUserSystemdDbus() (*systemdDbus.Conn, error) {
//
// Otherwise returns os.Getuid() .
func DetectUID() (int, error) {
if !userns.RunningInUserNS() {
if !system.RunningInUserNS() {
return os.Getuid(), nil
}
b, err := exec.Command("busctl", "--user", "--no-pager", "status").CombinedOutput()

View File

@@ -12,6 +12,7 @@ import (
systemdDbus "github.com/coreos/go-systemd/v22/dbus"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fs"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/sirupsen/logrus"
)
@@ -20,14 +21,12 @@ type legacyManager struct {
mu sync.Mutex
cgroups *configs.Cgroup
paths map[string]string
dbus *dbusConnManager
}
func NewLegacyManager(cg *configs.Cgroup, paths map[string]string) cgroups.Manager {
return &legacyManager{
cgroups: cg,
paths: paths,
dbus: newDbusConnManager(false),
}
}
@@ -36,8 +35,8 @@ type subsystem interface {
Name() string
// Returns the stats, as 'stats', corresponding to the cgroup under 'path'.
GetStats(path string, stats *cgroups.Stats) error
// Set sets cgroup resource limits.
Set(path string, r *configs.Resources) error
// Set the cgroup represented by cgroup.
Set(path string, cgroup *configs.Cgroup) error
}
var errSubsystemDoesNotExist = errors.New("cgroup: subsystem does not exist")
@@ -58,8 +57,9 @@ var legacySubsystems = []subsystem{
&fs.NameGroup{GroupName: "name=systemd"},
}
func genV1ResourcesProperties(r *configs.Resources, cm *dbusConnManager) ([]systemdDbus.Property, error) {
func genV1ResourcesProperties(c *configs.Cgroup, conn *systemdDbus.Conn) ([]systemdDbus.Property, error) {
var properties []systemdDbus.Property
r := c.Resources
deviceProperties, err := generateDeviceProperties(r.Devices)
if err != nil {
@@ -77,7 +77,7 @@ func genV1ResourcesProperties(r *configs.Resources, cm *dbusConnManager) ([]syst
newProp("CPUShares", r.CpuShares))
}
addCpuQuota(cm, &properties, r.CpuQuota, r.CpuPeriod)
addCpuQuota(conn, &properties, r.CpuQuota, r.CpuPeriod)
if r.BlkioWeight != 0 {
properties = append(properties,
@@ -86,10 +86,11 @@ func genV1ResourcesProperties(r *configs.Resources, cm *dbusConnManager) ([]syst
if r.PidsLimit > 0 || r.PidsLimit == -1 {
properties = append(properties,
newProp("TasksAccounting", true),
newProp("TasksMax", uint64(r.PidsLimit)))
}
err = addCpuset(cm, &properties, r.CpusetCpus, r.CpusetMems)
err = addCpuset(conn, &properties, r.CpusetCpus, r.CpusetMems)
if err != nil {
return nil, err
}
@@ -157,17 +158,32 @@ func (m *legacyManager) Apply(pid int) error {
properties = append(properties,
newProp("MemoryAccounting", true),
newProp("CPUAccounting", true),
newProp("BlockIOAccounting", true),
newProp("TasksAccounting", true),
)
newProp("BlockIOAccounting", true))
// Assume DefaultDependencies= will always work (the check for it was previously broken.)
properties = append(properties,
newProp("DefaultDependencies", false))
dbusConnection, err := getDbusConnection(false)
if err != nil {
return err
}
resourcesProperties, err := genV1ResourcesProperties(c, dbusConnection)
if err != nil {
return err
}
properties = append(properties, resourcesProperties...)
properties = append(properties, c.SystemdProps...)
if err := startUnit(m.dbus, unitName, properties); err != nil {
// We have to set kernel memory here, as we can't change it once
// processes have been attached to the cgroup.
if c.Resources.KernelMemory != 0 {
if err := enableKmem(c); err != nil {
return err
}
}
if err := startUnit(dbusConnection, unitName, properties); err != nil {
return err
}
@@ -205,8 +221,13 @@ func (m *legacyManager) Destroy() error {
m.mu.Lock()
defer m.mu.Unlock()
stopErr := stopUnit(m.dbus, getUnitName(m.cgroups))
dbusConnection, err := getDbusConnection(false)
if err != nil {
return err
}
unitName := getUnitName(m.cgroups)
stopErr := stopUnit(dbusConnection, unitName)
// Both on success and on error, cleanup all the cgroups we are aware of.
// Some of them were created directly by Apply() and are not managed by systemd.
if err := cgroups.RemovePaths(m.paths); err != nil {
@@ -231,7 +252,7 @@ func (m *legacyManager) joinCgroups(pid int) error {
case "cpuset":
if path, ok := m.paths[name]; ok {
s := &fs.CpusetGroup{}
if err := s.ApplyDir(path, m.cgroups.Resources, pid); err != nil {
if err := s.ApplyDir(path, m.cgroups, pid); err != nil {
return err
}
}
@@ -284,7 +305,7 @@ func (m *legacyManager) Freeze(state configs.FreezerState) error {
prevState := m.cgroups.Resources.Freezer
m.cgroups.Resources.Freezer = state
freezer := &fs.FreezerGroup{}
if err := freezer.Set(path, m.cgroups.Resources); err != nil {
if err := freezer.Set(path, m.cgroups); err != nil {
m.cgroups.Resources.Freezer = prevState
return err
}
@@ -324,16 +345,20 @@ func (m *legacyManager) GetStats() (*cgroups.Stats, error) {
return stats, nil
}
func (m *legacyManager) Set(r *configs.Resources) error {
func (m *legacyManager) Set(container *configs.Config) error {
// If Paths are set, then we are just joining cgroups paths
// and there is no need to set any values.
if m.cgroups.Paths != nil {
return nil
}
if r.Unified != nil {
if container.Cgroups.Resources.Unified != nil {
return cgroups.ErrV1NoUnified
}
properties, err := genV1ResourcesProperties(r, m.dbus)
dbusConnection, err := getDbusConnection(false)
if err != nil {
return err
}
properties, err := genV1ResourcesProperties(container.Cgroups, dbusConnection)
if err != nil {
return err
}
@@ -361,7 +386,7 @@ func (m *legacyManager) Set(r *configs.Resources) error {
}
}
if err := setUnitProperties(m.dbus, getUnitName(m.cgroups), properties...); err != nil {
if err := dbusConnection.SetUnitProperties(getUnitName(container.Cgroups), true, properties...); err != nil {
_ = m.Freeze(targetFreezerState)
return err
}
@@ -376,7 +401,7 @@ func (m *legacyManager) Set(r *configs.Resources) error {
if !ok {
continue
}
if err := sys.Set(path, r); err != nil {
if err := sys.Set(path, container.Cgroups); err != nil {
return err
}
}
@@ -384,6 +409,30 @@ func (m *legacyManager) Set(r *configs.Resources) error {
return nil
}
// enableKmem turns on kernel memory accounting for the memory cgroup of c.
//
// It is a no-op when the memory subsystem path is reported as not found, or
// when the cgroup's "tasks" file is non-empty. Otherwise the cgroup directory
// is created if needed and accounting is enabled on it.
func enableKmem(c *configs.Cgroup) error {
	memPath, err := getSubsystemPath(c, "memory")
	switch {
	case err == nil:
		// Path resolved; carry on.
	case cgroups.IsNotFound(err):
		return nil
	default:
		return err
	}

	if err = os.MkdirAll(memPath, 0755); err != nil {
		return err
	}

	// do not try to enable the kernel memory if we already have
	// tasks in the cgroup.
	tasks, err := fscommon.ReadFile(memPath, "tasks")
	if err != nil {
		return err
	}
	if len(tasks) == 0 {
		return fs.EnableKernelMemoryAccounting(memPath)
	}
	return nil
}
func (m *legacyManager) GetPaths() map[string]string {
m.mu.Lock()
defer m.mu.Unlock()
@@ -406,7 +455,3 @@ func (m *legacyManager) GetFreezerState() (configs.FreezerState, error) {
func (m *legacyManager) Exists() bool {
return cgroups.PathExists(m.Path("devices"))
}
// OOMKillCount returns the value reported by fs.OOMKillCount for this
// manager's "memory" cgroup path.
func (m *legacyManager) OOMKillCount() (uint64, error) {
	memPath := m.Path("memory")
	return fs.OOMKillCount(memPath)
}

View File

@@ -26,7 +26,6 @@ type unifiedManager struct {
// path is like "/sys/fs/cgroup/user.slice/user-1001.slice/session-1.scope"
path string
rootless bool
dbus *dbusConnManager
}
func NewUnifiedManager(config *configs.Cgroup, path string, rootless bool) cgroups.Manager {
@@ -34,7 +33,6 @@ func NewUnifiedManager(config *configs.Cgroup, path string, rootless bool) cgrou
cgroups: config,
path: path,
rootless: rootless,
dbus: newDbusConnManager(rootless),
}
}
@@ -47,7 +45,7 @@ func NewUnifiedManager(config *configs.Cgroup, path string, rootless bool) cgrou
// For the list of keys, see https://www.kernel.org/doc/Documentation/cgroup-v2.txt
//
// For the list of systemd unit properties, see systemd.resource-control(5).
func unifiedResToSystemdProps(cm *dbusConnManager, res map[string]string) (props []systemdDbus.Property, _ error) {
func unifiedResToSystemdProps(conn *systemdDbus.Conn, res map[string]string) (props []systemdDbus.Property, _ error) {
var err error
for k, v := range res {
@@ -85,7 +83,7 @@ func unifiedResToSystemdProps(cm *dbusConnManager, res map[string]string) (props
return nil, fmt.Errorf("unified resource %q quota value conversion error: %w", k, err)
}
}
addCpuQuota(cm, &props, quota, period)
addCpuQuota(conn, &props, quota, period)
case "cpu.weight":
num, err := strconv.ParseUint(v, 10, 64)
@@ -105,7 +103,7 @@ func unifiedResToSystemdProps(cm *dbusConnManager, res map[string]string) (props
"cpuset.mems": "AllowedMemoryNodes",
}
// systemd only supports these properties since v244
sdVer := systemdVersion(cm)
sdVer := systemdVersion(conn)
if sdVer >= 244 {
props = append(props,
newProp(m[k], bits))
@@ -143,6 +141,7 @@ func unifiedResToSystemdProps(cm *dbusConnManager, res map[string]string) (props
}
}
props = append(props,
newProp("TasksAccounting", true),
newProp("TasksMax", num))
case "memory.oom.group":
@@ -164,8 +163,9 @@ func unifiedResToSystemdProps(cm *dbusConnManager, res map[string]string) (props
return props, nil
}
func genV2ResourcesProperties(r *configs.Resources, cm *dbusConnManager) ([]systemdDbus.Property, error) {
func genV2ResourcesProperties(c *configs.Cgroup, conn *systemdDbus.Conn) ([]systemdDbus.Property, error) {
var properties []systemdDbus.Property
r := c.Resources
// NOTE: This is of questionable correctness because we insert our own
// devices eBPF program later. Two programs with identical rules
@@ -201,14 +201,15 @@ func genV2ResourcesProperties(r *configs.Resources, cm *dbusConnManager) ([]syst
newProp("CPUWeight", r.CpuWeight))
}
addCpuQuota(cm, &properties, r.CpuQuota, r.CpuPeriod)
addCpuQuota(conn, &properties, r.CpuQuota, r.CpuPeriod)
if r.PidsLimit > 0 || r.PidsLimit == -1 {
properties = append(properties,
newProp("TasksAccounting", true),
newProp("TasksMax", uint64(r.PidsLimit)))
}
err = addCpuset(cm, &properties, r.CpusetCpus, r.CpusetMems)
err = addCpuset(conn, &properties, r.CpusetCpus, r.CpusetMems)
if err != nil {
return nil, err
}
@@ -217,7 +218,7 @@ func genV2ResourcesProperties(r *configs.Resources, cm *dbusConnManager) ([]syst
// convert Resources.Unified map to systemd properties
if r.Unified != nil {
unifiedProps, err := unifiedResToSystemdProps(cm, r.Unified)
unifiedProps, err := unifiedResToSystemdProps(conn, r.Unified)
if err != nil {
return nil, err
}
@@ -272,21 +273,28 @@ func (m *unifiedManager) Apply(pid int) error {
properties = append(properties,
newProp("MemoryAccounting", true),
newProp("CPUAccounting", true),
newProp("IOAccounting", true),
newProp("TasksAccounting", true),
)
newProp("IOAccounting", true))
// Assume DefaultDependencies= will always work (the check for it was previously broken.)
properties = append(properties,
newProp("DefaultDependencies", false))
dbusConnection, err := getDbusConnection(m.rootless)
if err != nil {
return err
}
resourcesProperties, err := genV2ResourcesProperties(c, dbusConnection)
if err != nil {
return err
}
properties = append(properties, resourcesProperties...)
properties = append(properties, c.SystemdProps...)
if err := startUnit(m.dbus, unitName, properties); err != nil {
if err := startUnit(dbusConnection, unitName, properties); err != nil {
return errors.Wrapf(err, "error while starting unit %q with properties %+v", unitName, properties)
}
if err := m.initPath(); err != nil {
if err = m.initPath(); err != nil {
return err
}
if err := fs2.CreateCgroupPath(m.path, m.cgroups); err != nil {
@@ -302,13 +310,17 @@ func (m *unifiedManager) Destroy() error {
m.mu.Lock()
defer m.mu.Unlock()
dbusConnection, err := getDbusConnection(m.rootless)
if err != nil {
return err
}
unitName := getUnitName(m.cgroups)
if err := stopUnit(m.dbus, unitName); err != nil {
if err := stopUnit(dbusConnection, unitName); err != nil {
return err
}
// XXX this is probably not needed, systemd should handle it
err := os.Remove(m.path)
err = os.Remove(m.path)
if err != nil && !os.IsNotExist(err) {
return err
}
@@ -317,7 +329,6 @@ func (m *unifiedManager) Destroy() error {
}
func (m *unifiedManager) Path(_ string) string {
_ = m.initPath()
return m.path
}
@@ -338,8 +349,16 @@ func (m *unifiedManager) getSliceFull() (string, error) {
}
if m.rootless {
// managerCG is typically "/user.slice/user-${uid}.slice/user@${uid}.service".
managerCG, err := getManagerProperty(m.dbus, "ControlGroup")
dbusConnection, err := getDbusConnection(m.rootless)
if err != nil {
return "", err
}
// managerCGQuoted is typically "/user.slice/user-${uid}.slice/user@${uid}.service" including the quote symbols
managerCGQuoted, err := dbusConnection.GetManagerProperty("ControlGroup")
if err != nil {
return "", err
}
managerCG, err := strconv.Unquote(managerCGQuoted)
if err != nil {
return "", err
}
@@ -412,8 +431,12 @@ func (m *unifiedManager) GetStats() (*cgroups.Stats, error) {
return fsMgr.GetStats()
}
func (m *unifiedManager) Set(r *configs.Resources) error {
properties, err := genV2ResourcesProperties(r, m.dbus)
func (m *unifiedManager) Set(container *configs.Config) error {
dbusConnection, err := getDbusConnection(m.rootless)
if err != nil {
return err
}
properties, err := genV2ResourcesProperties(m.cgroups, dbusConnection)
if err != nil {
return err
}
@@ -441,7 +464,7 @@ func (m *unifiedManager) Set(r *configs.Resources) error {
}
}
if err := setUnitProperties(m.dbus, getUnitName(m.cgroups), properties...); err != nil {
if err := dbusConnection.SetUnitProperties(getUnitName(m.cgroups), true, properties...); err != nil {
_ = m.Freeze(targetFreezerState)
return errors.Wrap(err, "error while setting unit properties")
}
@@ -454,7 +477,7 @@ func (m *unifiedManager) Set(r *configs.Resources) error {
if err != nil {
return err
}
return fsMgr.Set(r)
return fsMgr.Set(container)
}
func (m *unifiedManager) GetPaths() map[string]string {
@@ -478,11 +501,3 @@ func (m *unifiedManager) GetFreezerState() (configs.FreezerState, error) {
func (m *unifiedManager) Exists() bool {
return cgroups.PathExists(m.path)
}
// OOMKillCount delegates to the OOMKillCount method of the manager returned
// by m.fsManager. If that manager cannot be obtained, 0 and the error are
// returned.
func (m *unifiedManager) OOMKillCount() (uint64, error) {
	mgr, err := m.fsManager()
	if err == nil {
		return mgr.OOMKillCount()
	}
	return 0, err
}

View File

@@ -16,7 +16,7 @@ import (
"time"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"github.com/opencontainers/runc/libcontainer/userns"
"github.com/opencontainers/runc/libcontainer/system"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
)
@@ -37,7 +37,7 @@ func IsCgroup2UnifiedMode() bool {
var st unix.Statfs_t
err := unix.Statfs(unifiedMountpoint, &st)
if err != nil {
if os.IsNotExist(err) && userns.RunningInUserNS() {
if os.IsNotExist(err) && system.RunningInUserNS() {
// ignore the "not found" error if running in userns
logrus.WithError(err).Debugf("%s missing, assuming cgroup v1", unifiedMountpoint)
isUnified = false
@@ -400,6 +400,17 @@ func WriteCgroupProc(dir string, pid int) error {
return err
}
// ConvertBlkIOToCgroupV2Value maps a cgroup v1 blkio weight onto the cgroup v2
// scale. The OCI spec is written for cgroup v1, so the value has to be
// converted linearly from [10-1000] to [1-10000] using
// y = (1 + (x - 10) * 9999 / 990).
//
// A weight of 0 means "not set" and is returned as 0. The arithmetic is
// unsigned, so weights 1-9 (below the documented range) wrap around instead
// of going negative.
func ConvertBlkIOToCgroupV2Value(blkIoWeight uint16) uint64 {
	if blkIoWeight == 0 {
		return 0
	}
	offset := uint64(blkIoWeight) - 10
	scaled := offset * 9999 / 990
	return 1 + scaled
}
// Since the OCI spec is designed for cgroup v1, in some cases
// there is need to convert from the cgroup v1 configuration to cgroup v2
// the formula for cpuShares is y = (1 + ((x - 2) * 9999) / 262142)
@@ -439,14 +450,3 @@ func ConvertMemorySwapToCgroupV2Value(memorySwap, memory int64) (int64, error) {
return memorySwap - memory, nil
}
// ConvertBlkIOToIOWeightValue maps a cgroup v1 BlkIOWeight onto the cgroup v2
// IOWeight scale. The OCI spec is written for cgroup v1, so the value has to
// be converted linearly from [10-1000] to [1-10000] using
// y = (1 + (x - 10) * 9999 / 990).
//
// A weight of 0 means "not set" and is returned as 0. The arithmetic is
// unsigned, so weights 1-9 (below the documented range) wrap around instead
// of going negative.
func ConvertBlkIOToIOWeightValue(blkIoWeight uint16) uint64 {
	if blkIoWeight == 0 {
		return 0
	}
	offset := uint64(blkIoWeight) - 10
	scaled := offset * 9999 / 990
	return 1 + scaled
}