Revert "Merge pull request 101888 from kolyshkin/update-runc-rc94"

This reverts commit b1b06fe0a4, reversing
changes made to 382a33986b.
This commit is contained in:
Jordan Liggitt
2021-05-18 09:12:04 -04:00
parent 7ccd90e7d7
commit 4b45d0d921
336 changed files with 5393 additions and 17166 deletions

View File

@@ -57,10 +57,6 @@ struct describing how the container is to be created. A sample would look simila
```go
defaultMountFlags := unix.MS_NOEXEC | unix.MS_NOSUID | unix.MS_NODEV
var devices []*configs.DeviceRule
for _, device := range specconv.AllowedDevices {
devices = append(devices, &device.Rule)
}
config := &configs.Config{
Rootfs: "/your/path/to/rootfs",
Capabilities: &configs.Capabilities{
@@ -159,7 +155,7 @@ config := &configs.Config{
Parent: "system",
Resources: &configs.Resources{
MemorySwappiness: nil,
Devices: devices,
Devices: specconv.AllowedDevices,
},
},
MaskPaths: []string{
@@ -317,7 +313,7 @@ state, err := container.State()
#### Checkpoint & Restore
libcontainer now integrates [CRIU](http://criu.org/) for checkpointing and restoring containers.
This lets you save the state of a process running inside a container to disk, and then restore
This let's you save the state of a process running inside a container to disk, and then restore
that state into a new process, on the same machine or on another machine.
`criu` version 1.5.2 or higher is required to use checkpoint and restore.

View File

@@ -1,41 +1,27 @@
package apparmor
import (
"errors"
"bytes"
"fmt"
"io/ioutil"
"os"
"sync"
"github.com/opencontainers/runc/libcontainer/utils"
)
var (
appArmorEnabled bool
checkAppArmor sync.Once
)
// IsEnabled returns true if apparmor is enabled for the host.
func IsEnabled() bool {
checkAppArmor.Do(func() {
if _, err := os.Stat("/sys/kernel/security/apparmor"); err == nil {
buf, err := ioutil.ReadFile("/sys/module/apparmor/parameters/enabled")
appArmorEnabled = err == nil && len(buf) > 1 && buf[0] == 'Y'
}
})
return appArmorEnabled
if _, err := os.Stat("/sys/kernel/security/apparmor"); err == nil {
buf, err := ioutil.ReadFile("/sys/module/apparmor/parameters/enabled")
return err == nil && bytes.HasPrefix(buf, []byte("Y"))
}
return false
}
func setProcAttr(attr, value string) error {
// Under AppArmor you can only change your own attr, so use /proc/self/
// instead of /proc/<tid>/ like libapparmor does
attrPath := "/proc/self/attr/apparmor/" + attr
if _, err := os.Stat(attrPath); errors.Is(err, os.ErrNotExist) {
// fall back to the old convention
attrPath = "/proc/self/attr/" + attr
}
f, err := os.OpenFile(attrPath, os.O_WRONLY, 0)
f, err := os.OpenFile("/proc/self/attr/"+attr, os.O_WRONLY, 0)
if err != nil {
return err
}

View File

@@ -3,26 +3,16 @@
package capabilities
import (
"sort"
"fmt"
"strings"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/sirupsen/logrus"
"github.com/syndtr/gocapability/capability"
)
const allCapabilityTypes = capability.CAPS | capability.BOUNDING | capability.AMBIENT
const allCapabilityTypes = capability.CAPS | capability.BOUNDS | capability.AMBS
var (
capabilityMap map[string]capability.Cap
capTypes = []capability.CapType{
capability.BOUNDING,
capability.PERMITTED,
capability.INHERITABLE,
capability.EFFECTIVE,
capability.AMBIENT,
}
)
var capabilityMap map[string]capability.Cap
func init() {
capabilityMap = make(map[string]capability.Cap, capability.CAP_LAST_CAP+1)
@@ -34,78 +24,73 @@ func init() {
}
}
// New creates a new Caps from the given Capabilities config. Unknown Capabilities
// or Capabilities that are unavailable in the current environment are ignored,
// printing a warning instead.
// New creates a new Caps from the given Capabilities config.
func New(capConfig *configs.Capabilities) (*Caps, error) {
var (
err error
c Caps
err error
caps Caps
)
unknownCaps := make(map[string]struct{})
c.caps = map[capability.CapType][]capability.Cap{
capability.BOUNDING: capSlice(capConfig.Bounding, unknownCaps),
capability.EFFECTIVE: capSlice(capConfig.Effective, unknownCaps),
capability.INHERITABLE: capSlice(capConfig.Inheritable, unknownCaps),
capability.PERMITTED: capSlice(capConfig.Permitted, unknownCaps),
capability.AMBIENT: capSlice(capConfig.Ambient, unknownCaps),
}
if c.pid, err = capability.NewPid2(0); err != nil {
if caps.bounding, err = capSlice(capConfig.Bounding); err != nil {
return nil, err
}
if err = c.pid.Load(); err != nil {
if caps.effective, err = capSlice(capConfig.Effective); err != nil {
return nil, err
}
if len(unknownCaps) > 0 {
logrus.Warn("ignoring unknown or unavailable capabilities: ", mapKeys(unknownCaps))
if caps.inheritable, err = capSlice(capConfig.Inheritable); err != nil {
return nil, err
}
return &c, nil
if caps.permitted, err = capSlice(capConfig.Permitted); err != nil {
return nil, err
}
if caps.ambient, err = capSlice(capConfig.Ambient); err != nil {
return nil, err
}
if caps.pid, err = capability.NewPid2(0); err != nil {
return nil, err
}
if err = caps.pid.Load(); err != nil {
return nil, err
}
return &caps, nil
}
// capSlice converts the slice of capability names in caps, to their numeric
// equivalent, and returns them as a slice. Unknown or unavailable capabilities
// are not returned, but appended to unknownCaps.
func capSlice(caps []string, unknownCaps map[string]struct{}) []capability.Cap {
var out []capability.Cap
for _, c := range caps {
if v, ok := capabilityMap[c]; !ok {
unknownCaps[c] = struct{}{}
} else {
out = append(out, v)
func capSlice(caps []string) ([]capability.Cap, error) {
out := make([]capability.Cap, len(caps))
for i, c := range caps {
v, ok := capabilityMap[c]
if !ok {
return nil, fmt.Errorf("unknown capability %q", c)
}
out[i] = v
}
return out
}
// mapKeys returns the keys of input in sorted order
func mapKeys(input map[string]struct{}) []string {
var keys []string
for c := range input {
keys = append(keys, c)
}
sort.Strings(keys)
return keys
return out, nil
}
// Caps holds the capabilities for a container.
type Caps struct {
pid capability.Capabilities
caps map[capability.CapType][]capability.Cap
pid capability.Capabilities
bounding []capability.Cap
effective []capability.Cap
inheritable []capability.Cap
permitted []capability.Cap
ambient []capability.Cap
}
// ApplyBoundingSet sets the capability bounding set to those specified in the whitelist.
func (c *Caps) ApplyBoundingSet() error {
c.pid.Clear(capability.BOUNDING)
c.pid.Set(capability.BOUNDING, c.caps[capability.BOUNDING]...)
return c.pid.Apply(capability.BOUNDING)
c.pid.Clear(capability.BOUNDS)
c.pid.Set(capability.BOUNDS, c.bounding...)
return c.pid.Apply(capability.BOUNDS)
}
// Apply sets all the capabilities for the current process in the config.
func (c *Caps) ApplyCaps() error {
c.pid.Clear(allCapabilityTypes)
for _, g := range capTypes {
c.pid.Set(g, c.caps[g]...)
}
c.pid.Set(capability.BOUNDS, c.bounding...)
c.pid.Set(capability.PERMITTED, c.permitted...)
c.pid.Set(capability.INHERITABLE, c.inheritable...)
c.pid.Set(capability.EFFECTIVE, c.effective...)
c.pid.Set(capability.AMBIENT, c.ambient...)
return c.pid.Apply(allCapabilityTypes)
}

View File

@@ -7,44 +7,37 @@ import (
)
type Manager interface {
// Apply creates a cgroup, if not yet created, and adds a process
// with the specified pid into that cgroup. A special value of -1
// can be used to merely create a cgroup.
// Applies cgroup configuration to the process with the specified pid
Apply(pid int) error
// GetPids returns the PIDs of all processes inside the cgroup.
// Returns the PIDs inside the cgroup set
GetPids() ([]int, error)
// GetAllPids returns the PIDs of all processes inside the cgroup
// any all its sub-cgroups.
// Returns the PIDs inside the cgroup set & all sub-cgroups
GetAllPids() ([]int, error)
// GetStats returns cgroups statistics.
// Returns statistics for the cgroup set
GetStats() (*Stats, error)
// Freeze sets the freezer cgroup to the specified state.
// Toggles the freezer cgroup according with specified state
Freeze(state configs.FreezerState) error
// Destroy removes cgroup.
// Destroys the cgroup set
Destroy() error
// Path returns a cgroup path to the specified controller/subsystem.
// For cgroupv2, the argument is unused and can be empty.
Path(string) string
// Set sets cgroup resources parameters/limits. If the argument is nil,
// the resources specified during Manager creation (or the previous call
// to Set) are used.
Set(r *configs.Resources) error
// Sets the cgroup as configured.
Set(container *configs.Config) error
// GetPaths returns cgroup path(s) to save in a state file in order to
// restore later.
// GetPaths returns cgroup path(s) to save in a state file in order to restore later.
//
// For cgroup v1, a key is cgroup subsystem name, and the value is the
// path to the cgroup for this subsystem.
// For cgroup v1, a key is cgroup subsystem name, and the value is the path
// to the cgroup for this subsystem.
//
// For cgroup v2 unified hierarchy, a key is "", and the value is the
// unified path.
// For cgroup v2 unified hierarchy, a key is "", and the value is the unified path.
GetPaths() map[string]string
// GetCgroups returns the cgroup data as configured.
@@ -53,9 +46,6 @@ type Manager interface {
// GetFreezerState retrieves the current FreezerState of the cgroup.
GetFreezerState() (configs.FreezerState, error)
// Exists returns whether the cgroup path exists or not.
// Whether the cgroup path exists or not
Exists() bool
// OOMKillCount reports OOM kill count for the cgroup.
OOMKillCount() (uint64, error)
}

View File

@@ -127,10 +127,10 @@ func (p *program) appendDevice(dev *devices.Rule) error {
}
if hasAccess {
p.insts = append(p.insts,
// if (R3 & bpfAccess != R3 /* use R1 as a temp var */) goto next
// if (R3 & bpfAccess == 0 /* use R1 as a temp var */) goto next
asm.Mov.Reg32(asm.R1, asm.R3),
asm.And.Imm32(asm.R1, bpfAccess),
asm.JNE.Reg(asm.R1, asm.R3, nextBlockSym),
asm.JEq.Imm(asm.R1, 0, nextBlockSym),
)
}
if hasMajor {

View File

@@ -3,7 +3,6 @@ package ebpf
import (
"github.com/cilium/ebpf"
"github.com/cilium/ebpf/asm"
"github.com/cilium/ebpf/link"
"github.com/pkg/errors"
"golang.org/x/sys/unix"
)
@@ -33,23 +32,12 @@ func LoadAttachCgroupDeviceFilter(insts asm.Instructions, license string, dirFD
if err != nil {
return nilCloser, err
}
err = link.RawAttachProgram(link.RawAttachProgramOptions{
Target: dirFD,
Program: prog,
Attach: ebpf.AttachCGroupDevice,
Flags: unix.BPF_F_ALLOW_MULTI,
})
if err != nil {
if err := prog.Attach(dirFD, ebpf.AttachCGroupDevice, unix.BPF_F_ALLOW_MULTI); err != nil {
return nilCloser, errors.Wrap(err, "failed to call BPF_PROG_ATTACH (BPF_CGROUP_DEVICE, BPF_F_ALLOW_MULTI)")
}
closer := func() error {
err = link.RawDetachProgram(link.RawDetachProgramOptions{
Target: dirFD,
Program: prog,
Attach: ebpf.AttachCGroupDevice,
})
if err != nil {
return errors.Wrap(err, "failed to call BPF_PROG_DETACH (BPF_CGROUP_DEVICE)")
if err := prog.Detach(dirFD, ebpf.AttachCGroupDevice, unix.BPF_F_ALLOW_MULTI); err != nil {
return errors.Wrap(err, "failed to call BPF_PROG_DETACH (BPF_CGROUP_DEVICE, BPF_F_ALLOW_MULTI)")
}
return nil
}

View File

@@ -25,19 +25,19 @@ func (s *BlkioGroup) Apply(path string, d *cgroupData) error {
return join(path, d.pid)
}
func (s *BlkioGroup) Set(path string, r *configs.Resources) error {
if r.BlkioWeight != 0 {
if err := fscommon.WriteFile(path, "blkio.weight", strconv.FormatUint(uint64(r.BlkioWeight), 10)); err != nil {
func (s *BlkioGroup) Set(path string, cgroup *configs.Cgroup) error {
if cgroup.Resources.BlkioWeight != 0 {
if err := fscommon.WriteFile(path, "blkio.weight", strconv.FormatUint(uint64(cgroup.Resources.BlkioWeight), 10)); err != nil {
return err
}
}
if r.BlkioLeafWeight != 0 {
if err := fscommon.WriteFile(path, "blkio.leaf_weight", strconv.FormatUint(uint64(r.BlkioLeafWeight), 10)); err != nil {
if cgroup.Resources.BlkioLeafWeight != 0 {
if err := fscommon.WriteFile(path, "blkio.leaf_weight", strconv.FormatUint(uint64(cgroup.Resources.BlkioLeafWeight), 10)); err != nil {
return err
}
}
for _, wd := range r.BlkioWeightDevice {
for _, wd := range cgroup.Resources.BlkioWeightDevice {
if err := fscommon.WriteFile(path, "blkio.weight_device", wd.WeightString()); err != nil {
return err
}
@@ -45,22 +45,22 @@ func (s *BlkioGroup) Set(path string, r *configs.Resources) error {
return err
}
}
for _, td := range r.BlkioThrottleReadBpsDevice {
for _, td := range cgroup.Resources.BlkioThrottleReadBpsDevice {
if err := fscommon.WriteFile(path, "blkio.throttle.read_bps_device", td.String()); err != nil {
return err
}
}
for _, td := range r.BlkioThrottleWriteBpsDevice {
for _, td := range cgroup.Resources.BlkioThrottleWriteBpsDevice {
if err := fscommon.WriteFile(path, "blkio.throttle.write_bps_device", td.String()); err != nil {
return err
}
}
for _, td := range r.BlkioThrottleReadIOPSDevice {
for _, td := range cgroup.Resources.BlkioThrottleReadIOPSDevice {
if err := fscommon.WriteFile(path, "blkio.throttle.read_iops_device", td.String()); err != nil {
return err
}
}
for _, td := range r.BlkioThrottleWriteIOPSDevice {
for _, td := range cgroup.Resources.BlkioThrottleWriteIOPSDevice {
if err := fscommon.WriteFile(path, "blkio.throttle.write_iops_device", td.String()); err != nil {
return err
}

View File

@@ -32,7 +32,7 @@ func (s *CpuGroup) Apply(path string, d *cgroupData) error {
// We should set the real-Time group scheduling settings before moving
// in the process because if the process is already in SCHED_RR mode
// and no RT bandwidth is set, adding it will fail.
if err := s.SetRtSched(path, d.config.Resources); err != nil {
if err := s.SetRtSched(path, d.config); err != nil {
return err
}
// Since we are not using join(), we need to place the pid
@@ -40,23 +40,23 @@ func (s *CpuGroup) Apply(path string, d *cgroupData) error {
return cgroups.WriteCgroupProc(path, d.pid)
}
func (s *CpuGroup) SetRtSched(path string, r *configs.Resources) error {
if r.CpuRtPeriod != 0 {
if err := fscommon.WriteFile(path, "cpu.rt_period_us", strconv.FormatUint(r.CpuRtPeriod, 10)); err != nil {
func (s *CpuGroup) SetRtSched(path string, cgroup *configs.Cgroup) error {
if cgroup.Resources.CpuRtPeriod != 0 {
if err := fscommon.WriteFile(path, "cpu.rt_period_us", strconv.FormatUint(cgroup.Resources.CpuRtPeriod, 10)); err != nil {
return err
}
}
if r.CpuRtRuntime != 0 {
if err := fscommon.WriteFile(path, "cpu.rt_runtime_us", strconv.FormatInt(r.CpuRtRuntime, 10)); err != nil {
if cgroup.Resources.CpuRtRuntime != 0 {
if err := fscommon.WriteFile(path, "cpu.rt_runtime_us", strconv.FormatInt(cgroup.Resources.CpuRtRuntime, 10)); err != nil {
return err
}
}
return nil
}
func (s *CpuGroup) Set(path string, r *configs.Resources) error {
if r.CpuShares != 0 {
shares := r.CpuShares
func (s *CpuGroup) Set(path string, cgroup *configs.Cgroup) error {
if cgroup.Resources.CpuShares != 0 {
shares := cgroup.Resources.CpuShares
if err := fscommon.WriteFile(path, "cpu.shares", strconv.FormatUint(shares, 10)); err != nil {
return err
}
@@ -72,17 +72,17 @@ func (s *CpuGroup) Set(path string, r *configs.Resources) error {
return fmt.Errorf("the minimum allowed cpu-shares is %d", sharesRead)
}
}
if r.CpuPeriod != 0 {
if err := fscommon.WriteFile(path, "cpu.cfs_period_us", strconv.FormatUint(r.CpuPeriod, 10)); err != nil {
if cgroup.Resources.CpuPeriod != 0 {
if err := fscommon.WriteFile(path, "cpu.cfs_period_us", strconv.FormatUint(cgroup.Resources.CpuPeriod, 10)); err != nil {
return err
}
}
if r.CpuQuota != 0 {
if err := fscommon.WriteFile(path, "cpu.cfs_quota_us", strconv.FormatInt(r.CpuQuota, 10)); err != nil {
if cgroup.Resources.CpuQuota != 0 {
if err := fscommon.WriteFile(path, "cpu.cfs_quota_us", strconv.FormatInt(cgroup.Resources.CpuQuota, 10)); err != nil {
return err
}
}
return s.SetRtSched(path, r)
return s.SetRtSched(path, cgroup)
}
func (s *CpuGroup) GetStats(path string, stats *cgroups.Stats) error {
@@ -97,7 +97,7 @@ func (s *CpuGroup) GetStats(path string, stats *cgroups.Stats) error {
sc := bufio.NewScanner(f)
for sc.Scan() {
t, v, err := fscommon.ParseKeyValue(sc.Text())
t, v, err := fscommon.GetCgroupParamKeyValue(sc.Text())
if err != nil {
return err
}

View File

@@ -43,7 +43,7 @@ func (s *CpuacctGroup) Apply(path string, d *cgroupData) error {
return join(path, d.pid)
}
func (s *CpuacctGroup) Set(_ string, _ *configs.Resources) error {
func (s *CpuacctGroup) Set(path string, cgroup *configs.Cgroup) error {
return nil
}

View File

@@ -24,17 +24,17 @@ func (s *CpusetGroup) Name() string {
}
func (s *CpusetGroup) Apply(path string, d *cgroupData) error {
return s.ApplyDir(path, d.config.Resources, d.pid)
return s.ApplyDir(path, d.config, d.pid)
}
func (s *CpusetGroup) Set(path string, r *configs.Resources) error {
if r.CpusetCpus != "" {
if err := fscommon.WriteFile(path, "cpuset.cpus", r.CpusetCpus); err != nil {
func (s *CpusetGroup) Set(path string, cgroup *configs.Cgroup) error {
if cgroup.Resources.CpusetCpus != "" {
if err := fscommon.WriteFile(path, "cpuset.cpus", cgroup.Resources.CpusetCpus); err != nil {
return err
}
}
if r.CpusetMems != "" {
if err := fscommon.WriteFile(path, "cpuset.mems", r.CpusetMems); err != nil {
if cgroup.Resources.CpusetMems != "" {
if err := fscommon.WriteFile(path, "cpuset.mems", cgroup.Resources.CpusetMems); err != nil {
return err
}
}
@@ -144,7 +144,7 @@ func (s *CpusetGroup) GetStats(path string, stats *cgroups.Stats) error {
return nil
}
func (s *CpusetGroup) ApplyDir(dir string, r *configs.Resources, pid int) error {
func (s *CpusetGroup) ApplyDir(dir string, cgroup *configs.Cgroup, pid int) error {
// This might happen if we have no cpuset cgroup mounted.
// Just do nothing and don't fail.
if dir == "" {
@@ -166,7 +166,7 @@ func (s *CpusetGroup) ApplyDir(dir string, r *configs.Resources, pid int) error
// specified configs, otherwise, inherit from parent. This makes
// cpuset configs work correctly with 'cpuset.cpu_exclusive', and
// keep backward compatibility.
if err := s.ensureCpusAndMems(dir, r); err != nil {
if err := s.ensureCpusAndMems(dir, cgroup); err != nil {
return err
}
@@ -241,8 +241,8 @@ func isEmptyCpuset(str string) bool {
return str == "" || str == "\n"
}
func (s *CpusetGroup) ensureCpusAndMems(path string, r *configs.Resources) error {
if err := s.Set(path, r); err != nil {
func (s *CpusetGroup) ensureCpusAndMems(path string, cgroup *configs.Cgroup) error {
if err := s.Set(path, cgroup); err != nil {
return err
}
return cpusetCopyIfNeeded(path, filepath.Dir(path))

View File

@@ -12,7 +12,7 @@ import (
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/devices"
"github.com/opencontainers/runc/libcontainer/userns"
"github.com/opencontainers/runc/libcontainer/system"
)
type DevicesGroup struct {
@@ -54,8 +54,8 @@ func buildEmulator(rules []*devices.Rule) (*cgroupdevices.Emulator, error) {
return emu, nil
}
func (s *DevicesGroup) Set(path string, r *configs.Resources) error {
if userns.RunningInUserNS() || r.SkipDevices {
func (s *DevicesGroup) Set(path string, cgroup *configs.Cgroup) error {
if system.RunningInUserNS() || cgroup.SkipDevices {
return nil
}
@@ -65,7 +65,7 @@ func (s *DevicesGroup) Set(path string, r *configs.Resources) error {
if err != nil {
return err
}
target, err := buildEmulator(r.Devices)
target, err := buildEmulator(cgroup.Resources.Devices)
if err != nil {
return err
}

View File

@@ -12,7 +12,6 @@ import (
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
)
@@ -27,62 +26,29 @@ func (s *FreezerGroup) Apply(path string, d *cgroupData) error {
return join(path, d.pid)
}
func (s *FreezerGroup) Set(path string, r *configs.Resources) (Err error) {
switch r.Freezer {
func (s *FreezerGroup) Set(path string, cgroup *configs.Cgroup) error {
switch cgroup.Resources.Freezer {
case configs.Frozen:
defer func() {
if Err != nil {
// Freezing failed, and it is bad and dangerous
// to leave the cgroup in FROZEN or FREEZING
// state, so (try to) thaw it back.
_ = fscommon.WriteFile(path, "freezer.state", string(configs.Thawed))
}
}()
// As per older kernel docs (freezer-subsystem.txt before
// kernel commit ef9fe980c6fcc1821), if FREEZING is seen,
// userspace should either retry or thaw. While current
// kernel cgroup v1 docs no longer mention a need to retry,
// even a recent kernel (v5.4, Ubuntu 20.04) can't reliably
// freeze a cgroup v1 while new processes keep appearing in it
// the kernel (tested on v5.4, Ubuntu 20.04) can't reliably
// freeze a cgroup while new processes keep appearing in it
// (either via fork/clone or by writing new PIDs to
// cgroup.procs).
//
// The numbers below are empirically chosen to have a decent
// chance to succeed in various scenarios ("runc pause/unpause
// with parallel runc exec" and "bare freeze/unfreeze on a very
// slow system"), tested on RHEL7 and Ubuntu 20.04 kernels.
// The number of retries below is chosen to have a decent
// chance to succeed even in the worst case scenario (runc
// pause/unpause with parallel runc exec).
//
// Adding any amount of sleep in between retries did not
// increase the chances of successful freeze in "pause/unpause
// with parallel exec" reproducer. OTOH, adding an occasional
// sleep helped for the case where the system is extremely slow
// (CentOS 7 VM on GHA CI).
//
// Alas, this is still a game of chances, since the real fix
// belong to the kernel (cgroup v2 do not have this bug).
// increase the chances of successful freeze.
for i := 0; i < 1000; i++ {
if i%50 == 49 {
// Occasional thaw and sleep improves
// the chances to succeed in freezing
// in case new processes keep appearing
// in the cgroup.
_ = fscommon.WriteFile(path, "freezer.state", string(configs.Thawed))
time.Sleep(10 * time.Millisecond)
}
if err := fscommon.WriteFile(path, "freezer.state", string(configs.Frozen)); err != nil {
return err
}
if i%25 == 24 {
// Occasional short sleep before reading
// the state back also improves the chances to
// succeed in freezing in case of a very slow
// system.
time.Sleep(10 * time.Microsecond)
}
state, err := fscommon.ReadFile(path, "freezer.state")
if err != nil {
return err
@@ -92,9 +58,6 @@ func (s *FreezerGroup) Set(path string, r *configs.Resources) (Err error) {
case "FREEZING":
continue
case string(configs.Frozen):
if i > 1 {
logrus.Debugf("frozen after %d retries", i)
}
return nil
default:
// should never happen
@@ -102,13 +65,16 @@ func (s *FreezerGroup) Set(path string, r *configs.Resources) (Err error) {
}
}
// Despite our best efforts, it got stuck in FREEZING.
// Leaving it in this state is bad and dangerous, so
// let's (try to) thaw it back and error out.
_ = fscommon.WriteFile(path, "freezer.state", string(configs.Thawed))
return errors.New("unable to freeze")
case configs.Thawed:
return fscommon.WriteFile(path, "freezer.state", string(configs.Thawed))
case configs.Undefined:
return nil
default:
return fmt.Errorf("Invalid argument '%s' to freezer.state", string(r.Freezer))
return fmt.Errorf("Invalid argument '%s' to freezer.state", string(cgroup.Resources.Freezer))
}
}

View File

@@ -9,7 +9,6 @@ import (
"sync"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"github.com/opencontainers/runc/libcontainer/configs"
libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils"
"github.com/pkg/errors"
@@ -44,8 +43,8 @@ type subsystem interface {
GetStats(path string, stats *cgroups.Stats) error
// Creates and joins the cgroup represented by 'cgroupData'.
Apply(path string, c *cgroupData) error
// Set sets the cgroup resources.
Set(path string, r *configs.Resources) error
// Set the cgroup represented by cgroup.
Set(path string, cgroup *configs.Cgroup) error
}
type manager struct {
@@ -274,8 +273,8 @@ func (m *manager) GetStats() (*cgroups.Stats, error) {
return stats, nil
}
func (m *manager) Set(r *configs.Resources) error {
if r == nil {
func (m *manager) Set(container *configs.Config) error {
if container.Cgroups == nil {
return nil
}
@@ -284,7 +283,7 @@ func (m *manager) Set(r *configs.Resources) error {
if m.cgroups != nil && m.cgroups.Paths != nil {
return nil
}
if r.Unified != nil {
if container.Cgroups.Resources.Unified != nil {
return cgroups.ErrV1NoUnified
}
@@ -292,11 +291,11 @@ func (m *manager) Set(r *configs.Resources) error {
defer m.mu.Unlock()
for _, sys := range subsystems {
path := m.paths[sys.Name()]
if err := sys.Set(path, r); err != nil {
if err := sys.Set(path, container.Cgroups); err != nil {
if m.rootless && sys.Name() == "devices" {
continue
}
// When m.rootless is true, errors from the device subsystem are ignored because it is really not expected to work.
// When m.Rootless is true, errors from the device subsystem are ignored because it is really not expected to work.
// However, errors from other subsystems are not ignored.
// see @test "runc create (rootless + limits + no cgrouppath + no permission) fails with informative error"
if path == "" {
@@ -322,7 +321,7 @@ func (m *manager) Freeze(state configs.FreezerState) error {
prevState := m.cgroups.Resources.Freezer
m.cgroups.Resources.Freezer = state
freezer := &FreezerGroup{}
if err := freezer.Set(path, m.cgroups.Resources); err != nil {
if err := freezer.Set(path, m.cgroups); err != nil {
m.cgroups.Resources.Freezer = prevState
return err
}
@@ -422,17 +421,3 @@ func (m *manager) GetFreezerState() (configs.FreezerState, error) {
func (m *manager) Exists() bool {
return cgroups.PathExists(m.Path("devices"))
}
func OOMKillCount(path string) (uint64, error) {
return fscommon.GetValueByKey(path, "memory.oom_control", "oom_kill")
}
func (m *manager) OOMKillCount() (uint64, error) {
c, err := OOMKillCount(m.Path("memory"))
// Ignore ENOENT when rootless as it couldn't create cgroup.
if err != nil && m.rootless && os.IsNotExist(err) {
err = nil
}
return c, err
}

View File

@@ -22,8 +22,8 @@ func (s *HugetlbGroup) Apply(path string, d *cgroupData) error {
return join(path, d.pid)
}
func (s *HugetlbGroup) Set(path string, r *configs.Resources) error {
for _, hugetlb := range r.HugetlbLimit {
func (s *HugetlbGroup) Set(path string, cgroup *configs.Cgroup) error {
for _, hugetlb := range cgroup.Resources.HugetlbLimit {
if err := fscommon.WriteFile(path, "hugetlb."+hugetlb.Pagesize+".limit_in_bytes", strconv.FormatUint(hugetlb.Limit, 10)); err != nil {
return err
}

View File

@@ -0,0 +1,56 @@
// +build linux,!nokmem
package fs
import (
"errors"
"fmt"
"path/filepath"
"strconv"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"golang.org/x/sys/unix"
)
const cgroupKernelMemoryLimit = "memory.kmem.limit_in_bytes"
func EnableKernelMemoryAccounting(path string) error {
// Ensure that kernel memory is available in this kernel build. If it
// isn't, we just ignore it because EnableKernelMemoryAccounting is
// automatically called for all memory limits.
if !cgroups.PathExists(filepath.Join(path, cgroupKernelMemoryLimit)) {
return nil
}
// We have to limit the kernel memory here as it won't be accounted at all
// until a limit is set on the cgroup and limit cannot be set once the
// cgroup has children, or if there are already tasks in the cgroup.
for _, i := range []int64{1, -1} {
if err := setKernelMemory(path, i); err != nil {
return err
}
}
return nil
}
func setKernelMemory(path string, kernelMemoryLimit int64) error {
if path == "" {
return fmt.Errorf("no such directory for %s", cgroupKernelMemoryLimit)
}
if !cgroups.PathExists(filepath.Join(path, cgroupKernelMemoryLimit)) {
// We have specifically been asked to set a kmem limit. If the kernel
// doesn't support it we *must* error out.
return errors.New("kernel memory accounting not supported by this kernel")
}
if err := fscommon.WriteFile(path, cgroupKernelMemoryLimit, strconv.FormatInt(kernelMemoryLimit, 10)); err != nil {
// Check if the error number returned by the syscall is "EBUSY"
// The EBUSY signal is returned on attempts to write to the
// memory.kmem.limit_in_bytes file if the cgroup has children or
// once tasks have been attached to the cgroup
if errors.Is(err, unix.EBUSY) {
return fmt.Errorf("failed to set %s, because either tasks have already joined this cgroup or it has children", cgroupKernelMemoryLimit)
}
return err
}
return nil
}

View File

@@ -0,0 +1,15 @@
// +build linux,nokmem
package fs
import (
"errors"
)
func EnableKernelMemoryAccounting(path string) error {
return nil
}
func setKernelMemory(path string, kernelMemoryLimit int64) error {
return errors.New("kernel memory accounting disabled in this runc build")
}

View File

@@ -14,15 +14,11 @@ import (
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/pkg/errors"
"golang.org/x/sys/unix"
)
const (
cgroupMemorySwapLimit = "memory.memsw.limit_in_bytes"
cgroupMemoryLimit = "memory.limit_in_bytes"
cgroupMemoryUsage = "memory.usage_in_bytes"
cgroupMemoryMaxUsage = "memory.max_usage_in_bytes"
)
type MemoryGroup struct {
@@ -33,55 +29,48 @@ func (s *MemoryGroup) Name() string {
}
func (s *MemoryGroup) Apply(path string, d *cgroupData) (err error) {
if path == "" {
return nil
}
if memoryAssigned(d.config) {
if _, err := os.Stat(path); os.IsNotExist(err) {
if err := os.MkdirAll(path, 0755); err != nil {
return err
}
// Only enable kernel memory accouting when this cgroup
// is created by libcontainer, otherwise we might get
// error when people use `cgroupsPath` to join an existed
// cgroup whose kernel memory is not initialized.
if err := EnableKernelMemoryAccounting(path); err != nil {
return err
}
}
}
defer func() {
if err != nil {
os.RemoveAll(path)
}
}()
// We need to join memory cgroup after set memory limits, because
// kmem.limit_in_bytes can only be set when the cgroup is empty.
return join(path, d.pid)
}
func setMemory(path string, val int64) error {
if val == 0 {
return nil
}
err := fscommon.WriteFile(path, cgroupMemoryLimit, strconv.FormatInt(val, 10))
if !errors.Is(err, unix.EBUSY) {
return err
}
// EBUSY means the kernel can't set new limit as it's too low
// (lower than the current usage). Return more specific error.
usage, err := fscommon.GetCgroupParamUint(path, cgroupMemoryUsage)
if err != nil {
return err
}
max, err := fscommon.GetCgroupParamUint(path, cgroupMemoryMaxUsage)
if err != nil {
return err
}
return errors.Errorf("unable to set memory limit to %d (current usage: %d, peak usage: %d)", val, usage, max)
}
func setSwap(path string, val int64) error {
if val == 0 {
return nil
}
return fscommon.WriteFile(path, cgroupMemorySwapLimit, strconv.FormatInt(val, 10))
}
func setMemoryAndSwap(path string, r *configs.Resources) error {
func setMemoryAndSwap(path string, cgroup *configs.Cgroup) error {
// If the memory update is set to -1 and the swap is not explicitly
// set, we should also set swap to -1, it means unlimited memory.
if r.Memory == -1 && r.MemorySwap == 0 {
if cgroup.Resources.Memory == -1 && cgroup.Resources.MemorySwap == 0 {
// Only set swap if it's enabled in kernel
if cgroups.PathExists(filepath.Join(path, cgroupMemorySwapLimit)) {
r.MemorySwap = -1
cgroup.Resources.MemorySwap = -1
}
}
// When memory and swap memory are both set, we need to handle the cases
// for updating container.
if r.Memory != 0 && r.MemorySwap != 0 {
curLimit, err := fscommon.GetCgroupParamUint(path, cgroupMemoryLimit)
if cgroup.Resources.Memory != 0 && cgroup.Resources.MemorySwap != 0 {
memoryUsage, err := getMemoryData(path, "")
if err != nil {
return err
}
@@ -89,53 +78,72 @@ func setMemoryAndSwap(path string, r *configs.Resources) error {
// When update memory limit, we should adapt the write sequence
// for memory and swap memory, so it won't fail because the new
// value and the old value don't fit kernel's validation.
if r.MemorySwap == -1 || curLimit < uint64(r.MemorySwap) {
if err := setSwap(path, r.MemorySwap); err != nil {
if cgroup.Resources.MemorySwap == -1 || memoryUsage.Limit < uint64(cgroup.Resources.MemorySwap) {
if err := fscommon.WriteFile(path, cgroupMemorySwapLimit, strconv.FormatInt(cgroup.Resources.MemorySwap, 10)); err != nil {
return err
}
if err := setMemory(path, r.Memory); err != nil {
if err := fscommon.WriteFile(path, cgroupMemoryLimit, strconv.FormatInt(cgroup.Resources.Memory, 10)); err != nil {
return err
}
} else {
if err := fscommon.WriteFile(path, cgroupMemoryLimit, strconv.FormatInt(cgroup.Resources.Memory, 10)); err != nil {
return err
}
if err := fscommon.WriteFile(path, cgroupMemorySwapLimit, strconv.FormatInt(cgroup.Resources.MemorySwap, 10)); err != nil {
return err
}
}
} else {
if cgroup.Resources.Memory != 0 {
if err := fscommon.WriteFile(path, cgroupMemoryLimit, strconv.FormatInt(cgroup.Resources.Memory, 10)); err != nil {
return err
}
}
if cgroup.Resources.MemorySwap != 0 {
if err := fscommon.WriteFile(path, cgroupMemorySwapLimit, strconv.FormatInt(cgroup.Resources.MemorySwap, 10)); err != nil {
return err
}
return nil
}
}
if err := setMemory(path, r.Memory); err != nil {
return err
}
if err := setSwap(path, r.MemorySwap); err != nil {
return err
}
return nil
}
func (s *MemoryGroup) Set(path string, r *configs.Resources) error {
if err := setMemoryAndSwap(path, r); err != nil {
func (s *MemoryGroup) Set(path string, cgroup *configs.Cgroup) error {
if err := setMemoryAndSwap(path, cgroup); err != nil {
return err
}
// ignore KernelMemory and KernelMemoryTCP
if r.MemoryReservation != 0 {
if err := fscommon.WriteFile(path, "memory.soft_limit_in_bytes", strconv.FormatInt(r.MemoryReservation, 10)); err != nil {
if cgroup.Resources.KernelMemory != 0 {
if err := setKernelMemory(path, cgroup.Resources.KernelMemory); err != nil {
return err
}
}
if r.OomKillDisable {
if cgroup.Resources.MemoryReservation != 0 {
if err := fscommon.WriteFile(path, "memory.soft_limit_in_bytes", strconv.FormatInt(cgroup.Resources.MemoryReservation, 10)); err != nil {
return err
}
}
if cgroup.Resources.KernelMemoryTCP != 0 {
if err := fscommon.WriteFile(path, "memory.kmem.tcp.limit_in_bytes", strconv.FormatInt(cgroup.Resources.KernelMemoryTCP, 10)); err != nil {
return err
}
}
if cgroup.Resources.OomKillDisable {
if err := fscommon.WriteFile(path, "memory.oom_control", "1"); err != nil {
return err
}
}
if r.MemorySwappiness == nil || int64(*r.MemorySwappiness) == -1 {
if cgroup.Resources.MemorySwappiness == nil || int64(*cgroup.Resources.MemorySwappiness) == -1 {
return nil
} else if *r.MemorySwappiness <= 100 {
if err := fscommon.WriteFile(path, "memory.swappiness", strconv.FormatUint(*r.MemorySwappiness, 10)); err != nil {
} else if *cgroup.Resources.MemorySwappiness <= 100 {
if err := fscommon.WriteFile(path, "memory.swappiness", strconv.FormatUint(*cgroup.Resources.MemorySwappiness, 10)); err != nil {
return err
}
} else {
return fmt.Errorf("invalid value:%d. valid memory swappiness range is 0-100", *r.MemorySwappiness)
return fmt.Errorf("invalid value:%d. valid memory swappiness range is 0-100", *cgroup.Resources.MemorySwappiness)
}
return nil
@@ -154,7 +162,7 @@ func (s *MemoryGroup) GetStats(path string, stats *cgroups.Stats) error {
sc := bufio.NewScanner(statsFile)
for sc.Scan() {
t, v, err := fscommon.ParseKeyValue(sc.Text())
t, v, err := fscommon.GetCgroupParamKeyValue(sc.Text())
if err != nil {
return fmt.Errorf("failed to parse memory.stat (%q) - %v", sc.Text(), err)
}
@@ -204,6 +212,8 @@ func memoryAssigned(cgroup *configs.Cgroup) bool {
return cgroup.Resources.Memory != 0 ||
cgroup.Resources.MemoryReservation != 0 ||
cgroup.Resources.MemorySwap > 0 ||
cgroup.Resources.KernelMemory > 0 ||
cgroup.Resources.KernelMemoryTCP > 0 ||
cgroup.Resources.OomKillDisable ||
(cgroup.Resources.MemorySwappiness != nil && int64(*cgroup.Resources.MemorySwappiness) != -1)
}
@@ -224,9 +234,7 @@ func getMemoryData(path, name string) (cgroups.MemoryData, error) {
value, err := fscommon.GetCgroupParamUint(path, usage)
if err != nil {
if name != "" && os.IsNotExist(err) {
// Ignore ENOENT as swap and kmem controllers
// are optional in the kernel.
if moduleName != "memory" && os.IsNotExist(err) {
return cgroups.MemoryData{}, nil
}
return cgroups.MemoryData{}, fmt.Errorf("failed to parse %s - %v", usage, err)
@@ -234,16 +242,25 @@ func getMemoryData(path, name string) (cgroups.MemoryData, error) {
memoryData.Usage = value
value, err = fscommon.GetCgroupParamUint(path, maxUsage)
if err != nil {
if moduleName != "memory" && os.IsNotExist(err) {
return cgroups.MemoryData{}, nil
}
return cgroups.MemoryData{}, fmt.Errorf("failed to parse %s - %v", maxUsage, err)
}
memoryData.MaxUsage = value
value, err = fscommon.GetCgroupParamUint(path, failcnt)
if err != nil {
if moduleName != "memory" && os.IsNotExist(err) {
return cgroups.MemoryData{}, nil
}
return cgroups.MemoryData{}, fmt.Errorf("failed to parse %s - %v", failcnt, err)
}
memoryData.Failcnt = value
value, err = fscommon.GetCgroupParamUint(path, limit)
if err != nil {
if moduleName != "memory" && os.IsNotExist(err) {
return cgroups.MemoryData{}, nil
}
return cgroups.MemoryData{}, fmt.Errorf("failed to parse %s - %v", limit, err)
}
memoryData.Limit = value

View File

@@ -24,7 +24,7 @@ func (s *NameGroup) Apply(path string, d *cgroupData) error {
return nil
}
func (s *NameGroup) Set(_ string, _ *configs.Resources) error {
func (s *NameGroup) Set(path string, cgroup *configs.Cgroup) error {
return nil
}

View File

@@ -21,9 +21,9 @@ func (s *NetClsGroup) Apply(path string, d *cgroupData) error {
return join(path, d.pid)
}
func (s *NetClsGroup) Set(path string, r *configs.Resources) error {
if r.NetClsClassid != 0 {
if err := fscommon.WriteFile(path, "net_cls.classid", strconv.FormatUint(uint64(r.NetClsClassid), 10)); err != nil {
func (s *NetClsGroup) Set(path string, cgroup *configs.Cgroup) error {
if cgroup.Resources.NetClsClassid != 0 {
if err := fscommon.WriteFile(path, "net_cls.classid", strconv.FormatUint(uint64(cgroup.Resources.NetClsClassid), 10)); err != nil {
return err
}
}

View File

@@ -19,8 +19,8 @@ func (s *NetPrioGroup) Apply(path string, d *cgroupData) error {
return join(path, d.pid)
}
func (s *NetPrioGroup) Set(path string, r *configs.Resources) error {
for _, prioMap := range r.NetPrioIfpriomap {
func (s *NetPrioGroup) Set(path string, cgroup *configs.Cgroup) error {
for _, prioMap := range cgroup.Resources.NetPrioIfpriomap {
if err := fscommon.WriteFile(path, "net_prio.ifpriomap", prioMap.CgroupString()); err != nil {
return err
}

View File

@@ -18,7 +18,7 @@ func (s *PerfEventGroup) Apply(path string, d *cgroupData) error {
return join(path, d.pid)
}
func (s *PerfEventGroup) Set(_ string, _ *configs.Resources) error {
func (s *PerfEventGroup) Set(path string, cgroup *configs.Cgroup) error {
return nil
}

View File

@@ -23,13 +23,13 @@ func (s *PidsGroup) Apply(path string, d *cgroupData) error {
return join(path, d.pid)
}
func (s *PidsGroup) Set(path string, r *configs.Resources) error {
if r.PidsLimit != 0 {
func (s *PidsGroup) Set(path string, cgroup *configs.Cgroup) error {
if cgroup.Resources.PidsLimit != 0 {
// "max" is the fallback value.
limit := "max"
if r.PidsLimit > 0 {
limit = strconv.FormatInt(r.PidsLimit, 10)
if cgroup.Resources.PidsLimit > 0 {
limit = strconv.FormatInt(cgroup.Resources.PidsLimit, 10)
}
if err := fscommon.WriteFile(path, "pids.max", limit); err != nil {

View File

@@ -12,14 +12,15 @@ import (
"github.com/opencontainers/runc/libcontainer/configs"
)
func isCpuSet(r *configs.Resources) bool {
return r.CpuWeight != 0 || r.CpuQuota != 0 || r.CpuPeriod != 0
func isCpuSet(cgroup *configs.Cgroup) bool {
return cgroup.Resources.CpuWeight != 0 || cgroup.Resources.CpuQuota != 0 || cgroup.Resources.CpuPeriod != 0
}
func setCpu(dirPath string, r *configs.Resources) error {
if !isCpuSet(r) {
func setCpu(dirPath string, cgroup *configs.Cgroup) error {
if !isCpuSet(cgroup) {
return nil
}
r := cgroup.Resources
// NOTE: .CpuShares is not used here. Conversion is the caller's responsibility.
if r.CpuWeight != 0 {
@@ -56,7 +57,7 @@ func statCpu(dirPath string, stats *cgroups.Stats) error {
sc := bufio.NewScanner(f)
for sc.Scan() {
t, v, err := fscommon.ParseKeyValue(sc.Text())
t, v, err := fscommon.GetCgroupParamKeyValue(sc.Text())
if err != nil {
return err
}
@@ -69,15 +70,6 @@ func statCpu(dirPath string, stats *cgroups.Stats) error {
case "system_usec":
stats.CpuStats.CpuUsage.UsageInKernelmode = v * 1000
case "nr_periods":
stats.CpuStats.ThrottlingData.Periods = v
case "nr_throttled":
stats.CpuStats.ThrottlingData.ThrottledPeriods = v
case "throttled_usec":
stats.CpuStats.ThrottlingData.ThrottledTime = v * 1000
}
}
return nil

View File

@@ -7,22 +7,22 @@ import (
"github.com/opencontainers/runc/libcontainer/configs"
)
func isCpusetSet(r *configs.Resources) bool {
return r.CpusetCpus != "" || r.CpusetMems != ""
func isCpusetSet(cgroup *configs.Cgroup) bool {
return cgroup.Resources.CpusetCpus != "" || cgroup.Resources.CpusetMems != ""
}
func setCpuset(dirPath string, r *configs.Resources) error {
if !isCpusetSet(r) {
func setCpuset(dirPath string, cgroup *configs.Cgroup) error {
if !isCpusetSet(cgroup) {
return nil
}
if r.CpusetCpus != "" {
if err := fscommon.WriteFile(dirPath, "cpuset.cpus", r.CpusetCpus); err != nil {
if cgroup.Resources.CpusetCpus != "" {
if err := fscommon.WriteFile(dirPath, "cpuset.cpus", cgroup.Resources.CpusetCpus); err != nil {
return err
}
}
if r.CpusetMems != "" {
if err := fscommon.WriteFile(dirPath, "cpuset.mems", r.CpusetMems); err != nil {
if cgroup.Resources.CpusetMems != "" {
if err := fscommon.WriteFile(dirPath, "cpuset.mems", cgroup.Resources.CpusetMems); err != nil {
return err
}
}

View File

@@ -10,7 +10,7 @@ import (
"github.com/opencontainers/runc/libcontainer/configs"
)
func supportedControllers() (string, error) {
func supportedControllers(cgroup *configs.Cgroup) (string, error) {
return fscommon.ReadFile(UnifiedMountpoint, "/cgroup.controllers")
}
@@ -18,13 +18,13 @@ func supportedControllers() (string, error) {
// based on (1) controllers available and (2) resources that are being set.
// We don't check "pseudo" controllers such as
// "freezer" and "devices".
func needAnyControllers(r *configs.Resources) (bool, error) {
if r == nil {
func needAnyControllers(cgroup *configs.Cgroup) (bool, error) {
if cgroup == nil {
return false, nil
}
// list of all available controllers
content, err := supportedControllers()
content, err := supportedControllers(cgroup)
if err != nil {
return false, err
}
@@ -39,22 +39,22 @@ func needAnyControllers(r *configs.Resources) (bool, error) {
return ok
}
if isPidsSet(r) && have("pids") {
if isPidsSet(cgroup) && have("pids") {
return true, nil
}
if isMemorySet(r) && have("memory") {
if isMemorySet(cgroup) && have("memory") {
return true, nil
}
if isIoSet(r) && have("io") {
if isIoSet(cgroup) && have("io") {
return true, nil
}
if isCpuSet(r) && have("cpu") {
if isCpuSet(cgroup) && have("cpu") {
return true, nil
}
if isCpusetSet(r) && have("cpuset") {
if isCpusetSet(cgroup) && have("cpuset") {
return true, nil
}
if isHugeTlbSet(r) && have("hugetlb") {
if isHugeTlbSet(cgroup) && have("hugetlb") {
return true, nil
}
@@ -64,8 +64,8 @@ func needAnyControllers(r *configs.Resources) (bool, error) {
// containsDomainController returns whether the current config contains domain controller or not.
// Refer to: http://man7.org/linux/man-pages/man7/cgroups.7.html
// As at Linux 4.19, the following controllers are threaded: cpu, perf_event, and pids.
func containsDomainController(r *configs.Resources) bool {
return isMemorySet(r) || isIoSet(r) || isCpuSet(r) || isHugeTlbSet(r)
func containsDomainController(cg *configs.Cgroup) bool {
return isMemorySet(cg) || isIoSet(cg) || isCpuSet(cg) || isHugeTlbSet(cg)
}
// CreateCgroupPath creates cgroupv2 path, enabling all the supported controllers.
@@ -74,7 +74,7 @@ func CreateCgroupPath(path string, c *configs.Cgroup) (Err error) {
return fmt.Errorf("invalid cgroup path %s", path)
}
content, err := supportedControllers()
content, err := supportedControllers(c)
if err != nil {
return err
}
@@ -115,7 +115,7 @@ func CreateCgroupPath(path string, c *configs.Cgroup) (Err error) {
// the controllers requested are thread-aware we can simply put the cgroup into
// threaded mode.
case "domain invalid":
if containsDomainController(c.Resources) {
if containsDomainController(c) {
return fmt.Errorf("cannot enter cgroupv2 %q with domain controllers -- it is in an invalid state", current)
} else {
// Not entirely correct (in theory we'd always want to be a domain --
@@ -129,7 +129,7 @@ func CreateCgroupPath(path string, c *configs.Cgroup) (Err error) {
case "domain threaded":
fallthrough
case "threaded":
if containsDomainController(c.Resources) {
if containsDomainController(c) {
return fmt.Errorf("cannot enter cgroupv2 %q with domain controllers -- it is in %s mode", current, cgType)
}
}

View File

@@ -7,8 +7,6 @@ import (
"github.com/opencontainers/runc/libcontainer/cgroups/ebpf/devicefilter"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/devices"
"github.com/opencontainers/runc/libcontainer/userns"
"github.com/pkg/errors"
"golang.org/x/sys/unix"
)
@@ -28,40 +26,26 @@ func isRWM(perms devices.Permissions) bool {
return r && w && m
}
// This is similar to the logic applied in crun for handling errors from bpf(2)
// <https://github.com/containers/crun/blob/0.17/src/libcrun/cgroup.c#L2438-L2470>.
func canSkipEBPFError(r *configs.Resources) bool {
// If we're running in a user namespace we can ignore eBPF rules because we
// usually cannot use bpf(2), as well as rootless containers usually don't
// have the necessary privileges to mknod(2) device inodes or access
// host-level instances (though ideally we would be blocking device access
// for rootless containers anyway).
if userns.RunningInUserNS() {
return true
}
// We cannot ignore an eBPF load error if any rule if is a block rule or it
// doesn't permit all access modes.
//
// NOTE: This will sometimes trigger in cases where access modes are split
// between different rules but to handle this correctly would require
// using ".../libcontainer/cgroup/devices".Emulator.
for _, dev := range r.Devices {
if !dev.Allow || !isRWM(dev.Permissions) {
// the logic is from crun
// https://github.com/containers/crun/blob/0.10.2/src/libcrun/cgroup.c#L1644-L1652
func canSkipEBPFError(cgroup *configs.Cgroup) bool {
for _, dev := range cgroup.Resources.Devices {
if dev.Allow || !isRWM(dev.Permissions) {
return false
}
}
return true
}
func setDevices(dirPath string, r *configs.Resources) error {
if r.SkipDevices {
func setDevices(dirPath string, cgroup *configs.Cgroup) error {
if cgroup.SkipDevices {
return nil
}
// XXX: This is currently a white-list (but all callers pass a blacklist of
// devices). This is bad for a whole variety of reasons, but will need
// to be fixed with co-ordinated effort with downstreams.
insts, license, err := devicefilter.DeviceFilter(r.Devices)
devices := cgroup.Devices
insts, license, err := devicefilter.DeviceFilter(devices)
if err != nil {
return err
}
@@ -82,7 +66,7 @@ func setDevices(dirPath string, r *configs.Resources) error {
// programs. You could temporarily insert a deny-everything program
// but that would result in spurrious failures during updates.
if _, err := ebpf.LoadAttachCgroupDeviceFilter(insts, license, dirFD); err != nil {
if !canSkipEBPFError(r) {
if !canSkipEBPFError(cgroup) {
return err
}
}

View File

@@ -75,7 +75,7 @@ func (m *manager) Apply(pid int) error {
// - "runc create (rootless + limits + no cgrouppath + no permission) fails with informative error"
if m.rootless {
if m.config.Path == "" {
if blNeed, nErr := needAnyControllers(m.config.Resources); nErr == nil && !blNeed {
if blNeed, nErr := needAnyControllers(m.config); nErr == nil && !blNeed {
return nil
}
return errors.Wrap(err, "rootless needs no limits + no cgrouppath when no permission is granted for cgroups")
@@ -103,27 +103,43 @@ func (m *manager) GetStats() (*cgroups.Stats, error) {
)
st := cgroups.NewStats()
if err := m.getControllers(); err != nil {
return st, err
}
// pids (since kernel 4.5)
if err := statPids(m.dirPath, st); err != nil {
errs = append(errs, err)
if _, ok := m.controllers["pids"]; ok {
if err := statPids(m.dirPath, st); err != nil {
errs = append(errs, err)
}
} else {
if err := statPidsWithoutController(m.dirPath, st); err != nil {
errs = append(errs, err)
}
}
// memory (since kernel 4.5)
if err := statMemory(m.dirPath, st); err != nil && !os.IsNotExist(err) {
errs = append(errs, err)
if _, ok := m.controllers["memory"]; ok {
if err := statMemory(m.dirPath, st); err != nil {
errs = append(errs, err)
}
}
// io (since kernel 4.5)
if err := statIo(m.dirPath, st); err != nil && !os.IsNotExist(err) {
errs = append(errs, err)
if _, ok := m.controllers["io"]; ok {
if err := statIo(m.dirPath, st); err != nil {
errs = append(errs, err)
}
}
// cpu (since kernel 4.15)
// Note cpu.stat is available even if the controller is not enabled.
if err := statCpu(m.dirPath, st); err != nil && !os.IsNotExist(err) {
errs = append(errs, err)
if _, ok := m.controllers["cpu"]; ok {
if err := statCpu(m.dirPath, st); err != nil {
errs = append(errs, err)
}
}
// hugetlb (since kernel 5.6)
if err := statHugeTlb(m.dirPath, st); err != nil && !os.IsNotExist(err) {
errs = append(errs, err)
if _, ok := m.controllers["hugetlb"]; ok {
if err := statHugeTlb(m.dirPath, st); err != nil {
errs = append(errs, err)
}
}
if len(errs) > 0 && !m.rootless {
return st, errors.Errorf("error while statting cgroup v2: %+v", errs)
@@ -147,50 +163,53 @@ func (m *manager) Path(_ string) string {
return m.dirPath
}
func (m *manager) Set(r *configs.Resources) error {
func (m *manager) Set(container *configs.Config) error {
if container == nil || container.Cgroups == nil {
return nil
}
if err := m.getControllers(); err != nil {
return err
}
// pids (since kernel 4.5)
if err := setPids(m.dirPath, r); err != nil {
if err := setPids(m.dirPath, container.Cgroups); err != nil {
return err
}
// memory (since kernel 4.5)
if err := setMemory(m.dirPath, r); err != nil {
if err := setMemory(m.dirPath, container.Cgroups); err != nil {
return err
}
// io (since kernel 4.5)
if err := setIo(m.dirPath, r); err != nil {
if err := setIo(m.dirPath, container.Cgroups); err != nil {
return err
}
// cpu (since kernel 4.15)
if err := setCpu(m.dirPath, r); err != nil {
if err := setCpu(m.dirPath, container.Cgroups); err != nil {
return err
}
// devices (since kernel 4.15, pseudo-controller)
//
// When m.rootless is true, errors from the device subsystem are ignored because it is really not expected to work.
// When m.Rootless is true, errors from the device subsystem are ignored because it is really not expected to work.
// However, errors from other subsystems are not ignored.
// see @test "runc create (rootless + limits + no cgrouppath + no permission) fails with informative error"
if err := setDevices(m.dirPath, r); err != nil && !m.rootless {
if err := setDevices(m.dirPath, container.Cgroups); err != nil && !m.rootless {
return err
}
// cpuset (since kernel 5.0)
if err := setCpuset(m.dirPath, r); err != nil {
if err := setCpuset(m.dirPath, container.Cgroups); err != nil {
return err
}
// hugetlb (since kernel 5.6)
if err := setHugeTlb(m.dirPath, r); err != nil {
if err := setHugeTlb(m.dirPath, container.Cgroups); err != nil {
return err
}
// freezer (since kernel 5.2, pseudo-controller)
if err := setFreezer(m.dirPath, r.Freezer); err != nil {
if err := setFreezer(m.dirPath, container.Cgroups.Freezer); err != nil {
return err
}
if err := m.setUnified(r.Unified); err != nil {
if err := m.setUnified(container.Cgroups.Unified); err != nil {
return err
}
m.config.Resources = r
m.config = container.Cgroups
return nil
}
@@ -238,16 +257,3 @@ func (m *manager) GetFreezerState() (configs.FreezerState, error) {
func (m *manager) Exists() bool {
return cgroups.PathExists(m.dirPath)
}
func OOMKillCount(path string) (uint64, error) {
return fscommon.GetValueByKey(path, "memory.events", "oom_kill")
}
func (m *manager) OOMKillCount() (uint64, error) {
c, err := OOMKillCount(m.dirPath)
if err != nil && m.rootless && os.IsNotExist(err) {
err = nil
}
return c, err
}

View File

@@ -12,15 +12,15 @@ import (
"github.com/opencontainers/runc/libcontainer/configs"
)
func isHugeTlbSet(r *configs.Resources) bool {
return len(r.HugetlbLimit) > 0
func isHugeTlbSet(cgroup *configs.Cgroup) bool {
return len(cgroup.Resources.HugetlbLimit) > 0
}
func setHugeTlb(dirPath string, r *configs.Resources) error {
if !isHugeTlbSet(r) {
func setHugeTlb(dirPath string, cgroup *configs.Cgroup) error {
if !isHugeTlbSet(cgroup) {
return nil
}
for _, hugetlb := range r.HugetlbLimit {
for _, hugetlb := range cgroup.Resources.HugetlbLimit {
if err := fscommon.WriteFile(dirPath, "hugetlb."+hugetlb.Pagesize+".max", strconv.FormatUint(hugetlb.Limit, 10)); err != nil {
return err
}
@@ -44,10 +44,14 @@ func statHugeTlb(dirPath string, stats *cgroups.Stats) error {
hugetlbStats.Usage = value
fileName := "hugetlb." + pagesize + ".events"
value, err = fscommon.GetValueByKey(dirPath, fileName, "max")
contents, err := fscommon.ReadFile(dirPath, fileName)
if err != nil {
return errors.Wrap(err, "failed to read stats")
}
_, value, err = fscommon.GetCgroupParamKeyValue(contents)
if err != nil {
return errors.Wrap(err, "failed to parse "+fileName)
}
hugetlbStats.Failcnt = value
stats.HugetlbStats[pagesize] = hugetlbStats

View File

@@ -13,50 +13,42 @@ import (
"github.com/opencontainers/runc/libcontainer/configs"
)
func isIoSet(r *configs.Resources) bool {
return r.BlkioWeight != 0 ||
len(r.BlkioThrottleReadBpsDevice) > 0 ||
len(r.BlkioThrottleWriteBpsDevice) > 0 ||
len(r.BlkioThrottleReadIOPSDevice) > 0 ||
len(r.BlkioThrottleWriteIOPSDevice) > 0
func isIoSet(cgroup *configs.Cgroup) bool {
return cgroup.Resources.BlkioWeight != 0 ||
len(cgroup.Resources.BlkioThrottleReadBpsDevice) > 0 ||
len(cgroup.Resources.BlkioThrottleWriteBpsDevice) > 0 ||
len(cgroup.Resources.BlkioThrottleReadIOPSDevice) > 0 ||
len(cgroup.Resources.BlkioThrottleWriteIOPSDevice) > 0
}
func setIo(dirPath string, r *configs.Resources) error {
if !isIoSet(r) {
func setIo(dirPath string, cgroup *configs.Cgroup) error {
if !isIoSet(cgroup) {
return nil
}
if r.BlkioWeight != 0 {
if cgroup.Resources.BlkioWeight != 0 {
filename := "io.bfq.weight"
if err := fscommon.WriteFile(dirPath, filename,
strconv.FormatUint(uint64(r.BlkioWeight), 10)); err != nil {
// if io.bfq.weight does not exist, then bfq module is not loaded.
// Fallback to use io.weight with a conversion scheme
if !os.IsNotExist(err) {
return err
}
v := cgroups.ConvertBlkIOToIOWeightValue(r.BlkioWeight)
if err := fscommon.WriteFile(dirPath, "io.weight", strconv.FormatUint(v, 10)); err != nil {
return err
}
strconv.FormatUint(cgroups.ConvertBlkIOToCgroupV2Value(cgroup.Resources.BlkioWeight), 10)); err != nil {
return err
}
}
for _, td := range r.BlkioThrottleReadBpsDevice {
for _, td := range cgroup.Resources.BlkioThrottleReadBpsDevice {
if err := fscommon.WriteFile(dirPath, "io.max", td.StringName("rbps")); err != nil {
return err
}
}
for _, td := range r.BlkioThrottleWriteBpsDevice {
for _, td := range cgroup.Resources.BlkioThrottleWriteBpsDevice {
if err := fscommon.WriteFile(dirPath, "io.max", td.StringName("wbps")); err != nil {
return err
}
}
for _, td := range r.BlkioThrottleReadIOPSDevice {
for _, td := range cgroup.Resources.BlkioThrottleReadIOPSDevice {
if err := fscommon.WriteFile(dirPath, "io.max", td.StringName("riops")); err != nil {
return err
}
}
for _, td := range r.BlkioThrottleWriteIOPSDevice {
for _, td := range cgroup.Resources.BlkioThrottleWriteIOPSDevice {
if err := fscommon.WriteFile(dirPath, "io.max", td.StringName("wiops")); err != nil {
return err
}

View File

@@ -4,16 +4,13 @@ package fs2
import (
"bufio"
"math"
"os"
"strconv"
"strings"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/pkg/errors"
"golang.org/x/sys/unix"
)
// numToStr converts an int64 value to a string for writing to a
@@ -33,20 +30,21 @@ func numToStr(value int64) (ret string) {
return ret
}
func isMemorySet(r *configs.Resources) bool {
return r.MemoryReservation != 0 || r.Memory != 0 || r.MemorySwap != 0
func isMemorySet(cgroup *configs.Cgroup) bool {
return cgroup.Resources.MemoryReservation != 0 ||
cgroup.Resources.Memory != 0 || cgroup.Resources.MemorySwap != 0
}
func setMemory(dirPath string, r *configs.Resources) error {
if !isMemorySet(r) {
func setMemory(dirPath string, cgroup *configs.Cgroup) error {
if !isMemorySet(cgroup) {
return nil
}
swap, err := cgroups.ConvertMemorySwapToCgroupV2Value(r.MemorySwap, r.Memory)
swap, err := cgroups.ConvertMemorySwapToCgroupV2Value(cgroup.Resources.MemorySwap, cgroup.Resources.Memory)
if err != nil {
return err
}
swapStr := numToStr(swap)
if swapStr == "" && swap == 0 && r.MemorySwap > 0 {
if swapStr == "" && swap == 0 && cgroup.Resources.MemorySwap > 0 {
// memory and memorySwap set to the same value -- disable swap
swapStr = "0"
}
@@ -57,7 +55,7 @@ func setMemory(dirPath string, r *configs.Resources) error {
}
}
if val := numToStr(r.Memory); val != "" {
if val := numToStr(cgroup.Resources.Memory); val != "" {
if err := fscommon.WriteFile(dirPath, "memory.max", val); err != nil {
return err
}
@@ -65,7 +63,7 @@ func setMemory(dirPath string, r *configs.Resources) error {
// cgroup.Resources.KernelMemory is ignored
if val := numToStr(r.MemoryReservation); val != "" {
if val := numToStr(cgroup.Resources.MemoryReservation); val != "" {
if err := fscommon.WriteFile(dirPath, "memory.low", val); err != nil {
return err
}
@@ -84,24 +82,16 @@ func statMemory(dirPath string, stats *cgroups.Stats) error {
sc := bufio.NewScanner(statsFile)
for sc.Scan() {
t, v, err := fscommon.ParseKeyValue(sc.Text())
t, v, err := fscommon.GetCgroupParamKeyValue(sc.Text())
if err != nil {
return errors.Wrapf(err, "failed to parse memory.stat (%q)", sc.Text())
}
stats.MemoryStats.Stats[t] = v
}
stats.MemoryStats.Cache = stats.MemoryStats.Stats["file"]
// Unlike cgroup v1 which has memory.use_hierarchy binary knob,
// cgroup v2 is always hierarchical.
stats.MemoryStats.UseHierarchy = true
stats.MemoryStats.Cache = stats.MemoryStats.Stats["cache"]
memoryUsage, err := getMemoryDataV2(dirPath, "")
if err != nil {
if errors.Is(err, unix.ENOENT) && dirPath == UnifiedMountpoint {
// The root cgroup does not have memory.{current,max}
// so emulate those using data from /proc/meminfo.
return statsFromMeminfo(stats)
}
return err
}
stats.MemoryStats.Usage = memoryUsage
@@ -109,15 +99,9 @@ func statMemory(dirPath string, stats *cgroups.Stats) error {
if err != nil {
return err
}
// As cgroup v1 reports SwapUsage values as mem+swap combined,
// while in cgroup v2 swap values do not include memory,
// report combined mem+swap for v1 compatibility.
swapUsage.Usage += memoryUsage.Usage
if swapUsage.Limit != math.MaxUint64 {
swapUsage.Limit += memoryUsage.Limit
}
stats.MemoryStats.SwapUsage = swapUsage
stats.MemoryStats.UseHierarchy = true
return nil
}
@@ -133,10 +117,7 @@ func getMemoryDataV2(path, name string) (cgroups.MemoryData, error) {
value, err := fscommon.GetCgroupParamUint(path, usage)
if err != nil {
if name != "" && os.IsNotExist(err) {
// Ignore EEXIST as there's no swap accounting
// if kernel CONFIG_MEMCG_SWAP is not set or
// swapaccount=0 kernel boot parameter is given.
if moduleName != "memory" && os.IsNotExist(err) {
return cgroups.MemoryData{}, nil
}
return cgroups.MemoryData{}, errors.Wrapf(err, "failed to parse %s", usage)
@@ -145,69 +126,12 @@ func getMemoryDataV2(path, name string) (cgroups.MemoryData, error) {
value, err = fscommon.GetCgroupParamUint(path, limit)
if err != nil {
if moduleName != "memory" && os.IsNotExist(err) {
return cgroups.MemoryData{}, nil
}
return cgroups.MemoryData{}, errors.Wrapf(err, "failed to parse %s", limit)
}
memoryData.Limit = value
return memoryData, nil
}
func statsFromMeminfo(stats *cgroups.Stats) error {
f, err := os.Open("/proc/meminfo")
if err != nil {
return err
}
defer f.Close()
// Fields we are interested in.
var (
swap_free uint64
swap_total uint64
main_total uint64
main_free uint64
)
mem := map[string]*uint64{
"SwapFree": &swap_free,
"SwapTotal": &swap_total,
"MemTotal": &main_total,
"MemFree": &main_free,
}
found := 0
sc := bufio.NewScanner(f)
for sc.Scan() {
parts := strings.SplitN(sc.Text(), ":", 3)
if len(parts) != 2 {
// Should not happen.
continue
}
k := parts[0]
p, ok := mem[k]
if !ok {
// Unknown field -- not interested.
continue
}
vStr := strings.TrimSpace(strings.TrimSuffix(parts[1], " kB"))
*p, err = strconv.ParseUint(vStr, 10, 64)
if err != nil {
return errors.Wrap(err, "parsing /proc/meminfo "+k)
}
found++
if found == len(mem) {
// Got everything we need -- skip the rest.
break
}
}
if sc.Err() != nil {
return sc.Err()
}
stats.MemoryStats.SwapUsage.Usage = (swap_total - swap_free) * 1024
stats.MemoryStats.SwapUsage.Limit = math.MaxUint64
stats.MemoryStats.Usage.Usage = (main_total - main_free) * 1024
stats.MemoryStats.Usage.Limit = math.MaxUint64
return nil
}

View File

@@ -3,7 +3,6 @@
package fs2
import (
"os"
"path/filepath"
"strings"
@@ -14,15 +13,15 @@ import (
"golang.org/x/sys/unix"
)
func isPidsSet(r *configs.Resources) bool {
return r.PidsLimit != 0
func isPidsSet(cgroup *configs.Cgroup) bool {
return cgroup.Resources.PidsLimit != 0
}
func setPids(dirPath string, r *configs.Resources) error {
if !isPidsSet(r) {
func setPids(dirPath string, cgroup *configs.Cgroup) error {
if !isPidsSet(cgroup) {
return nil
}
if val := numToStr(r.PidsLimit); val != "" {
if val := numToStr(cgroup.Resources.PidsLimit); val != "" {
if err := fscommon.WriteFile(dirPath, "pids.max", val); err != nil {
return err
}
@@ -31,7 +30,7 @@ func setPids(dirPath string, r *configs.Resources) error {
return nil
}
func statPidsFromCgroupProcs(dirPath string, stats *cgroups.Stats) error {
func statPidsWithoutController(dirPath string, stats *cgroups.Stats) error {
// if the controller is not enabled, let's read PIDS from cgroups.procs
// (or threads if cgroup.threads is enabled)
contents, err := fscommon.ReadFile(dirPath, "cgroup.procs")
@@ -41,8 +40,13 @@ func statPidsFromCgroupProcs(dirPath string, stats *cgroups.Stats) error {
if err != nil {
return err
}
pids := strings.Count(contents, "\n")
stats.PidsStats.Current = uint64(pids)
pids := make(map[string]string)
for _, i := range strings.Split(contents, "\n") {
if i != "" {
pids[i] = i
}
}
stats.PidsStats.Current = uint64(len(pids))
stats.PidsStats.Limit = 0
return nil
}
@@ -50,9 +54,6 @@ func statPidsFromCgroupProcs(dirPath string, stats *cgroups.Stats) error {
func statPids(dirPath string, stats *cgroups.Stats) error {
current, err := fscommon.GetCgroupParamUint(dirPath, "pids.current")
if err != nil {
if os.IsNotExist(err) {
return statPidsFromCgroupProcs(dirPath, stats)
}
return errors.Wrap(err, "failed to parse pids.current")
}

View File

@@ -5,6 +5,7 @@ import (
"strings"
"sync"
securejoin "github.com/cyphar/filepath-securejoin"
"github.com/pkg/errors"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
@@ -16,7 +17,7 @@ const (
)
var (
// TestMode is set to true by unit tests that need "fake" cgroupfs.
// Set to true by fs unit tests
TestMode bool
cgroupFd int = -1
@@ -70,12 +71,12 @@ func OpenFile(dir, file string, flags int) (*os.File, error) {
flags |= os.O_TRUNC | os.O_CREATE
mode = 0o600
}
if prepareOpenat2() != nil {
return openFallback(dir, file, flags, mode)
}
reldir := strings.TrimPrefix(dir, cgroupfsPrefix)
if len(reldir) == len(dir) { // non-standard path, old system?
return openFallback(dir, file, flags, mode)
return openWithSecureJoin(dir, file, flags, mode)
}
if prepareOpenat2() != nil {
return openWithSecureJoin(dir, file, flags, mode)
}
relname := reldir + "/" + file
@@ -92,29 +93,11 @@ func OpenFile(dir, file string, flags int) (*os.File, error) {
return os.NewFile(uintptr(fd), cgroupfsPrefix+relname), nil
}
var errNotCgroupfs = errors.New("not a cgroup file")
// openFallback is used when openat2(2) is not available. It checks the opened
// file is on cgroupfs, returning an error otherwise.
func openFallback(dir, file string, flags int, mode os.FileMode) (*os.File, error) {
path := dir + "/" + file
fd, err := os.OpenFile(path, flags, mode)
func openWithSecureJoin(dir, file string, flags int, mode os.FileMode) (*os.File, error) {
path, err := securejoin.SecureJoin(dir, file)
if err != nil {
return nil, err
}
if TestMode {
return fd, nil
}
// Check this is a cgroupfs file.
var st unix.Statfs_t
if err := unix.Fstatfs(int(fd.Fd()), &st); err != nil {
_ = fd.Close()
return nil, &os.PathError{Op: "statfs", Path: path, Err: err}
}
if st.Type != unix.CGROUP_SUPER_MAGIC && st.Type != unix.CGROUP2_SUPER_MAGIC {
_ = fd.Close()
return nil, &os.PathError{Op: "open", Path: path, Err: errNotCgroupfs}
}
return fd, nil
return os.OpenFile(path, flags, mode)
}

View File

@@ -35,42 +35,22 @@ func ParseUint(s string, base, bitSize int) (uint64, error) {
return value, nil
}
// ParseKeyValue parses a space-separated "name value" kind of cgroup
// parameter and returns its key as a string, and its value as uint64
// (ParseUint is used to convert the value). For example,
// "io_service_bytes 1234" will be returned as "io_service_bytes", 1234.
func ParseKeyValue(t string) (string, uint64, error) {
parts := strings.SplitN(t, " ", 3)
if len(parts) != 2 {
return "", 0, fmt.Errorf("line %q is not in key value format", t)
}
value, err := ParseUint(parts[1], 10, 64)
if err != nil {
return "", 0, fmt.Errorf("unable to convert to uint64: %v", err)
}
return parts[0], value, nil
}
// GetValueByKey reads a key-value pairs from the specified cgroup file,
// and returns a value of the specified key. ParseUint is used for value
// conversion.
func GetValueByKey(path, file, key string) (uint64, error) {
content, err := ReadFile(path, file)
if err != nil {
return 0, err
}
lines := strings.Split(string(content), "\n")
for _, line := range lines {
arr := strings.Split(line, " ")
if len(arr) == 2 && arr[0] == key {
return ParseUint(arr[1], 10, 64)
// GetCgroupParamKeyValue parses a space-separated "name value" kind of cgroup
// parameter and returns its components. For example, "io_service_bytes 1234"
// will return as "io_service_bytes", 1234.
func GetCgroupParamKeyValue(t string) (string, uint64, error) {
parts := strings.Fields(t)
switch len(parts) {
case 2:
value, err := ParseUint(parts[1], 10, 64)
if err != nil {
return "", 0, fmt.Errorf("unable to convert to uint64: %v", err)
}
}
return 0, nil
return parts[0], value, nil
default:
return "", 0, ErrNotValidFormat
}
}
// GetCgroupParamUint reads a single uint64 value from the specified cgroup file.

View File

@@ -2,7 +2,6 @@ package systemd
import (
"bufio"
"context"
"fmt"
"math"
"os"
@@ -29,6 +28,10 @@ const (
)
var (
connOnce sync.Once
connDbus *systemdDbus.Conn
connErr error
versionOnce sync.Once
version int
@@ -288,6 +291,19 @@ func generateDeviceProperties(rules []*devices.Rule) ([]systemdDbus.Property, er
return properties, nil
}
// getDbusConnection lazy initializes systemd dbus connection
// and returns it
func getDbusConnection(rootless bool) (*systemdDbus.Conn, error) {
connOnce.Do(func() {
if rootless {
connDbus, connErr = NewUserSystemdDbus()
} else {
connDbus, connErr = systemdDbus.New()
}
})
return connDbus, connErr
}
func newProp(name string, units interface{}) systemdDbus.Property {
return systemdDbus.Property{
Name: name,
@@ -303,42 +319,32 @@ func getUnitName(c *configs.Cgroup) string {
return c.Name
}
// isDbusError returns true if the error is a specific dbus error.
func isDbusError(err error, name string) bool {
// isUnitExists returns true if the error is that a systemd unit already exists.
func isUnitExists(err error) bool {
if err != nil {
var derr *dbus.Error
if errors.As(err, &derr) {
return strings.Contains(derr.Name, name)
if dbusError, ok := err.(dbus.Error); ok {
return strings.Contains(dbusError.Name, "org.freedesktop.systemd1.UnitExists")
}
}
return false
}
// isUnitExists returns true if the error is that a systemd unit already exists.
func isUnitExists(err error) bool {
return isDbusError(err, "org.freedesktop.systemd1.UnitExists")
}
func startUnit(cm *dbusConnManager, unitName string, properties []systemdDbus.Property) error {
func startUnit(dbusConnection *systemdDbus.Conn, unitName string, properties []systemdDbus.Property) error {
statusChan := make(chan string, 1)
err := cm.retryOnDisconnect(func(c *systemdDbus.Conn) error {
_, err := c.StartTransientUnitContext(context.TODO(), unitName, "replace", properties, statusChan)
return err
})
if err == nil {
if _, err := dbusConnection.StartTransientUnit(unitName, "replace", properties, statusChan); err == nil {
timeout := time.NewTimer(30 * time.Second)
defer timeout.Stop()
select {
case s := <-statusChan:
close(statusChan)
// Please refer to https://pkg.go.dev/github.com/coreos/go-systemd/v22/dbus#Conn.StartUnit
// Please refer to https://godoc.org/github.com/coreos/go-systemd/dbus#Conn.StartUnit
if s != "done" {
resetFailedUnit(cm, unitName)
dbusConnection.ResetFailedUnit(unitName)
return errors.Errorf("error creating systemd unit `%s`: got `%s`", unitName, s)
}
case <-timeout.C:
resetFailedUnit(cm, unitName)
dbusConnection.ResetFailedUnit(unitName)
return errors.New("Timeout waiting for systemd to create " + unitName)
}
} else if !isUnitExists(err) {
@@ -348,17 +354,13 @@ func startUnit(cm *dbusConnManager, unitName string, properties []systemdDbus.Pr
return nil
}
func stopUnit(cm *dbusConnManager, unitName string) error {
func stopUnit(dbusConnection *systemdDbus.Conn, unitName string) error {
statusChan := make(chan string, 1)
err := cm.retryOnDisconnect(func(c *systemdDbus.Conn) error {
_, err := c.StopUnitContext(context.TODO(), unitName, "replace", statusChan)
return err
})
if err == nil {
if _, err := dbusConnection.StopUnit(unitName, "replace", statusChan); err == nil {
select {
case s := <-statusChan:
close(statusChan)
// Please refer to https://godoc.org/github.com/coreos/go-systemd/v22/dbus#Conn.StartUnit
// Please refer to https://godoc.org/github.com/coreos/go-systemd/dbus#Conn.StartUnit
if s != "done" {
logrus.Warnf("error removing unit `%s`: got `%s`. Continuing...", unitName, s)
}
@@ -369,38 +371,10 @@ func stopUnit(cm *dbusConnManager, unitName string) error {
return nil
}
func resetFailedUnit(cm *dbusConnManager, name string) {
err := cm.retryOnDisconnect(func(c *systemdDbus.Conn) error {
return c.ResetFailedUnitContext(context.TODO(), name)
})
if err != nil {
logrus.Warnf("unable to reset failed unit: %v", err)
}
}
func setUnitProperties(cm *dbusConnManager, name string, properties ...systemdDbus.Property) error {
return cm.retryOnDisconnect(func(c *systemdDbus.Conn) error {
return c.SetUnitPropertiesContext(context.TODO(), name, true, properties...)
})
}
func getManagerProperty(cm *dbusConnManager, name string) (string, error) {
str := ""
err := cm.retryOnDisconnect(func(c *systemdDbus.Conn) error {
var err error
str, err = c.GetManagerProperty(name)
return err
})
if err != nil {
return "", err
}
return strconv.Unquote(str)
}
func systemdVersion(cm *dbusConnManager) int {
func systemdVersion(conn *systemdDbus.Conn) int {
versionOnce.Do(func() {
version = -1
verStr, err := getManagerProperty(cm, "Version")
verStr, err := conn.GetManagerProperty("Version")
if err == nil {
version, err = systemdVersionAtoi(verStr)
}
@@ -415,11 +389,11 @@ func systemdVersion(cm *dbusConnManager) int {
func systemdVersionAtoi(verStr string) (int, error) {
// verStr should be of the form:
// "v245.4-1.fc32", "245", "v245-1.fc32", "245-1.fc32" (without quotes).
// The result for all of the above should be 245.
// Thus, we unconditionally remove the "v" prefix
// and then match on the first integer we can grab.
re := regexp.MustCompile(`v?([0-9]+)`)
// "v245.4-1.fc32", "245", "v245-1.fc32", "245-1.fc32"
// all the input strings include quotes, and the output int should be 245
// thus, we unconditionally remove the `"v`
// and then match on the first integer we can grab
re := regexp.MustCompile(`"?v?([0-9]+)`)
matches := re.FindStringSubmatch(verStr)
if len(matches) < 2 {
return 0, errors.Errorf("can't parse version %s: incorrect number of matches %v", verStr, matches)
@@ -428,10 +402,10 @@ func systemdVersionAtoi(verStr string) (int, error) {
return ver, errors.Wrapf(err, "can't parse version %s", verStr)
}
func addCpuQuota(cm *dbusConnManager, properties *[]systemdDbus.Property, quota int64, period uint64) {
func addCpuQuota(conn *systemdDbus.Conn, properties *[]systemdDbus.Property, quota int64, period uint64) {
if period != 0 {
// systemd only supports CPUQuotaPeriodUSec since v242
sdVer := systemdVersion(cm)
sdVer := systemdVersion(conn)
if sdVer >= 242 {
*properties = append(*properties,
newProp("CPUQuotaPeriodUSec", period))
@@ -462,13 +436,13 @@ func addCpuQuota(cm *dbusConnManager, properties *[]systemdDbus.Property, quota
}
}
func addCpuset(cm *dbusConnManager, props *[]systemdDbus.Property, cpus, mems string) error {
func addCpuset(conn *systemdDbus.Conn, props *[]systemdDbus.Property, cpus, mems string) error {
if cpus == "" && mems == "" {
return nil
}
// systemd only supports AllowedCPUs/AllowedMemoryNodes since v244
sdVer := systemdVersion(cm)
sdVer := systemdVersion(conn)
if sdVer < 244 {
logrus.Debugf("systemd v%d is too old to support AllowedCPUs/AllowedMemoryNodes"+
" (settings will still be applied to cgroupfs)", sdVer)

View File

@@ -1,96 +0,0 @@
// +build linux
package systemd
import (
"context"
"sync"
systemdDbus "github.com/coreos/go-systemd/v22/dbus"
dbus "github.com/godbus/dbus/v5"
)
var (
dbusC *systemdDbus.Conn
dbusMu sync.RWMutex
dbusInited bool
dbusRootless bool
)
type dbusConnManager struct {
}
// newDbusConnManager initializes systemd dbus connection manager.
func newDbusConnManager(rootless bool) *dbusConnManager {
if dbusInited && rootless != dbusRootless {
panic("can't have both root and rootless dbus")
}
dbusRootless = rootless
return &dbusConnManager{}
}
// getConnection lazily initializes and returns systemd dbus connection.
func (d *dbusConnManager) getConnection() (*systemdDbus.Conn, error) {
// In the case where dbusC != nil
// Use the read lock the first time to ensure
// that Conn can be acquired at the same time.
dbusMu.RLock()
if conn := dbusC; conn != nil {
dbusMu.RUnlock()
return conn, nil
}
dbusMu.RUnlock()
// In the case where dbusC == nil
// Use write lock to ensure that only one
// will be created
dbusMu.Lock()
defer dbusMu.Unlock()
if conn := dbusC; conn != nil {
return conn, nil
}
conn, err := d.newConnection()
if err != nil {
return nil, err
}
dbusC = conn
return conn, nil
}
func (d *dbusConnManager) newConnection() (*systemdDbus.Conn, error) {
if dbusRootless {
return newUserSystemdDbus()
}
return systemdDbus.NewWithContext(context.TODO())
}
// resetConnection resets the connection to its initial state
// (so it can be reconnected if necessary).
func (d *dbusConnManager) resetConnection(conn *systemdDbus.Conn) {
dbusMu.Lock()
defer dbusMu.Unlock()
if dbusC != nil && dbusC == conn {
dbusC.Close()
dbusC = nil
}
}
var errDbusConnClosed = dbus.ErrClosed.Error()
// retryOnDisconnect calls op, and if the error it returns is about closed dbus
// connection, the connection is re-established and the op is retried. This helps
// with the situation when dbus is restarted and we have a stale connection.
func (d *dbusConnManager) retryOnDisconnect(op func(*systemdDbus.Conn) error) error {
for {
conn, err := d.getConnection()
if err != nil {
return err
}
err = op(conn)
if !isDbusError(err, errDbusConnClosed) {
return err
}
d.resetConnection(conn)
}
}

View File

@@ -13,12 +13,12 @@ import (
systemdDbus "github.com/coreos/go-systemd/v22/dbus"
dbus "github.com/godbus/dbus/v5"
"github.com/opencontainers/runc/libcontainer/userns"
"github.com/opencontainers/runc/libcontainer/system"
"github.com/pkg/errors"
)
// newUserSystemdDbus creates a connection for systemd user-instance.
func newUserSystemdDbus() (*systemdDbus.Conn, error) {
// NewUserSystemdDbus creates a connection for systemd user-instance.
func NewUserSystemdDbus() (*systemdDbus.Conn, error) {
addr, err := DetectUserDbusSessionBusAddress()
if err != nil {
return nil, err
@@ -52,7 +52,7 @@ func newUserSystemdDbus() (*systemdDbus.Conn, error) {
//
// Otherwise returns os.Getuid() .
func DetectUID() (int, error) {
if !userns.RunningInUserNS() {
if !system.RunningInUserNS() {
return os.Getuid(), nil
}
b, err := exec.Command("busctl", "--user", "--no-pager", "status").CombinedOutput()

View File

@@ -12,6 +12,7 @@ import (
systemdDbus "github.com/coreos/go-systemd/v22/dbus"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fs"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/sirupsen/logrus"
)
@@ -20,14 +21,12 @@ type legacyManager struct {
mu sync.Mutex
cgroups *configs.Cgroup
paths map[string]string
dbus *dbusConnManager
}
func NewLegacyManager(cg *configs.Cgroup, paths map[string]string) cgroups.Manager {
return &legacyManager{
cgroups: cg,
paths: paths,
dbus: newDbusConnManager(false),
}
}
@@ -36,8 +35,8 @@ type subsystem interface {
Name() string
// Returns the stats, as 'stats', corresponding to the cgroup under 'path'.
GetStats(path string, stats *cgroups.Stats) error
// Set sets cgroup resource limits.
Set(path string, r *configs.Resources) error
// Set the cgroup represented by cgroup.
Set(path string, cgroup *configs.Cgroup) error
}
var errSubsystemDoesNotExist = errors.New("cgroup: subsystem does not exist")
@@ -58,8 +57,9 @@ var legacySubsystems = []subsystem{
&fs.NameGroup{GroupName: "name=systemd"},
}
func genV1ResourcesProperties(r *configs.Resources, cm *dbusConnManager) ([]systemdDbus.Property, error) {
func genV1ResourcesProperties(c *configs.Cgroup, conn *systemdDbus.Conn) ([]systemdDbus.Property, error) {
var properties []systemdDbus.Property
r := c.Resources
deviceProperties, err := generateDeviceProperties(r.Devices)
if err != nil {
@@ -77,7 +77,7 @@ func genV1ResourcesProperties(r *configs.Resources, cm *dbusConnManager) ([]syst
newProp("CPUShares", r.CpuShares))
}
addCpuQuota(cm, &properties, r.CpuQuota, r.CpuPeriod)
addCpuQuota(conn, &properties, r.CpuQuota, r.CpuPeriod)
if r.BlkioWeight != 0 {
properties = append(properties,
@@ -86,10 +86,11 @@ func genV1ResourcesProperties(r *configs.Resources, cm *dbusConnManager) ([]syst
if r.PidsLimit > 0 || r.PidsLimit == -1 {
properties = append(properties,
newProp("TasksAccounting", true),
newProp("TasksMax", uint64(r.PidsLimit)))
}
err = addCpuset(cm, &properties, r.CpusetCpus, r.CpusetMems)
err = addCpuset(conn, &properties, r.CpusetCpus, r.CpusetMems)
if err != nil {
return nil, err
}
@@ -157,17 +158,32 @@ func (m *legacyManager) Apply(pid int) error {
properties = append(properties,
newProp("MemoryAccounting", true),
newProp("CPUAccounting", true),
newProp("BlockIOAccounting", true),
newProp("TasksAccounting", true),
)
newProp("BlockIOAccounting", true))
// Assume DefaultDependencies= will always work (the check for it was previously broken.)
properties = append(properties,
newProp("DefaultDependencies", false))
dbusConnection, err := getDbusConnection(false)
if err != nil {
return err
}
resourcesProperties, err := genV1ResourcesProperties(c, dbusConnection)
if err != nil {
return err
}
properties = append(properties, resourcesProperties...)
properties = append(properties, c.SystemdProps...)
if err := startUnit(m.dbus, unitName, properties); err != nil {
// We have to set kernel memory here, as we can't change it once
// processes have been attached to the cgroup.
if c.Resources.KernelMemory != 0 {
if err := enableKmem(c); err != nil {
return err
}
}
if err := startUnit(dbusConnection, unitName, properties); err != nil {
return err
}
@@ -205,8 +221,13 @@ func (m *legacyManager) Destroy() error {
m.mu.Lock()
defer m.mu.Unlock()
stopErr := stopUnit(m.dbus, getUnitName(m.cgroups))
dbusConnection, err := getDbusConnection(false)
if err != nil {
return err
}
unitName := getUnitName(m.cgroups)
stopErr := stopUnit(dbusConnection, unitName)
// Both on success and on error, cleanup all the cgroups we are aware of.
// Some of them were created directly by Apply() and are not managed by systemd.
if err := cgroups.RemovePaths(m.paths); err != nil {
@@ -231,7 +252,7 @@ func (m *legacyManager) joinCgroups(pid int) error {
case "cpuset":
if path, ok := m.paths[name]; ok {
s := &fs.CpusetGroup{}
if err := s.ApplyDir(path, m.cgroups.Resources, pid); err != nil {
if err := s.ApplyDir(path, m.cgroups, pid); err != nil {
return err
}
}
@@ -284,7 +305,7 @@ func (m *legacyManager) Freeze(state configs.FreezerState) error {
prevState := m.cgroups.Resources.Freezer
m.cgroups.Resources.Freezer = state
freezer := &fs.FreezerGroup{}
if err := freezer.Set(path, m.cgroups.Resources); err != nil {
if err := freezer.Set(path, m.cgroups); err != nil {
m.cgroups.Resources.Freezer = prevState
return err
}
@@ -324,16 +345,20 @@ func (m *legacyManager) GetStats() (*cgroups.Stats, error) {
return stats, nil
}
func (m *legacyManager) Set(r *configs.Resources) error {
func (m *legacyManager) Set(container *configs.Config) error {
// If Paths are set, then we are just joining cgroups paths
// and there is no need to set any values.
if m.cgroups.Paths != nil {
return nil
}
if r.Unified != nil {
if container.Cgroups.Resources.Unified != nil {
return cgroups.ErrV1NoUnified
}
properties, err := genV1ResourcesProperties(r, m.dbus)
dbusConnection, err := getDbusConnection(false)
if err != nil {
return err
}
properties, err := genV1ResourcesProperties(container.Cgroups, dbusConnection)
if err != nil {
return err
}
@@ -361,7 +386,7 @@ func (m *legacyManager) Set(r *configs.Resources) error {
}
}
if err := setUnitProperties(m.dbus, getUnitName(m.cgroups), properties...); err != nil {
if err := dbusConnection.SetUnitProperties(getUnitName(container.Cgroups), true, properties...); err != nil {
_ = m.Freeze(targetFreezerState)
return err
}
@@ -376,7 +401,7 @@ func (m *legacyManager) Set(r *configs.Resources) error {
if !ok {
continue
}
if err := sys.Set(path, r); err != nil {
if err := sys.Set(path, container.Cgroups); err != nil {
return err
}
}
@@ -384,6 +409,30 @@ func (m *legacyManager) Set(r *configs.Resources) error {
return nil
}
func enableKmem(c *configs.Cgroup) error {
path, err := getSubsystemPath(c, "memory")
if err != nil {
if cgroups.IsNotFound(err) {
return nil
}
return err
}
if err := os.MkdirAll(path, 0755); err != nil {
return err
}
// do not try to enable the kernel memory if we already have
// tasks in the cgroup.
content, err := fscommon.ReadFile(path, "tasks")
if err != nil {
return err
}
if len(content) > 0 {
return nil
}
return fs.EnableKernelMemoryAccounting(path)
}
func (m *legacyManager) GetPaths() map[string]string {
m.mu.Lock()
defer m.mu.Unlock()
@@ -406,7 +455,3 @@ func (m *legacyManager) GetFreezerState() (configs.FreezerState, error) {
func (m *legacyManager) Exists() bool {
return cgroups.PathExists(m.Path("devices"))
}
func (m *legacyManager) OOMKillCount() (uint64, error) {
return fs.OOMKillCount(m.Path("memory"))
}

View File

@@ -26,7 +26,6 @@ type unifiedManager struct {
// path is like "/sys/fs/cgroup/user.slice/user-1001.slice/session-1.scope"
path string
rootless bool
dbus *dbusConnManager
}
func NewUnifiedManager(config *configs.Cgroup, path string, rootless bool) cgroups.Manager {
@@ -34,7 +33,6 @@ func NewUnifiedManager(config *configs.Cgroup, path string, rootless bool) cgrou
cgroups: config,
path: path,
rootless: rootless,
dbus: newDbusConnManager(rootless),
}
}
@@ -47,7 +45,7 @@ func NewUnifiedManager(config *configs.Cgroup, path string, rootless bool) cgrou
// For the list of keys, see https://www.kernel.org/doc/Documentation/cgroup-v2.txt
//
// For the list of systemd unit properties, see systemd.resource-control(5).
func unifiedResToSystemdProps(cm *dbusConnManager, res map[string]string) (props []systemdDbus.Property, _ error) {
func unifiedResToSystemdProps(conn *systemdDbus.Conn, res map[string]string) (props []systemdDbus.Property, _ error) {
var err error
for k, v := range res {
@@ -85,7 +83,7 @@ func unifiedResToSystemdProps(cm *dbusConnManager, res map[string]string) (props
return nil, fmt.Errorf("unified resource %q quota value conversion error: %w", k, err)
}
}
addCpuQuota(cm, &props, quota, period)
addCpuQuota(conn, &props, quota, period)
case "cpu.weight":
num, err := strconv.ParseUint(v, 10, 64)
@@ -105,7 +103,7 @@ func unifiedResToSystemdProps(cm *dbusConnManager, res map[string]string) (props
"cpuset.mems": "AllowedMemoryNodes",
}
// systemd only supports these properties since v244
sdVer := systemdVersion(cm)
sdVer := systemdVersion(conn)
if sdVer >= 244 {
props = append(props,
newProp(m[k], bits))
@@ -143,6 +141,7 @@ func unifiedResToSystemdProps(cm *dbusConnManager, res map[string]string) (props
}
}
props = append(props,
newProp("TasksAccounting", true),
newProp("TasksMax", num))
case "memory.oom.group":
@@ -164,8 +163,9 @@ func unifiedResToSystemdProps(cm *dbusConnManager, res map[string]string) (props
return props, nil
}
func genV2ResourcesProperties(r *configs.Resources, cm *dbusConnManager) ([]systemdDbus.Property, error) {
func genV2ResourcesProperties(c *configs.Cgroup, conn *systemdDbus.Conn) ([]systemdDbus.Property, error) {
var properties []systemdDbus.Property
r := c.Resources
// NOTE: This is of questionable correctness because we insert our own
// devices eBPF program later. Two programs with identical rules
@@ -201,14 +201,15 @@ func genV2ResourcesProperties(r *configs.Resources, cm *dbusConnManager) ([]syst
newProp("CPUWeight", r.CpuWeight))
}
addCpuQuota(cm, &properties, r.CpuQuota, r.CpuPeriod)
addCpuQuota(conn, &properties, r.CpuQuota, r.CpuPeriod)
if r.PidsLimit > 0 || r.PidsLimit == -1 {
properties = append(properties,
newProp("TasksAccounting", true),
newProp("TasksMax", uint64(r.PidsLimit)))
}
err = addCpuset(cm, &properties, r.CpusetCpus, r.CpusetMems)
err = addCpuset(conn, &properties, r.CpusetCpus, r.CpusetMems)
if err != nil {
return nil, err
}
@@ -217,7 +218,7 @@ func genV2ResourcesProperties(r *configs.Resources, cm *dbusConnManager) ([]syst
// convert Resources.Unified map to systemd properties
if r.Unified != nil {
unifiedProps, err := unifiedResToSystemdProps(cm, r.Unified)
unifiedProps, err := unifiedResToSystemdProps(conn, r.Unified)
if err != nil {
return nil, err
}
@@ -272,21 +273,28 @@ func (m *unifiedManager) Apply(pid int) error {
properties = append(properties,
newProp("MemoryAccounting", true),
newProp("CPUAccounting", true),
newProp("IOAccounting", true),
newProp("TasksAccounting", true),
)
newProp("IOAccounting", true))
// Assume DefaultDependencies= will always work (the check for it was previously broken.)
properties = append(properties,
newProp("DefaultDependencies", false))
dbusConnection, err := getDbusConnection(m.rootless)
if err != nil {
return err
}
resourcesProperties, err := genV2ResourcesProperties(c, dbusConnection)
if err != nil {
return err
}
properties = append(properties, resourcesProperties...)
properties = append(properties, c.SystemdProps...)
if err := startUnit(m.dbus, unitName, properties); err != nil {
if err := startUnit(dbusConnection, unitName, properties); err != nil {
return errors.Wrapf(err, "error while starting unit %q with properties %+v", unitName, properties)
}
if err := m.initPath(); err != nil {
if err = m.initPath(); err != nil {
return err
}
if err := fs2.CreateCgroupPath(m.path, m.cgroups); err != nil {
@@ -302,13 +310,17 @@ func (m *unifiedManager) Destroy() error {
m.mu.Lock()
defer m.mu.Unlock()
dbusConnection, err := getDbusConnection(m.rootless)
if err != nil {
return err
}
unitName := getUnitName(m.cgroups)
if err := stopUnit(m.dbus, unitName); err != nil {
if err := stopUnit(dbusConnection, unitName); err != nil {
return err
}
// XXX this is probably not needed, systemd should handle it
err := os.Remove(m.path)
err = os.Remove(m.path)
if err != nil && !os.IsNotExist(err) {
return err
}
@@ -317,7 +329,6 @@ func (m *unifiedManager) Destroy() error {
}
func (m *unifiedManager) Path(_ string) string {
_ = m.initPath()
return m.path
}
@@ -338,8 +349,16 @@ func (m *unifiedManager) getSliceFull() (string, error) {
}
if m.rootless {
// managerCG is typically "/user.slice/user-${uid}.slice/user@${uid}.service".
managerCG, err := getManagerProperty(m.dbus, "ControlGroup")
dbusConnection, err := getDbusConnection(m.rootless)
if err != nil {
return "", err
}
// managerCGQuoted is typically "/user.slice/user-${uid}.slice/user@${uid}.service" including the quote symbols
managerCGQuoted, err := dbusConnection.GetManagerProperty("ControlGroup")
if err != nil {
return "", err
}
managerCG, err := strconv.Unquote(managerCGQuoted)
if err != nil {
return "", err
}
@@ -412,8 +431,12 @@ func (m *unifiedManager) GetStats() (*cgroups.Stats, error) {
return fsMgr.GetStats()
}
func (m *unifiedManager) Set(r *configs.Resources) error {
properties, err := genV2ResourcesProperties(r, m.dbus)
func (m *unifiedManager) Set(container *configs.Config) error {
dbusConnection, err := getDbusConnection(m.rootless)
if err != nil {
return err
}
properties, err := genV2ResourcesProperties(m.cgroups, dbusConnection)
if err != nil {
return err
}
@@ -441,7 +464,7 @@ func (m *unifiedManager) Set(r *configs.Resources) error {
}
}
if err := setUnitProperties(m.dbus, getUnitName(m.cgroups), properties...); err != nil {
if err := dbusConnection.SetUnitProperties(getUnitName(m.cgroups), true, properties...); err != nil {
_ = m.Freeze(targetFreezerState)
return errors.Wrap(err, "error while setting unit properties")
}
@@ -454,7 +477,7 @@ func (m *unifiedManager) Set(r *configs.Resources) error {
if err != nil {
return err
}
return fsMgr.Set(r)
return fsMgr.Set(container)
}
func (m *unifiedManager) GetPaths() map[string]string {
@@ -478,11 +501,3 @@ func (m *unifiedManager) GetFreezerState() (configs.FreezerState, error) {
func (m *unifiedManager) Exists() bool {
return cgroups.PathExists(m.path)
}
func (m *unifiedManager) OOMKillCount() (uint64, error) {
fsMgr, err := m.fsManager()
if err != nil {
return 0, err
}
return fsMgr.OOMKillCount()
}

View File

@@ -16,7 +16,7 @@ import (
"time"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"github.com/opencontainers/runc/libcontainer/userns"
"github.com/opencontainers/runc/libcontainer/system"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
)
@@ -37,7 +37,7 @@ func IsCgroup2UnifiedMode() bool {
var st unix.Statfs_t
err := unix.Statfs(unifiedMountpoint, &st)
if err != nil {
if os.IsNotExist(err) && userns.RunningInUserNS() {
if os.IsNotExist(err) && system.RunningInUserNS() {
// ignore the "not found" error if running in userns
logrus.WithError(err).Debugf("%s missing, assuming cgroup v1", unifiedMountpoint)
isUnified = false
@@ -400,6 +400,17 @@ func WriteCgroupProc(dir string, pid int) error {
return err
}
// Since the OCI spec is designed for cgroup v1, in some cases
// there is need to convert from the cgroup v1 configuration to cgroup v2
// the formula for BlkIOWeight is y = (1 + (x - 10) * 9999 / 990)
// convert linearly from [10-1000] to [1-10000]
func ConvertBlkIOToCgroupV2Value(blkIoWeight uint16) uint64 {
if blkIoWeight == 0 {
return 0
}
return uint64(1 + (uint64(blkIoWeight)-10)*9999/990)
}
// Since the OCI spec is designed for cgroup v1, in some cases
// there is need to convert from the cgroup v1 configuration to cgroup v2
// the formula for cpuShares is y = (1 + ((x - 2) * 9999) / 262142)
@@ -439,14 +450,3 @@ func ConvertMemorySwapToCgroupV2Value(memorySwap, memory int64) (int64, error) {
return memorySwap - memory, nil
}
// Since the OCI spec is designed for cgroup v1, in some cases
// there is need to convert from the cgroup v1 configuration to cgroup v2
// the formula for BlkIOWeight to IOWeight is y = (1 + (x - 10) * 9999 / 990)
// convert linearly from [10-1000] to [1-10000]
func ConvertBlkIOToIOWeightValue(blkIoWeight uint16) uint64 {
if blkIoWeight == 0 {
return 0
}
return uint64(1 + (uint64(blkIoWeight)-10)*9999/990)
}

View File

@@ -54,6 +54,12 @@ type Resources struct {
// Total memory usage (memory + swap); set `-1` to enable unlimited swap
MemorySwap int64 `json:"memory_swap"`
// Kernel memory limit (in bytes)
KernelMemory int64 `json:"kernel_memory"`
// Kernel memory limit for TCP use (in bytes)
KernelMemoryTCP int64 `json:"kernel_memory_tcp"`
// CPU shares (relative weight vs. other containers)
CpuShares uint64 `json:"cpu_shares"`

View File

@@ -222,25 +222,25 @@ const (
// the runtime environment has been created but before the pivot_root has been executed.
// CreateRuntime is called immediately after the deprecated Prestart hook.
// CreateRuntime commands are called in the Runtime Namespace.
CreateRuntime HookName = "createRuntime"
CreateRuntime = "createRuntime"
// CreateContainer commands MUST be called as part of the create operation after
// the runtime environment has been created but before the pivot_root has been executed.
// CreateContainer commands are called in the Container namespace.
CreateContainer HookName = "createContainer"
CreateContainer = "createContainer"
// StartContainer commands MUST be called as part of the start operation and before
// the container process is started.
// StartContainer commands are called in the Container namespace.
StartContainer HookName = "startContainer"
StartContainer = "startContainer"
// Poststart commands are executed after the container init process starts.
// Poststart commands are called in the Runtime Namespace.
Poststart HookName = "poststart"
Poststart = "poststart"
// Poststop commands are executed after the container init process exits.
// Poststop commands are called in the Runtime Namespace.
Poststop HookName = "poststop"
Poststop = "poststop"
)
type Capabilities struct {
@@ -387,7 +387,7 @@ func (c Command) Run(s *specs.State) error {
return err
case <-timerCh:
cmd.Process.Kill()
<-errC
cmd.Wait()
return fmt.Errorf("hook ran past specified timeout of %.1fs", c.Timeout.Seconds())
}
}

View File

@@ -1,9 +0,0 @@
// +build gofuzz
package configs
func FuzzUnmarshalJSON(data []byte) int {
hooks := Hooks{}
_ = hooks.UnmarshalJSON(data)
return 1
}

View File

@@ -11,9 +11,6 @@ import (
// rootlessEUID makes sure that the config can be applied when runc
// is being executed as a non-root user (euid != 0) in the current user namespace.
func (v *ConfigValidator) rootlessEUID(config *configs.Config) error {
if !config.RootlessEUID {
return nil
}
if err := rootlessEUIDMappings(config); err != nil {
return err
}

View File

@@ -8,7 +8,6 @@ import (
"strings"
"sync"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/intelrdt"
selinux "github.com/opencontainers/selinux/go-selinux"
@@ -26,30 +25,36 @@ func New() Validator {
type ConfigValidator struct {
}
type check func(config *configs.Config) error
func (v *ConfigValidator) Validate(config *configs.Config) error {
checks := []check{
v.rootfs,
v.network,
v.hostname,
v.security,
v.usernamespace,
v.cgroupnamespace,
v.sysctl,
v.intelrdt,
v.rootlessEUID,
v.mounts,
if err := v.rootfs(config); err != nil {
return err
}
for _, c := range checks {
if err := c(config); err != nil {
if err := v.network(config); err != nil {
return err
}
if err := v.hostname(config); err != nil {
return err
}
if err := v.security(config); err != nil {
return err
}
if err := v.usernamespace(config); err != nil {
return err
}
if err := v.cgroupnamespace(config); err != nil {
return err
}
if err := v.sysctl(config); err != nil {
return err
}
if err := v.intelrdt(config); err != nil {
return err
}
if config.RootlessEUID {
if err := v.rootlessEUID(config); err != nil {
return err
}
}
if err := v.cgroups(config); err != nil {
return err
}
return nil
}
@@ -218,45 +223,6 @@ func (v *ConfigValidator) intelrdt(config *configs.Config) error {
return nil
}
func (v *ConfigValidator) cgroups(config *configs.Config) error {
c := config.Cgroups
if c == nil {
return nil
}
if (c.Name != "" || c.Parent != "") && c.Path != "" {
return fmt.Errorf("cgroup: either Path or Name and Parent should be used, got %+v", c)
}
r := c.Resources
if r == nil {
return nil
}
if !cgroups.IsCgroup2UnifiedMode() && r.Unified != nil {
return cgroups.ErrV1NoUnified
}
if cgroups.IsCgroup2UnifiedMode() {
_, err := cgroups.ConvertMemorySwapToCgroupV2Value(r.MemorySwap, r.Memory)
if err != nil {
return err
}
}
return nil
}
func (v *ConfigValidator) mounts(config *configs.Config) error {
for _, m := range config.Mounts {
if !filepath.IsAbs(m.Destination) {
return fmt.Errorf("invalid mount %+v: mount destination not absolute", m)
}
}
return nil
}
func isHostNetNS(path string) (bool, error) {
const currentProcessNetns = "/proc/self/ns/net"

View File

@@ -27,13 +27,13 @@ import (
"github.com/opencontainers/runc/libcontainer/utils"
"github.com/opencontainers/runtime-spec/specs-go"
"github.com/checkpoint-restore/go-criu/v5"
criurpc "github.com/checkpoint-restore/go-criu/v5/rpc"
"github.com/checkpoint-restore/go-criu/v4"
criurpc "github.com/checkpoint-restore/go-criu/v4/rpc"
"github.com/golang/protobuf/proto"
errorsf "github.com/pkg/errors"
"github.com/sirupsen/logrus"
"github.com/vishvananda/netlink/nl"
"golang.org/x/sys/unix"
"google.golang.org/protobuf/proto"
)
const stdioFdCount = 3
@@ -55,7 +55,6 @@ type linuxContainer struct {
criuVersion int
state containerState
created time.Time
fifo *os.File
}
// State represents a running container's state
@@ -225,9 +224,9 @@ func (c *linuxContainer) Set(config configs.Config) error {
if status == Stopped {
return newGenericError(errors.New("container not running"), ContainerNotRunning)
}
if err := c.cgroupManager.Set(config.Cgroups.Resources); err != nil {
if err := c.cgroupManager.Set(&config); err != nil {
// Set configs back
if err2 := c.cgroupManager.Set(c.config.Cgroups.Resources); err2 != nil {
if err2 := c.cgroupManager.Set(c.config); err2 != nil {
logrus.Warnf("Setting back cgroup configs failed due to error: %v, your state.json and actual configs might be inconsistent.", err2)
}
return err
@@ -235,7 +234,7 @@ func (c *linuxContainer) Set(config configs.Config) error {
if c.intelRdtManager != nil {
if err := c.intelRdtManager.Set(&config); err != nil {
// Set configs back
if err2 := c.cgroupManager.Set(c.config.Cgroups.Resources); err2 != nil {
if err2 := c.cgroupManager.Set(c.config); err2 != nil {
logrus.Warnf("Setting back cgroup configs failed due to error: %v, your state.json and actual configs might be inconsistent.", err2)
}
if err2 := c.intelRdtManager.Set(c.config); err2 != nil {
@@ -358,30 +357,17 @@ type openResult struct {
err error
}
func (c *linuxContainer) start(process *Process) (retErr error) {
func (c *linuxContainer) start(process *Process) error {
parent, err := c.newParentProcess(process)
if err != nil {
return newSystemErrorWithCause(err, "creating new parent process")
}
logsDone := parent.forwardChildLogs()
if logsDone != nil {
defer func() {
// Wait for log forwarder to finish. This depends on
// runc init closing the _LIBCONTAINER_LOGPIPE log fd.
err := <-logsDone
if err != nil && retErr == nil {
retErr = newSystemErrorWithCause(err, "forwarding init logs")
}
}()
}
parent.forwardChildLogs()
if err := parent.start(); err != nil {
return newSystemErrorWithCause(err, "starting container process")
}
if process.Init {
c.fifo.Close()
if c.config.Hooks != nil {
s, err := c.currentOCIState()
if err != nil {
@@ -457,13 +443,12 @@ func (c *linuxContainer) deleteExecFifo() {
// fd, with _LIBCONTAINER_FIFOFD set to its fd number.
func (c *linuxContainer) includeExecFifo(cmd *exec.Cmd) error {
fifoName := filepath.Join(c.root, execFifoFilename)
fifo, err := os.OpenFile(fifoName, unix.O_PATH|unix.O_CLOEXEC, 0)
fifoFd, err := unix.Open(fifoName, unix.O_PATH|unix.O_CLOEXEC, 0)
if err != nil {
return err
}
c.fifo = fifo
cmd.ExtraFiles = append(cmd.ExtraFiles, fifo)
cmd.ExtraFiles = append(cmd.ExtraFiles, os.NewFile(uintptr(fifoFd), fifoName))
cmd.Env = append(cmd.Env,
"_LIBCONTAINER_FIFOFD="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1))
return nil
@@ -585,7 +570,6 @@ func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, messageSockP
intelRdtPath: state.IntelRdtPath,
messageSockPair: messageSockPair,
logFilePair: logFilePair,
manager: c.cgroupManager,
config: c.newInitConfig(p),
process: p,
bootstrapData: data,
@@ -610,9 +594,6 @@ func (c *linuxContainer) newInitConfig(process *Process) *initConfig {
AppArmorProfile: c.config.AppArmorProfile,
ProcessLabel: c.config.ProcessLabel,
Rlimits: c.config.Rlimits,
CreateConsole: process.ConsoleSocket != nil,
ConsoleWidth: process.ConsoleWidth,
ConsoleHeight: process.ConsoleHeight,
}
if process.NoNewPrivileges != nil {
cfg.NoNewPrivileges = *process.NoNewPrivileges
@@ -626,10 +607,9 @@ func (c *linuxContainer) newInitConfig(process *Process) *initConfig {
if len(process.Rlimits) > 0 {
cfg.Rlimits = process.Rlimits
}
if cgroups.IsCgroup2UnifiedMode() {
cfg.Cgroup2Path = c.cgroupManager.Path("")
}
cfg.CreateConsole = process.ConsoleSocket != nil
cfg.ConsoleWidth = process.ConsoleWidth
cfg.ConsoleHeight = process.ConsoleHeight
return cfg
}
@@ -721,6 +701,7 @@ func (c *linuxContainer) checkCriuFeatures(criuOpts *CriuOpts, rpcOpts *criurpc.
return errors.New("CRIU feature check failed")
}
logrus.Debugf("Feature check says: %s", criuFeatures)
missingFeatures := false
// The outer if checks if the fields actually exist
@@ -1254,38 +1235,11 @@ func (c *linuxContainer) prepareCriuRestoreMounts(mounts []*configs.Mount) error
// Now go through all mounts and create the mountpoints
// if the mountpoints are not on a tmpfs, as CRIU will
// restore the complete tmpfs content from its checkpoint.
umounts := []string{}
defer func() {
for _, u := range umounts {
if e := unix.Unmount(u, unix.MNT_DETACH); e != nil {
if e != unix.EINVAL {
// Ignore EINVAL as it means 'target is not a mount point.'
// It probably has already been unmounted.
logrus.Warnf("Error during cleanup unmounting of %q (%v)", u, e)
}
}
}
}()
for _, m := range mounts {
if !isPathInPrefixList(m.Destination, tmpfs) {
if err := c.makeCriuRestoreMountpoints(m); err != nil {
return err
}
// If the mount point is a bind mount, we need to mount
// it now so that runc can create the necessary mount
// points for mounts in bind mounts.
// This also happens during initial container creation.
// Without this CRIU restore will fail
// See: https://github.com/opencontainers/runc/issues/2748
// It is also not necessary to order the mount points
// because during initial container creation mounts are
// set up in the order they are configured.
if m.Device == "bind" {
if err := unix.Mount(m.Source, m.Destination, "", unix.MS_BIND|unix.MS_REC, ""); err != nil {
return errorsf.Wrapf(err, "unable to bind mount %q to %q", m.Source, m.Destination)
}
umounts = append(umounts, m.Destination)
}
}
}
return nil
@@ -1462,7 +1416,7 @@ func (c *linuxContainer) criuApplyCgroups(pid int, req *criurpc.CriuReq) error {
return err
}
if err := c.cgroupManager.Set(c.config.Cgroups.Resources); err != nil {
if err := c.cgroupManager.Set(c.config); err != nil {
return newSystemError(err)
}
@@ -1521,6 +1475,7 @@ func (c *linuxContainer) criuSwrk(process *Process, req *criurpc.CriuReq, opts *
// the initial CRIU run to detect the version. Skip it.
logrus.Debugf("Using CRIU %d at: %s", c.criuVersion, c.criuPath)
}
logrus.Debugf("Using CRIU with following args: %s", args)
cmd := exec.Command(c.criuPath, args...)
if process != nil {
cmd.Stdin = process.Stdin
@@ -1568,19 +1523,19 @@ func (c *linuxContainer) criuSwrk(process *Process, req *criurpc.CriuReq, opts *
// should be empty. For older CRIU versions it still will be
// available but empty. criurpc.CriuReqType_VERSION actually
// has no req.GetOpts().
if logrus.GetLevel() >= logrus.DebugLevel &&
!(req.GetType() == criurpc.CriuReqType_FEATURE_CHECK ||
req.GetType() == criurpc.CriuReqType_VERSION) {
if !(req.GetType() == criurpc.CriuReqType_FEATURE_CHECK ||
req.GetType() == criurpc.CriuReqType_VERSION) {
val := reflect.ValueOf(req.GetOpts())
v := reflect.Indirect(val)
for i := 0; i < v.NumField(); i++ {
st := v.Type()
name := st.Field(i).Name
if 'A' <= name[0] && name[0] <= 'Z' {
value := val.MethodByName("Get" + name).Call([]reflect.Value{})
logrus.Debugf("CRIU option %s with value %v", name, value[0])
if strings.HasPrefix(name, "XXX_") {
continue
}
value := val.MethodByName("Get" + name).Call([]reflect.Value{})
logrus.Debugf("CRIU option %s with value %v", name, value[0])
}
}
data, err := proto.Marshal(req)

View File

@@ -1,6 +1,6 @@
package libcontainer
import criu "github.com/checkpoint-restore/go-criu/v5/rpc"
import criu "github.com/checkpoint-restore/go-criu/v4/rpc"
type CriuPageServerInfo struct {
Address string // IP address of CRIU page server

View File

@@ -168,7 +168,3 @@ func (d *Rule) CgroupString() string {
}
return fmt.Sprintf("%c %s:%s %s", d.Type, major, minor, d.Permissions)
}
func (d *Rule) Mkdev() (uint64, error) {
return mkDev(d)
}

View File

@@ -4,118 +4,13 @@ package devices
import (
"errors"
"io/ioutil"
"os"
"path/filepath"
"golang.org/x/sys/unix"
)
var (
// ErrNotADevice denotes that a file is not a valid linux device.
ErrNotADevice = errors.New("not a device node")
)
// Testing dependencies
var (
unixLstat = unix.Lstat
ioutilReadDir = ioutil.ReadDir
)
func mkDev(d *Rule) (uint64, error) {
func (d *Rule) Mkdev() (uint64, error) {
if d.Major == Wildcard || d.Minor == Wildcard {
return 0, errors.New("cannot mkdev() device with wildcards")
}
return unix.Mkdev(uint32(d.Major), uint32(d.Minor)), nil
}
// Given the path to a device and its cgroup_permissions(which cannot be easily queried) look up the
// information about a linux device and return that information as a Device struct.
func DeviceFromPath(path, permissions string) (*Device, error) {
var stat unix.Stat_t
err := unixLstat(path, &stat)
if err != nil {
return nil, err
}
var (
devType Type
mode = stat.Mode
devNumber = uint64(stat.Rdev)
major = unix.Major(devNumber)
minor = unix.Minor(devNumber)
)
switch mode & unix.S_IFMT {
case unix.S_IFBLK:
devType = BlockDevice
case unix.S_IFCHR:
devType = CharDevice
case unix.S_IFIFO:
devType = FifoDevice
default:
return nil, ErrNotADevice
}
return &Device{
Rule: Rule{
Type: devType,
Major: int64(major),
Minor: int64(minor),
Permissions: Permissions(permissions),
},
Path: path,
FileMode: os.FileMode(mode &^ unix.S_IFMT),
Uid: stat.Uid,
Gid: stat.Gid,
}, nil
}
// HostDevices returns all devices that can be found under /dev directory.
func HostDevices() ([]*Device, error) {
return GetDevices("/dev")
}
// GetDevices recursively traverses a directory specified by path
// and returns all devices found there.
func GetDevices(path string) ([]*Device, error) {
files, err := ioutilReadDir(path)
if err != nil {
return nil, err
}
var out []*Device
for _, f := range files {
switch {
case f.IsDir():
switch f.Name() {
// ".lxc" & ".lxd-mounts" added to address https://github.com/lxc/lxd/issues/2825
// ".udev" added to address https://github.com/opencontainers/runc/issues/2093
case "pts", "shm", "fd", "mqueue", ".lxc", ".lxd-mounts", ".udev":
continue
default:
sub, err := GetDevices(filepath.Join(path, f.Name()))
if err != nil {
return nil, err
}
out = append(out, sub...)
continue
}
case f.Name() == "console":
continue
}
device, err := DeviceFromPath(filepath.Join(path, f.Name()), "rwm")
if err != nil {
if err == ErrNotADevice {
continue
}
if os.IsNotExist(err) {
continue
}
return nil, err
}
if device.Type == FifoDevice {
continue
}
out = append(out, device)
}
return out, nil
}

View File

@@ -0,0 +1,5 @@
package devices
func (d *Rule) Mkdev() (uint64, error) {
return 0, nil
}

View File

@@ -0,0 +1,112 @@
package devices
import (
"errors"
"io/ioutil"
"os"
"path/filepath"
"golang.org/x/sys/unix"
)
var (
// ErrNotADevice denotes that a file is not a valid linux device.
ErrNotADevice = errors.New("not a device node")
)
// Testing dependencies
var (
unixLstat = unix.Lstat
ioutilReadDir = ioutil.ReadDir
)
// Given the path to a device and its cgroup_permissions(which cannot be easily queried) look up the
// information about a linux device and return that information as a Device struct.
func DeviceFromPath(path, permissions string) (*Device, error) {
var stat unix.Stat_t
err := unixLstat(path, &stat)
if err != nil {
return nil, err
}
var (
devType Type
mode = stat.Mode
devNumber = uint64(stat.Rdev)
major = unix.Major(devNumber)
minor = unix.Minor(devNumber)
)
switch mode & unix.S_IFMT {
case unix.S_IFBLK:
devType = BlockDevice
case unix.S_IFCHR:
devType = CharDevice
case unix.S_IFIFO:
devType = FifoDevice
default:
return nil, ErrNotADevice
}
return &Device{
Rule: Rule{
Type: devType,
Major: int64(major),
Minor: int64(minor),
Permissions: Permissions(permissions),
},
Path: path,
FileMode: os.FileMode(mode),
Uid: stat.Uid,
Gid: stat.Gid,
}, nil
}
// HostDevices returns all devices that can be found under /dev directory.
func HostDevices() ([]*Device, error) {
return GetDevices("/dev")
}
// GetDevices recursively traverses a directory specified by path
// and returns all devices found there.
func GetDevices(path string) ([]*Device, error) {
files, err := ioutilReadDir(path)
if err != nil {
return nil, err
}
var out []*Device
for _, f := range files {
switch {
case f.IsDir():
switch f.Name() {
// ".lxc" & ".lxd-mounts" added to address https://github.com/lxc/lxd/issues/2825
// ".udev" added to address https://github.com/opencontainers/runc/issues/2093
case "pts", "shm", "fd", "mqueue", ".lxc", ".lxd-mounts", ".udev":
continue
default:
sub, err := GetDevices(filepath.Join(path, f.Name()))
if err != nil {
return nil, err
}
out = append(out, sub...)
continue
}
case f.Name() == "console":
continue
}
device, err := DeviceFromPath(filepath.Join(path, f.Name()), "rwm")
if err != nil {
if err == ErrNotADevice {
continue
}
if os.IsNotExist(err) {
continue
}
return nil, err
}
if device.Type == FifoDevice {
continue
}
out = append(out, device)
}
return out, nil
}

View File

@@ -185,7 +185,7 @@ func CriuPath(criupath string) func(*LinuxFactory) error {
// configures the factory with the provided option funcs.
func New(root string, options ...func(*LinuxFactory) error) (Factory, error) {
if root != "" {
if err := os.MkdirAll(root, 0o700); err != nil {
if err := os.MkdirAll(root, 0700); err != nil {
return nil, newGenericError(err, SystemError)
}
}
@@ -225,7 +225,7 @@ type LinuxFactory struct {
// containers.
CriuPath string
// New{u,g}idmapPath is the path to the binaries used for mapping with
// New{u,g}uidmapPath is the path to the binaries used for mapping with
// rootless containers.
NewuidmapPath string
NewgidmapPath string
@@ -259,7 +259,7 @@ func (l *LinuxFactory) Create(id string, config *configs.Config) (Container, err
} else if !os.IsNotExist(err) {
return nil, newGenericError(err, SystemError)
}
if err := os.MkdirAll(containerRoot, 0o711); err != nil {
if err := os.MkdirAll(containerRoot, 0711); err != nil {
return nil, newGenericError(err, SystemError)
}
if err := os.Chown(containerRoot, unix.Geteuid(), unix.Getegid()); err != nil {
@@ -365,12 +365,6 @@ func (l *LinuxFactory) StartInitialization() (err error) {
defer consoleSocket.Close()
}
logPipeFdStr := os.Getenv("_LIBCONTAINER_LOGPIPE")
logPipeFd, err := strconv.Atoi(logPipeFdStr)
if err != nil {
return fmt.Errorf("unable to convert _LIBCONTAINER_LOGPIPE=%s to int: %s", logPipeFdStr, err)
}
// clear the current process's environment to clean any libcontainer
// specific env vars.
os.Clearenv()
@@ -393,7 +387,7 @@ func (l *LinuxFactory) StartInitialization() (err error) {
}
}()
i, err := newContainerInit(it, pipe, consoleSocket, fifofd, logPipeFd)
i, err := newContainerInit(it, pipe, consoleSocket, fifofd)
if err != nil {
return err
}

View File

@@ -35,8 +35,8 @@ const (
)
type pid struct {
Pid int `json:"stage2_pid"`
PidFirstChild int `json:"stage1_pid"`
Pid int `json:"pid"`
PidFirstChild int `json:"pid_first"`
}
// network is an internal struct used to setup container networks.
@@ -70,14 +70,13 @@ type initConfig struct {
RootlessEUID bool `json:"rootless_euid,omitempty"`
RootlessCgroups bool `json:"rootless_cgroups,omitempty"`
SpecState *specs.State `json:"spec_state,omitempty"`
Cgroup2Path string `json:"cgroup2_path,omitempty"`
}
type initer interface {
Init() error
}
func newContainerInit(t initType, pipe *os.File, consoleSocket *os.File, fifoFd, logFd int) (initer, error) {
func newContainerInit(t initType, pipe *os.File, consoleSocket *os.File, fifoFd int) (initer, error) {
var config *initConfig
if err := json.NewDecoder(pipe).Decode(&config); err != nil {
return nil, err
@@ -91,7 +90,6 @@ func newContainerInit(t initType, pipe *os.File, consoleSocket *os.File, fifoFd,
pipe: pipe,
consoleSocket: consoleSocket,
config: config,
logFd: logFd,
}, nil
case initStandard:
return &linuxStandardInit{
@@ -100,7 +98,6 @@ func newContainerInit(t initType, pipe *os.File, consoleSocket *os.File, fifoFd,
parentPid: unix.Getppid(),
config: config,
fifoFd: fifoFd,
logFd: logFd,
}, nil
}
return nil, fmt.Errorf("unknown init type %q", t)
@@ -132,26 +129,6 @@ func finalizeNamespace(config *initConfig) error {
return errors.Wrap(err, "close exec fds")
}
// we only do chdir if it's specified
doChdir := config.Cwd != ""
if doChdir {
// First, attempt the chdir before setting up the user.
// This could allow us to access a directory that the user running runc can access
// but the container user cannot.
err := unix.Chdir(config.Cwd)
switch {
case err == nil:
doChdir = false
case os.IsPermission(err):
// If we hit an EPERM, we should attempt again after setting up user.
// This will allow us to successfully chdir if the container user has access
// to the directory, but the user running runc does not.
// This is useful in cases where the cwd is also a volume that's been chowned to the container user.
default:
return fmt.Errorf("chdir to cwd (%q) set in config.json failed: %v", config.Cwd, err)
}
}
caps := &configs.Capabilities{}
if config.Capabilities != nil {
caps = config.Capabilities
@@ -173,8 +150,10 @@ func finalizeNamespace(config *initConfig) error {
if err := setupUser(config); err != nil {
return errors.Wrap(err, "setup user")
}
// Change working directory AFTER the user has been set up, if we haven't done it yet.
if doChdir {
// Change working directory AFTER the user has been set up.
// Otherwise, if the cwd is also a volume that's been chowned to the container user (and not the user running runc),
// this command will EPERM.
if config.Cwd != "" {
if err := unix.Chdir(config.Cwd); err != nil {
return fmt.Errorf("chdir to cwd (%q) set in config.json failed: %v", config.Cwd, err)
}

View File

@@ -6,14 +6,14 @@ import (
"fmt"
"io"
"os"
"strconv"
"sync"
"github.com/pkg/errors"
"github.com/sirupsen/logrus"
)
var (
configureMutex sync.Mutex
configureMutex = sync.Mutex{}
// loggingConfigured will be set once logging has been configured via invoking `ConfigureLogging`.
// Subsequent invocations of `ConfigureLogging` would be no-op
loggingConfigured = false
@@ -23,47 +23,41 @@ type Config struct {
LogLevel logrus.Level
LogFormat string
LogFilePath string
LogPipeFd int
LogCaller bool
LogPipeFd string
}
func ForwardLogs(logPipe io.ReadCloser) chan error {
done := make(chan error, 1)
s := bufio.NewScanner(logPipe)
go func() {
for s.Scan() {
processEntry(s.Bytes())
func ForwardLogs(logPipe io.Reader) {
lineReader := bufio.NewReader(logPipe)
for {
line, err := lineReader.ReadBytes('\n')
if len(line) > 0 {
processEntry(line)
}
if err := logPipe.Close(); err != nil {
logrus.Errorf("error closing log source: %v", err)
if err == io.EOF {
logrus.Debugf("log pipe has been closed: %+v", err)
return
}
// The only error we want to return is when reading from
// logPipe has failed.
done <- s.Err()
close(done)
}()
return done
if err != nil {
logrus.Errorf("log pipe read error: %+v", err)
}
}
}
func processEntry(text []byte) {
if len(text) == 0 {
return
}
var jl struct {
type jsonLog struct {
Level string `json:"level"`
Msg string `json:"msg"`
}
var jl jsonLog
if err := json.Unmarshal(text, &jl); err != nil {
logrus.Errorf("failed to decode %q to json: %v", text, err)
logrus.Errorf("failed to decode %q to json: %+v", text, err)
return
}
lvl, err := logrus.ParseLevel(jl.Level)
if err != nil {
logrus.Errorf("failed to parse log level %q: %v", jl.Level, err)
logrus.Errorf("failed to parse log level %q: %v\n", jl.Level, err)
return
}
logrus.StandardLogger().Logf(lvl, jl.Msg)
@@ -74,16 +68,18 @@ func ConfigureLogging(config Config) error {
defer configureMutex.Unlock()
if loggingConfigured {
return errors.New("logging has already been configured")
logrus.Debug("logging has already been configured")
return nil
}
logrus.SetLevel(config.LogLevel)
logrus.SetReportCaller(config.LogCaller)
// XXX: while 0 is a valid fd (usually stdin), here we assume
// that we never deliberately set LogPipeFd to 0.
if config.LogPipeFd > 0 {
logrus.SetOutput(os.NewFile(uintptr(config.LogPipeFd), "logpipe"))
if config.LogPipeFd != "" {
logPipeFdInt, err := strconv.Atoi(config.LogPipeFd)
if err != nil {
return fmt.Errorf("failed to convert _LIBCONTAINER_LOGPIPE environment variable value %q to int: %v", config.LogPipeFd, err)
}
logrus.SetOutput(os.NewFile(uintptr(logPipeFdInt), "logpipe"))
} else if config.LogFilePath != "" {
f, err := os.OpenFile(config.LogFilePath, os.O_CREATE|os.O_WRONLY|os.O_APPEND|os.O_SYNC, 0644)
if err != nil {

View File

@@ -3,28 +3,48 @@
package libcontainer
import (
"io/ioutil"
"path/filepath"
"strconv"
"strings"
"unsafe"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"github.com/pkg/errors"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
)
func getValueFromCgroup(path, key string) (int, error) {
content, err := ioutil.ReadFile(path)
if err != nil {
return 0, err
}
lines := strings.Split(string(content), "\n")
for _, line := range lines {
arr := strings.Split(line, " ")
if len(arr) == 2 && arr[0] == key {
return strconv.Atoi(arr[1])
}
}
return 0, nil
}
func registerMemoryEventV2(cgDir, evName, cgEvName string) (<-chan struct{}, error) {
eventControlPath := filepath.Join(cgDir, evName)
cgEvPath := filepath.Join(cgDir, cgEvName)
fd, err := unix.InotifyInit()
if err != nil {
return nil, errors.Wrap(err, "unable to init inotify")
}
// watching oom kill
evFd, err := unix.InotifyAddWatch(fd, filepath.Join(cgDir, evName), unix.IN_MODIFY)
evFd, err := unix.InotifyAddWatch(fd, eventControlPath, unix.IN_MODIFY)
if err != nil {
unix.Close(fd)
return nil, errors.Wrap(err, "unable to add inotify watch")
}
// Because no `unix.IN_DELETE|unix.IN_DELETE_SELF` event for cgroup file system, so watching all process exited
cgFd, err := unix.InotifyAddWatch(fd, filepath.Join(cgDir, cgEvName), unix.IN_MODIFY)
cgFd, err := unix.InotifyAddWatch(fd, cgEvPath, unix.IN_MODIFY)
if err != nil {
unix.Close(fd)
return nil, errors.Wrap(err, "unable to add inotify watch")
@@ -59,12 +79,12 @@ func registerMemoryEventV2(cgDir, evName, cgEvName string) (<-chan struct{}, err
}
switch int(rawEvent.Wd) {
case evFd:
oom, err := fscommon.GetValueByKey(cgDir, evName, "oom_kill")
oom, err := getValueFromCgroup(eventControlPath, "oom_kill")
if err != nil || oom > 0 {
ch <- struct{}{}
}
case cgFd:
pids, err := fscommon.GetValueByKey(cgDir, cgEvName, "populated")
pids, err := getValueFromCgroup(cgEvPath, "populated")
if err != nil || pids == 0 {
return
}

View File

@@ -51,7 +51,7 @@ type parentProcess interface {
setExternalDescriptors(fds []string)
forwardChildLogs() chan error
forwardChildLogs()
}
type filePair struct {
@@ -65,7 +65,6 @@ type setnsProcess struct {
logFilePair filePair
cgroupPaths map[string]string
rootlessCgroups bool
manager cgroups.Manager
intelRdtPath string
config *initConfig
fds []string
@@ -89,8 +88,6 @@ func (p *setnsProcess) signal(sig os.Signal) error {
func (p *setnsProcess) start() (retErr error) {
defer p.messageSockPair.parent.Close()
// get the "before" value of oom kill count
oom, _ := p.manager.OOMKillCount()
err := p.cmd.Start()
// close the write-side of the pipes (controlled by child)
p.messageSockPair.child.Close()
@@ -98,34 +95,19 @@ func (p *setnsProcess) start() (retErr error) {
if err != nil {
return newSystemErrorWithCause(err, "starting setns process")
}
waitInit := initWaiter(p.messageSockPair.parent)
defer func() {
if retErr != nil {
if newOom, err := p.manager.OOMKillCount(); err == nil && newOom != oom {
// Someone in this cgroup was killed, this _might_ be us.
retErr = newSystemErrorWithCause(retErr, "possibly OOM-killed")
}
werr := <-waitInit
if werr != nil {
logrus.WithError(werr).Warn()
}
err := ignoreTerminateErrors(p.terminate())
if err != nil {
logrus.WithError(err).Warn("unable to terminate setnsProcess")
}
}
}()
if p.bootstrapData != nil {
if _, err := io.Copy(p.messageSockPair.parent, p.bootstrapData); err != nil {
return newSystemErrorWithCause(err, "copying bootstrap data to pipe")
}
}
err = <-waitInit
if err != nil {
return err
}
if err := p.execSetns(); err != nil {
return newSystemErrorWithCause(err, "executing setns process")
}
@@ -262,8 +244,8 @@ func (p *setnsProcess) setExternalDescriptors(newFds []string) {
p.fds = newFds
}
func (p *setnsProcess) forwardChildLogs() chan error {
return logs.ForwardLogs(p.logFilePair.parent)
func (p *setnsProcess) forwardChildLogs() {
go logs.ForwardLogs(p.logFilePair.parent)
}
type initProcess struct {
@@ -337,36 +319,9 @@ func (p *initProcess) start() (retErr error) {
p.process.ops = nil
return newSystemErrorWithCause(err, "starting init process command")
}
waitInit := initWaiter(p.messageSockPair.parent)
defer func() {
if retErr != nil {
// Find out if init is killed by the kernel's OOM killer.
// Get the count before killing init as otherwise cgroup
// might be removed by systemd.
oom, err := p.manager.OOMKillCount()
if err != nil {
logrus.WithError(err).Warn("unable to get oom kill count")
} else if oom > 0 {
// Does not matter what the particular error was,
// its cause is most probably OOM, so report that.
const oomError = "container init was OOM-killed (memory limit too low?)"
if logrus.GetLevel() >= logrus.DebugLevel {
// Only show the original error if debug is set,
// as it is not generally very useful.
retErr = newSystemErrorWithCause(retErr, oomError)
} else {
retErr = newSystemError(errors.New(oomError))
}
}
werr := <-waitInit
if werr != nil {
logrus.WithError(werr).Warn()
}
// Terminate the process to ensure we can remove cgroups.
// terminate the process to ensure we can remove cgroups
if err := ignoreTerminateErrors(p.terminate()); err != nil {
logrus.WithError(err).Warn("unable to terminate initProcess")
}
@@ -392,11 +347,6 @@ func (p *initProcess) start() (retErr error) {
if _, err := io.Copy(p.messageSockPair.parent, p.bootstrapData); err != nil {
return newSystemErrorWithCause(err, "copying bootstrap data to pipe")
}
err = <-waitInit
if err != nil {
return err
}
childPid, err := p.getChildPid()
if err != nil {
return newSystemErrorWithCause(err, "getting the final child's pid from pipe")
@@ -448,7 +398,7 @@ func (p *initProcess) start() (retErr error) {
// call prestart and CreateRuntime hooks
if !p.config.Config.Namespaces.Contains(configs.NEWNS) {
// Setup cgroup before the hook, so that the prestart and CreateRuntime hook could apply cgroup permissions.
if err := p.manager.Set(p.config.Config.Cgroups.Resources); err != nil {
if err := p.manager.Set(p.config.Config); err != nil {
return newSystemErrorWithCause(err, "setting cgroup config for ready process")
}
if p.intelRdtManager != nil {
@@ -504,7 +454,7 @@ func (p *initProcess) start() (retErr error) {
sentRun = true
case procHooks:
// Setup cgroup before prestart hook, so that the prestart hook could apply cgroup permissions.
if err := p.manager.Set(p.config.Config.Cgroups.Resources); err != nil {
if err := p.manager.Set(p.config.Config); err != nil {
return newSystemErrorWithCause(err, "setting cgroup config for procHooks process")
}
if p.intelRdtManager != nil {
@@ -630,8 +580,8 @@ func (p *initProcess) setExternalDescriptors(newFds []string) {
p.fds = newFds
}
func (p *initProcess) forwardChildLogs() chan error {
return logs.ForwardLogs(p.logFilePair.parent)
func (p *initProcess) forwardChildLogs() {
go logs.ForwardLogs(p.logFilePair.parent)
}
func getPipeFds(pid int) ([]string, error) {
@@ -699,28 +649,3 @@ func (p *Process) InitializeIO(rootuid, rootgid int) (i *IO, err error) {
}
return i, nil
}
// initWaiter returns a channel to wait on for making sure
// runc init has finished the initial setup.
func initWaiter(r io.Reader) chan error {
ch := make(chan error, 1)
go func() {
defer close(ch)
inited := make([]byte, 1)
n, err := r.Read(inited)
if err == nil {
if n < 1 {
err = errors.New("short read")
} else if inited[0] != 0 {
err = fmt.Errorf("unexpected %d != 0", inited[0])
} else {
ch <- nil
return
}
}
ch <- newSystemErrorWithCause(err, "waiting for init preliminary setup")
}()
return ch
}

View File

@@ -77,8 +77,7 @@ func (p *restoredProcess) setExternalDescriptors(newFds []string) {
p.fds = newFds
}
func (p *restoredProcess) forwardChildLogs() chan error {
return nil
func (p *restoredProcess) forwardChildLogs() {
}
// nonChildProcess represents a process where the calling process is not
@@ -126,6 +125,5 @@ func (p *nonChildProcess) setExternalDescriptors(newFds []string) {
p.fds = newFds
}
func (p *nonChildProcess) forwardChildLogs() chan error {
return nil
func (p *nonChildProcess) forwardChildLogs() {
}

View File

@@ -17,10 +17,9 @@ import (
"github.com/moby/sys/mountinfo"
"github.com/mrunalp/fileutils"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fs2"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/devices"
"github.com/opencontainers/runc/libcontainer/userns"
"github.com/opencontainers/runc/libcontainer/system"
"github.com/opencontainers/runc/libcontainer/utils"
libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils"
"github.com/opencontainers/runtime-spec/specs-go"
@@ -30,14 +29,6 @@ import (
const defaultMountFlags = unix.MS_NOEXEC | unix.MS_NOSUID | unix.MS_NODEV
type mountConfig struct {
root string
label string
cgroup2Path string
rootlessCgroups bool
cgroupns bool
}
// needsSetupDev returns true if /dev needs to be set up.
func needsSetupDev(config *configs.Config) bool {
for _, m := range config.Mounts {
@@ -57,13 +48,7 @@ func prepareRootfs(pipe io.ReadWriter, iConfig *initConfig) (err error) {
return newSystemErrorWithCause(err, "preparing rootfs")
}
mountConfig := &mountConfig{
root: config.Rootfs,
label: config.MountLabel,
cgroup2Path: iConfig.Cgroup2Path,
rootlessCgroups: iConfig.RootlessCgroups,
cgroupns: config.Namespaces.Contains(configs.NEWCGROUP),
}
hasCgroupns := config.Namespaces.Contains(configs.NEWCGROUP)
setupDev := needsSetupDev(config)
for _, m := range config.Mounts {
for _, precmd := range m.PremountCmds {
@@ -71,7 +56,7 @@ func prepareRootfs(pipe io.ReadWriter, iConfig *initConfig) (err error) {
return newSystemErrorWithCause(err, "running premount command")
}
}
if err := mountToRootfs(m, mountConfig); err != nil {
if err := mountToRootfs(m, config.Rootfs, config.MountLabel, hasCgroupns); err != nil {
return newSystemErrorWithCausef(err, "mounting %q to rootfs at %q", m.Source, m.Destination)
}
@@ -237,7 +222,7 @@ func prepareBindMount(m *configs.Mount, rootfs string) error {
return nil
}
func mountCgroupV1(m *configs.Mount, c *mountConfig) error {
func mountCgroupV1(m *configs.Mount, rootfs, mountLabel string, enableCgroupns bool) error {
binds, err := getCgroupMounts(m)
if err != nil {
return err
@@ -257,12 +242,12 @@ func mountCgroupV1(m *configs.Mount, c *mountConfig) error {
Data: "mode=755",
PropagationFlags: m.PropagationFlags,
}
if err := mountToRootfs(tmpfs, c); err != nil {
if err := mountToRootfs(tmpfs, rootfs, mountLabel, enableCgroupns); err != nil {
return err
}
for _, b := range binds {
if c.cgroupns {
subsystemPath := filepath.Join(c.root, b.Destination)
if enableCgroupns {
subsystemPath := filepath.Join(rootfs, b.Destination)
if err := os.MkdirAll(subsystemPath, 0755); err != nil {
return err
}
@@ -281,7 +266,7 @@ func mountCgroupV1(m *configs.Mount, c *mountConfig) error {
return err
}
} else {
if err := mountToRootfs(b, c); err != nil {
if err := mountToRootfs(b, rootfs, mountLabel, enableCgroupns); err != nil {
return err
}
}
@@ -291,7 +276,7 @@ func mountCgroupV1(m *configs.Mount, c *mountConfig) error {
// symlink(2) is very dumb, it will just shove the path into
// the link and doesn't do any checks or relative path
// conversion. Also, don't error out if the cgroup already exists.
if err := os.Symlink(mc, filepath.Join(c.root, m.Destination, ss)); err != nil && !os.IsExist(err) {
if err := os.Symlink(mc, filepath.Join(rootfs, m.Destination, ss)); err != nil && !os.IsExist(err) {
return err
}
}
@@ -299,39 +284,28 @@ func mountCgroupV1(m *configs.Mount, c *mountConfig) error {
return nil
}
func mountCgroupV2(m *configs.Mount, c *mountConfig) error {
dest, err := securejoin.SecureJoin(c.root, m.Destination)
func mountCgroupV2(m *configs.Mount, rootfs, mountLabel string, enableCgroupns bool) error {
cgroupPath, err := securejoin.SecureJoin(rootfs, m.Destination)
if err != nil {
return err
}
if err := os.MkdirAll(dest, 0755); err != nil {
if err := os.MkdirAll(cgroupPath, 0755); err != nil {
return err
}
if err := unix.Mount(m.Source, dest, "cgroup2", uintptr(m.Flags), m.Data); err != nil {
if err := unix.Mount(m.Source, cgroupPath, "cgroup2", uintptr(m.Flags), m.Data); err != nil {
// when we are in UserNS but CgroupNS is not unshared, we cannot mount cgroup2 (#2158)
if err == unix.EPERM || err == unix.EBUSY {
src := fs2.UnifiedMountpoint
if c.cgroupns && c.cgroup2Path != "" {
// Emulate cgroupns by bind-mounting
// the container cgroup path rather than
// the whole /sys/fs/cgroup.
src = c.cgroup2Path
}
err = unix.Mount(src, dest, "", uintptr(m.Flags)|unix.MS_BIND, "")
if err == unix.ENOENT && c.rootlessCgroups {
err = nil
}
return err
return unix.Mount("/sys/fs/cgroup", cgroupPath, "", uintptr(m.Flags)|unix.MS_BIND, "")
}
return err
}
return nil
}
func mountToRootfs(m *configs.Mount, c *mountConfig) error {
rootfs := c.root
mountLabel := c.label
dest := m.Destination
func mountToRootfs(m *configs.Mount, rootfs, mountLabel string, enableCgroupns bool) error {
var (
dest = m.Destination
)
if !strings.HasPrefix(dest, rootfs) {
dest = filepath.Join(rootfs, dest)
}
@@ -450,9 +424,9 @@ func mountToRootfs(m *configs.Mount, c *mountConfig) error {
}
case "cgroup":
if cgroups.IsCgroup2UnifiedMode() {
return mountCgroupV2(m, c)
return mountCgroupV2(m, rootfs, mountLabel, enableCgroupns)
}
return mountCgroupV1(m, c)
return mountCgroupV1(m, rootfs, mountLabel, enableCgroupns)
default:
// ensure that the destination of the mount is resolved of symlinks at mount time because
// any previous mounts can invalidate the next mount's destination.
@@ -629,7 +603,7 @@ func reOpenDevNull() error {
// Create the device nodes in the container.
func createDevices(config *configs.Config) error {
useBindMount := userns.RunningInUserNS() || config.Namespaces.Contains(configs.NEWUSER)
useBindMount := system.RunningInUserNS() || config.Namespaces.Contains(configs.NEWUSER)
oldMask := unix.Umask(0000)
for _, node := range config.Devices {
@@ -957,20 +931,9 @@ func readonlyPath(path string) error {
if os.IsNotExist(err) {
return nil
}
return &os.PathError{Op: "bind-mount", Path: path, Err: err}
return err
}
var s unix.Statfs_t
if err := unix.Statfs(path, &s); err != nil {
return &os.PathError{Op: "statfs", Path: path, Err: err}
}
flags := uintptr(s.Flags) & (unix.MS_NOSUID | unix.MS_NODEV | unix.MS_NOEXEC)
if err := unix.Mount(path, path, "", flags|unix.MS_BIND|unix.MS_REMOUNT|unix.MS_RDONLY, ""); err != nil {
return &os.PathError{Op: "bind-mount-ro", Path: path, Err: err}
}
return nil
return unix.Mount(path, path, "", unix.MS_BIND|unix.MS_REMOUNT|unix.MS_RDONLY|unix.MS_REC, "")
}
// remountReadonly will remount an existing mount point and ensure that it is read-only.

View File

@@ -3,7 +3,6 @@
package patchbpf
import (
"bytes"
"encoding/binary"
"io"
"os"
@@ -115,26 +114,14 @@ func disassembleFilter(filter *libseccomp.ScmpFilter) ([]bpf.Instruction, error)
defer wtr.Close()
defer rdr.Close()
readerBuffer := new(bytes.Buffer)
errChan := make(chan error, 1)
go func() {
_, err := io.Copy(readerBuffer, rdr)
errChan <- err
close(errChan)
}()
if err := filter.ExportBPF(wtr); err != nil {
return nil, errors.Wrap(err, "exporting BPF")
}
// Close so that the reader actually gets EOF.
_ = wtr.Close()
if copyErr := <-errChan; copyErr != nil {
return nil, errors.Wrap(copyErr, "reading from ExportBPF pipe")
}
// Parse the instructions.
rawProgram, err := parseProgram(readerBuffer)
rawProgram, err := parseProgram(rdr)
if err != nil {
return nil, errors.Wrap(err, "parsing generated BPF filter")
}
@@ -597,12 +584,9 @@ func sysSeccompSetFilter(flags uint, filter []unix.SockFilter) (err error) {
unix.SECCOMP_MODE_FILTER,
uintptr(unsafe.Pointer(&fprog)), 0, 0)
} else {
_, _, errno := unix.RawSyscall(unix.SYS_SECCOMP,
_, _, err = unix.RawSyscall(unix.SYS_SECCOMP,
uintptr(C.C_SET_MODE_FILTER),
uintptr(flags), uintptr(unsafe.Pointer(&fprog)))
if errno != 0 {
err = errno
}
}
runtime.KeepAlive(filter)
runtime.KeepAlive(fprog)

View File

@@ -3,8 +3,11 @@
package seccomp
import (
"bufio"
"errors"
"fmt"
"os"
"strings"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/seccomp/patchbpf"
@@ -77,6 +80,24 @@ func InitSeccomp(config *configs.Seccomp) error {
return nil
}
// IsEnabled returns if the kernel has been configured to support seccomp.
func IsEnabled() bool {
// Try to read from /proc/self/status for kernels > 3.8
s, err := parseStatusFile("/proc/self/status")
if err != nil {
// Check if Seccomp is supported, via CONFIG_SECCOMP.
if err := unix.Prctl(unix.PR_GET_SECCOMP, 0, 0, 0, 0); err != unix.EINVAL {
// Make sure the kernel has CONFIG_SECCOMP_FILTER.
if err := unix.Prctl(unix.PR_SET_SECCOMP, unix.SECCOMP_MODE_FILTER, 0, 0, 0); err != unix.EINVAL {
return true
}
}
return false
}
_, ok := s["Seccomp"]
return ok
}
// Convert Libcontainer Action to Libseccomp ScmpAction
func getAction(act configs.Action, errnoRet *uint) (libseccomp.ScmpAction, error) {
switch act {
@@ -216,6 +237,33 @@ func matchCall(filter *libseccomp.ScmpFilter, call *configs.Syscall) error {
return nil
}
func parseStatusFile(path string) (map[string]string, error) {
f, err := os.Open(path)
if err != nil {
return nil, err
}
defer f.Close()
s := bufio.NewScanner(f)
status := make(map[string]string)
for s.Scan() {
text := s.Text()
parts := strings.Split(text, ":")
if len(parts) <= 1 {
continue
}
status[parts[0]] = parts[1]
}
if err := s.Err(); err != nil {
return nil, err
}
return status, nil
}
// Version returns major, minor, and micro.
func Version() (uint, uint, uint) {
return libseccomp.GetLibraryVersion()

View File

@@ -18,6 +18,11 @@ func InitSeccomp(config *configs.Seccomp) error {
return nil
}
// IsEnabled returns false, because it is not supported.
func IsEnabled() bool {
return false
}
// Version returns major, minor, and micro.
func Version() (uint, uint, uint) {
return 0, 0, 0

View File

@@ -12,7 +12,6 @@ import (
"github.com/opencontainers/runc/libcontainer/system"
"github.com/opencontainers/selinux/go-selinux"
"github.com/pkg/errors"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
)
@@ -22,7 +21,6 @@ type linuxSetnsInit struct {
pipe *os.File
consoleSocket *os.File
config *initConfig
logFd int
}
func (l *linuxSetnsInit) getSessionRingName() string {
@@ -88,11 +86,5 @@ func (l *linuxSetnsInit) Init() error {
return newSystemErrorWithCause(err, "init seccomp")
}
}
logrus.Debugf("setns_init: about to exec")
// Close the log pipe fd so the parent's ForwardLogs can exit.
if err := unix.Close(l.logFd); err != nil {
return newSystemErrorWithCause(err, "closing log pipe fd")
}
return system.Execv(l.config.Args[0], l.config.Args[0:], os.Environ())
}

View File

@@ -16,7 +16,6 @@ import (
"github.com/opencontainers/runtime-spec/specs-go"
"github.com/opencontainers/selinux/go-selinux"
"github.com/pkg/errors"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
)
@@ -25,7 +24,6 @@ type linuxStandardInit struct {
consoleSocket *os.File
parentPid int
fifoFd int
logFd int
config *initConfig
}
@@ -182,14 +180,7 @@ func (l *linuxStandardInit) Init() error {
return err
}
// Close the pipe to signal that we have completed our init.
logrus.Debugf("init: closing the pipe to signal completion")
l.pipe.Close()
// Close the log pipe fd so the parent's ForwardLogs can exit.
if err := unix.Close(l.logFd); err != nil {
return newSystemErrorWithCause(err, "closing log pipe fd")
}
// Wait for the FIFO to be opened on the other side before exec-ing the
// user process. We open it through /proc/self/fd/$fd, because the fd that
// was given to us was an O_PATH fd to the fifo itself. Linux allows us to

View File

@@ -3,9 +3,12 @@
package system
import (
"os"
"os/exec"
"sync"
"unsafe"
"github.com/opencontainers/runc/libcontainer/user"
"golang.org/x/sys/unix"
)
@@ -84,6 +87,52 @@ func Setctty() error {
return nil
}
var (
inUserNS bool
nsOnce sync.Once
)
// RunningInUserNS detects whether we are currently running in a user namespace.
// Originally copied from github.com/lxc/lxd/shared/util.go
func RunningInUserNS() bool {
nsOnce.Do(func() {
uidmap, err := user.CurrentProcessUIDMap()
if err != nil {
// This kernel-provided file only exists if user namespaces are supported
return
}
inUserNS = UIDMapInUserNS(uidmap)
})
return inUserNS
}
func UIDMapInUserNS(uidmap []user.IDMap) bool {
/*
* We assume we are in the initial user namespace if we have a full
* range - 4294967295 uids starting at uid 0.
*/
if len(uidmap) == 1 && uidmap[0].ID == 0 && uidmap[0].ParentID == 0 && uidmap[0].Count == 4294967295 {
return false
}
return true
}
// GetParentNSeuid returns the euid within the parent user namespace
func GetParentNSeuid() int64 {
euid := int64(os.Geteuid())
uidmap, err := user.CurrentProcessUIDMap()
if err != nil {
// This kernel-provided file only exists if user namespaces are supported
return euid
}
for _, um := range uidmap {
if um.ID <= euid && euid <= um.ID+um.Count-1 {
return um.ParentID + euid - um.ID
}
}
return euid
}
// SetSubreaper sets the value i as the subreaper setting for the calling process
func SetSubreaper(i int) error {
return unix.Prctl(unix.PR_SET_CHILD_SUBREAPER, uintptr(i), 0, 0, 0)

View File

@@ -0,0 +1,27 @@
// +build !linux
package system
import (
"os"
"github.com/opencontainers/runc/libcontainer/user"
)
// RunningInUserNS is a stub for non-Linux systems
// Always returns false
func RunningInUserNS() bool {
return false
}
// UIDMapInUserNS is a stub for non-Linux systems
// Always returns false
func UIDMapInUserNS(uidmap []user.IDMap) bool {
return false
}
// GetParentNSeuid returns the euid within the parent user namespace
// Always returns os.Geteuid on non-linux
func GetParentNSeuid() int {
return os.Geteuid()
}

View File

@@ -1,5 +0,0 @@
package system
import "github.com/opencontainers/runc/libcontainer/userns"
var RunningInUserNS = userns.RunningInUserNS

View File

@@ -0,0 +1,2 @@
Tianon Gravi <admwiggin@gmail.com> (@tianon)
Aleksa Sarai <cyphar@cyphar.com> (@cyphar)

View File

@@ -0,0 +1,41 @@
package user
import (
"errors"
)
var (
// The current operating system does not provide the required data for user lookups.
ErrUnsupported = errors.New("user lookup: operating system does not provide passwd-formatted data")
// No matching entries found in file.
ErrNoPasswdEntries = errors.New("no matching entries in passwd file")
ErrNoGroupEntries = errors.New("no matching entries in group file")
)
// LookupUser looks up a user by their username in /etc/passwd. If the user
// cannot be found (or there is no /etc/passwd file on the filesystem), then
// LookupUser returns an error.
func LookupUser(username string) (User, error) {
return lookupUser(username)
}
// LookupUid looks up a user by their user id in /etc/passwd. If the user cannot
// be found (or there is no /etc/passwd file on the filesystem), then LookupId
// returns an error.
func LookupUid(uid int) (User, error) {
return lookupUid(uid)
}
// LookupGroup looks up a group by its name in /etc/group. If the group cannot
// be found (or there is no /etc/group file on the filesystem), then LookupGroup
// returns an error.
func LookupGroup(groupname string) (Group, error) {
return lookupGroup(groupname)
}
// LookupGid looks up a group by its group id in /etc/group. If the group cannot
// be found (or there is no /etc/group file on the filesystem), then LookupGid
// returns an error.
func LookupGid(gid int) (Group, error) {
return lookupGid(gid)
}

View File

@@ -16,19 +16,13 @@ const (
unixGroupPath = "/etc/group"
)
// LookupUser looks up a user by their username in /etc/passwd. If the user
// cannot be found (or there is no /etc/passwd file on the filesystem), then
// LookupUser returns an error.
func LookupUser(username string) (User, error) {
func lookupUser(username string) (User, error) {
return lookupUserFunc(func(u User) bool {
return u.Name == username
})
}
// LookupUid looks up a user by their user id in /etc/passwd. If the user cannot
// be found (or there is no /etc/passwd file on the filesystem), then LookupId
// returns an error.
func LookupUid(uid int) (User, error) {
func lookupUid(uid int) (User, error) {
return lookupUserFunc(func(u User) bool {
return u.Uid == uid
})
@@ -57,19 +51,13 @@ func lookupUserFunc(filter func(u User) bool) (User, error) {
return users[0], nil
}
// LookupGroup looks up a group by its name in /etc/group. If the group cannot
// be found (or there is no /etc/group file on the filesystem), then LookupGroup
// returns an error.
func LookupGroup(groupname string) (Group, error) {
func lookupGroup(groupname string) (Group, error) {
return lookupGroupFunc(func(g Group) bool {
return g.Name == groupname
})
}
// LookupGid looks up a group by its group id in /etc/group. If the group cannot
// be found (or there is no /etc/group file on the filesystem), then LookupGid
// returns an error.
func LookupGid(gid int) (Group, error) {
func lookupGid(gid int) (Group, error) {
return lookupGroupFunc(func(g Group) bool {
return g.Gid == gid
})

View File

@@ -0,0 +1,40 @@
// +build windows
package user
import (
"os/user"
"strconv"
)
func lookupUser(username string) (User, error) {
u, err := user.Lookup(username)
if err != nil {
return User{}, err
}
return userFromOS(u)
}
func lookupUid(uid int) (User, error) {
u, err := user.LookupId(strconv.Itoa(uid))
if err != nil {
return User{}, err
}
return userFromOS(u)
}
func lookupGroup(groupname string) (Group, error) {
g, err := user.LookupGroup(groupname)
if err != nil {
return Group{}, err
}
return groupFromOS(g)
}
func lookupGid(gid int) (Group, error) {
g, err := user.LookupGroupId(strconv.Itoa(gid))
if err != nil {
return Group{}, err
}
return groupFromOS(g)
}

View File

@@ -2,10 +2,10 @@ package user
import (
"bufio"
"errors"
"fmt"
"io"
"os"
"os/user"
"strconv"
"strings"
)
@@ -16,13 +16,6 @@ const (
)
var (
// The current operating system does not provide the required data for user lookups.
ErrUnsupported = errors.New("user lookup: operating system does not provide passwd-formatted data")
// No matching entries found in file.
ErrNoPasswdEntries = errors.New("no matching entries in passwd file")
ErrNoGroupEntries = errors.New("no matching entries in group file")
ErrRange = fmt.Errorf("uids and gids must be in range %d-%d", minId, maxId)
)
@@ -36,6 +29,28 @@ type User struct {
Shell string
}
// userFromOS converts an os/user.(*User) to local User
//
// (This does not include Pass, Shell or Gecos)
func userFromOS(u *user.User) (User, error) {
newUser := User{
Name: u.Username,
Home: u.HomeDir,
}
id, err := strconv.Atoi(u.Uid)
if err != nil {
return newUser, err
}
newUser.Uid = id
id, err = strconv.Atoi(u.Gid)
if err != nil {
return newUser, err
}
newUser.Gid = id
return newUser, nil
}
type Group struct {
Name string
Pass string
@@ -43,6 +58,23 @@ type Group struct {
List []string
}
// groupFromOS converts an os/user.(*Group) to local Group
//
// (This does not include Pass or List)
func groupFromOS(g *user.Group) (Group, error) {
newGroup := Group{
Name: g.Name,
}
id, err := strconv.Atoi(g.Gid)
if err != nil {
return newGroup, err
}
newGroup.Gid = id
return newGroup, nil
}
// SubID represents an entry in /etc/sub{u,g}id
type SubID struct {
Name string

View File

@@ -1,42 +0,0 @@
// +build gofuzz
package user
import (
"io"
"strings"
)
func IsDivisbleBy(n int, divisibleby int) bool {
return (n % divisibleby) == 0
}
func FuzzUser(data []byte) int {
if len(data) == 0 {
return -1
}
if !IsDivisbleBy(len(data), 5) {
return -1
}
var divided [][]byte
chunkSize := len(data) / 5
for i := 0; i < len(data); i += chunkSize {
end := i + chunkSize
divided = append(divided, data[i:end])
}
_, _ = ParsePasswdFilter(strings.NewReader(string(divided[0])), nil)
var passwd, group io.Reader
group = strings.NewReader(string(divided[1]))
_, _ = GetAdditionalGroups([]string{string(divided[2])}, group)
passwd = strings.NewReader(string(divided[3]))
_, _ = GetExecUser(string(divided[4]), nil, passwd, group)
return 1
}

View File

@@ -1,5 +0,0 @@
package userns
// RunningInUserNS detects whether we are currently running in a user namespace.
// Originally copied from github.com/lxc/lxd/shared/util.go
var RunningInUserNS = runningInUserNS

View File

@@ -1,15 +0,0 @@
// +build gofuzz
package userns
import (
"strings"
"github.com/opencontainers/runc/libcontainer/user"
)
func FuzzUIDMap(data []byte) int {
uidmap, _ := user.ParseIDMap(strings.NewReader(string(data)))
_ = uidMapInUserNS(uidmap)
return 1
}

View File

@@ -1,37 +0,0 @@
package userns
import (
"sync"
"github.com/opencontainers/runc/libcontainer/user"
)
var (
inUserNS bool
nsOnce sync.Once
)
// runningInUserNS detects whether we are currently running in a user namespace.
// Originally copied from github.com/lxc/lxd/shared/util.go
func runningInUserNS() bool {
nsOnce.Do(func() {
uidmap, err := user.CurrentProcessUIDMap()
if err != nil {
// This kernel-provided file only exists if user namespaces are supported
return
}
inUserNS = uidMapInUserNS(uidmap)
})
return inUserNS
}
func uidMapInUserNS(uidmap []user.IDMap) bool {
/*
* We assume we are in the initial user namespace if we have a full
* range - 4294967295 uids starting at uid 0.
*/
if len(uidmap) == 1 && uidmap[0].ID == 0 && uidmap[0].ParentID == 0 && uidmap[0].Count == 4294967295 {
return false
}
return true
}

View File

@@ -1,17 +0,0 @@
// +build !linux
package userns
import "github.com/opencontainers/runc/libcontainer/user"
// runningInUserNS is a stub for non-Linux systems
// Always returns false
func runningInUserNS() bool {
return false
}
// uidMapInUserNS is a stub for non-Linux systems
// Always returns false
func uidMapInUserNS(uidmap []user.IDMap) bool {
return false
}