vendor: cadvisor v0.39.0

Main upgrades:
- github.com/opencontainers/runc v1.0.0-rc93
- github.com/containerd/containerd v1.4.4
- github.com/docker/docker v20.10.2
- github.com/mrunalp/fileutils v0.5.0
- github.com/opencontainers/selinux v1.8.0
- github.com/cilium/ebpf v0.2.0
This commit is contained in:
David Porter
2021-03-08 22:09:22 -08:00
parent faa3a5fbd4
commit b5dd78da3d
286 changed files with 7427 additions and 4415 deletions

View File

@@ -27,29 +27,29 @@ import (
"sort"
"strconv"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/devices"
"github.com/pkg/errors"
)
// deviceMeta is a DeviceRule without the Allow or Permissions fields, and no
// deviceMeta is a Rule without the Allow or Permissions fields, and no
// wildcard-type support. It's effectively the "match" portion of a metadata
// rule, for the purposes of our emulation.
type deviceMeta struct {
node configs.DeviceType
node devices.Type
major int64
minor int64
}
// deviceRule is effectively the tuple (deviceMeta, DevicePermissions).
// deviceRule is effectively the tuple (deviceMeta, Permissions).
type deviceRule struct {
meta deviceMeta
perms configs.DevicePermissions
perms devices.Permissions
}
// deviceRules is a mapping of device metadata rules to the associated
// permissions in the ruleset.
type deviceRules map[deviceMeta]configs.DevicePermissions
type deviceRules map[deviceMeta]devices.Permissions
func (r deviceRules) orderedEntries() []deviceRule {
var rules []deviceRule
@@ -103,9 +103,9 @@ func parseLine(line string) (*deviceRule, error) {
// TODO: Double-check that the entire file is "a *:* rwm".
return nil, nil
case "b":
rule.meta.node = configs.BlockDevice
rule.meta.node = devices.BlockDevice
case "c":
rule.meta.node = configs.CharDevice
rule.meta.node = devices.CharDevice
default:
// Should never happen!
return nil, errors.Errorf("unknown device type %q", node)
@@ -113,7 +113,7 @@ func parseLine(line string) (*deviceRule, error) {
// Parse the major number.
if major == "*" {
rule.meta.major = configs.Wildcard
rule.meta.major = devices.Wildcard
} else {
val, err := strconv.ParseUint(major, 10, 32)
if err != nil {
@@ -124,7 +124,7 @@ func parseLine(line string) (*deviceRule, error) {
// Parse the minor number.
if minor == "*" {
rule.meta.minor = configs.Wildcard
rule.meta.minor = devices.Wildcard
} else {
val, err := strconv.ParseUint(minor, 10, 32)
if err != nil {
@@ -134,7 +134,7 @@ func parseLine(line string) (*deviceRule, error) {
}
// Parse the access permissions.
rule.perms = configs.DevicePermissions(perms)
rule.perms = devices.Permissions(perms)
if !rule.perms.IsValid() || rule.perms.IsEmpty() {
// Should never happen!
return nil, errors.Errorf("parse access mode: contained unknown modes or is empty: %q", perms)
@@ -144,7 +144,7 @@ func parseLine(line string) (*deviceRule, error) {
func (e *Emulator) addRule(rule deviceRule) error {
if e.rules == nil {
e.rules = make(map[deviceMeta]configs.DevicePermissions)
e.rules = make(map[deviceMeta]devices.Permissions)
}
// Merge with any pre-existing permissions.
@@ -169,9 +169,9 @@ func (e *Emulator) rmRule(rule deviceRule) error {
// to mention it'd be really slow (the kernel side is implemented as a
// linked-list of exceptions).
for _, partialMeta := range []deviceMeta{
{node: rule.meta.node, major: configs.Wildcard, minor: rule.meta.minor},
{node: rule.meta.node, major: rule.meta.major, minor: configs.Wildcard},
{node: rule.meta.node, major: configs.Wildcard, minor: configs.Wildcard},
{node: rule.meta.node, major: devices.Wildcard, minor: rule.meta.minor},
{node: rule.meta.node, major: rule.meta.major, minor: devices.Wildcard},
{node: rule.meta.node, major: devices.Wildcard, minor: devices.Wildcard},
} {
// This wildcard rule is equivalent to the requested rule, so skip it.
if rule.meta == partialMeta {
@@ -202,7 +202,7 @@ func (e *Emulator) rmRule(rule deviceRule) error {
func (e *Emulator) allow(rule *deviceRule) error {
// This cgroup is configured as a black-list. Reset the entire emulator,
// and put is into black-list mode.
if rule == nil || rule.meta.node == configs.WildcardDevice {
if rule == nil || rule.meta.node == devices.WildcardDevice {
*e = Emulator{
defaultAllow: true,
rules: nil,
@@ -222,7 +222,7 @@ func (e *Emulator) allow(rule *deviceRule) error {
func (e *Emulator) deny(rule *deviceRule) error {
// This cgroup is configured as a white-list. Reset the entire emulator,
// and put is into white-list mode.
if rule == nil || rule.meta.node == configs.WildcardDevice {
if rule == nil || rule.meta.node == devices.WildcardDevice {
*e = Emulator{
defaultAllow: false,
rules: nil,
@@ -239,7 +239,7 @@ func (e *Emulator) deny(rule *deviceRule) error {
return err
}
func (e *Emulator) Apply(rule configs.DeviceRule) error {
func (e *Emulator) Apply(rule devices.Rule) error {
if !rule.Type.CanCgroup() {
return errors.Errorf("cannot add rule [%#v] with non-cgroup type %q", rule, rule.Type)
}
@@ -252,7 +252,7 @@ func (e *Emulator) Apply(rule configs.DeviceRule) error {
},
perms: rule.Permissions,
}
if innerRule.meta.node == configs.WildcardDevice {
if innerRule.meta.node == devices.WildcardDevice {
innerRule = nil
}
@@ -307,8 +307,8 @@ func EmulatorFromList(list io.Reader) (*Emulator, error) {
// This function is the sole reason for all of Emulator -- to allow us
// to figure out how to update a containers' cgroups without causing spurrious
// device errors (if possible).
func (source *Emulator) Transition(target *Emulator) ([]*configs.DeviceRule, error) {
var transitionRules []*configs.DeviceRule
func (source *Emulator) Transition(target *Emulator) ([]*devices.Rule, error) {
var transitionRules []*devices.Rule
oldRules := source.rules
// If the default policy doesn't match, we need to include a "disruptive"
@@ -319,11 +319,11 @@ func (source *Emulator) Transition(target *Emulator) ([]*configs.DeviceRule, err
// deny rules are in place in a black-list cgroup. Thus if the source is a
// black-list we also have to include a disruptive rule.
if source.IsBlacklist() || source.defaultAllow != target.defaultAllow {
transitionRules = append(transitionRules, &configs.DeviceRule{
transitionRules = append(transitionRules, &devices.Rule{
Type: 'a',
Major: -1,
Minor: -1,
Permissions: configs.DevicePermissions("rwm"),
Permissions: devices.Permissions("rwm"),
Allow: target.defaultAllow,
})
// The old rules are only relevant if we aren't starting out with a
@@ -342,7 +342,7 @@ func (source *Emulator) Transition(target *Emulator) ([]*configs.DeviceRule, err
newPerms := target.rules[meta]
droppedPerms := oldPerms.Difference(newPerms)
if !droppedPerms.IsEmpty() {
transitionRules = append(transitionRules, &configs.DeviceRule{
transitionRules = append(transitionRules, &devices.Rule{
Type: meta.node,
Major: meta.major,
Minor: meta.minor,
@@ -360,7 +360,7 @@ func (source *Emulator) Transition(target *Emulator) ([]*configs.DeviceRule, err
oldPerms := oldRules[meta]
gainedPerms := newPerms.Difference(oldPerms)
if !gainedPerms.IsEmpty() {
transitionRules = append(transitionRules, &configs.DeviceRule{
transitionRules = append(transitionRules, &devices.Rule{
Type: meta.node,
Major: meta.major,
Minor: meta.minor,

View File

@@ -1,4 +1,4 @@
// Package devicefilter containes eBPF device filter program
// Package devicefilter contains eBPF device filter program
//
// The implementation is based on https://github.com/containers/crun/blob/0.10.2/src/libcrun/ebpf.c
//
@@ -7,11 +7,11 @@
package devicefilter
import (
"fmt"
"math"
"strconv"
"github.com/cilium/ebpf/asm"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/devices"
"github.com/pkg/errors"
"golang.org/x/sys/unix"
)
@@ -22,7 +22,7 @@ const (
)
// DeviceFilter returns eBPF device filter program and its license string
func DeviceFilter(devices []*configs.DeviceRule) (asm.Instructions, string, error) {
func DeviceFilter(devices []*devices.Rule) (asm.Instructions, string, error) {
p := &program{}
p.init()
for i := len(devices) - 1; i >= 0; i-- {
@@ -68,7 +68,7 @@ func (p *program) init() {
}
// appendDevice needs to be called from the last element of OCI linux.resources.devices to the head element.
func (p *program) appendDevice(dev *configs.DeviceRule) error {
func (p *program) appendDevice(dev *devices.Rule) error {
if p.blockID < 0 {
return errors.New("the program is finalized")
}
@@ -88,7 +88,7 @@ func (p *program) appendDevice(dev *configs.DeviceRule) error {
hasType = false
default:
// if not specified in OCI json, typ is set to DeviceTypeAll
return errors.Errorf("invalid DeviceType %q", string(dev.Type))
return errors.Errorf("invalid Type %q", string(dev.Type))
}
if dev.Major > math.MaxUint32 {
return errors.Errorf("invalid major %d", dev.Major)
@@ -114,9 +114,11 @@ func (p *program) appendDevice(dev *configs.DeviceRule) error {
// If the access is rwm, skip the check.
hasAccess := bpfAccess != (unix.BPF_DEVCG_ACC_READ | unix.BPF_DEVCG_ACC_WRITE | unix.BPF_DEVCG_ACC_MKNOD)
blockSym := fmt.Sprintf("block-%d", p.blockID)
nextBlockSym := fmt.Sprintf("block-%d", p.blockID+1)
prevBlockLastIdx := len(p.insts) - 1
var (
blockSym = "block-" + strconv.Itoa(p.blockID)
nextBlockSym = "block-" + strconv.Itoa(p.blockID+1)
prevBlockLastIdx = len(p.insts) - 1
)
if hasType {
p.insts = append(p.insts,
// if (R2 != bpfType) goto next
@@ -158,7 +160,7 @@ func (p *program) finalize() (asm.Instructions, error) {
// acceptBlock with asm.Return() is already inserted
return p.insts, nil
}
blockSym := fmt.Sprintf("block-%d", p.blockID)
blockSym := "block-" + strconv.Itoa(p.blockID)
p.insts = append(p.insts,
// R0 <- 0
asm.Mov.Imm32(asm.R0, 0).Sym(blockSym),

View File

@@ -6,7 +6,6 @@ import (
"bufio"
"fmt"
"os"
"path/filepath"
"strconv"
"strings"
@@ -105,9 +104,9 @@ func splitBlkioStatLine(r rune) bool {
return r == ' ' || r == ':'
}
func getBlkioStat(path string) ([]cgroups.BlkioStatEntry, error) {
func getBlkioStat(dir, file string) ([]cgroups.BlkioStatEntry, error) {
var blkioStats []cgroups.BlkioStatEntry
f, err := os.Open(path)
f, err := fscommon.OpenFile(dir, file, os.O_RDONLY)
if err != nil {
if os.IsNotExist(err) {
return blkioStats, nil
@@ -125,7 +124,7 @@ func getBlkioStat(path string) ([]cgroups.BlkioStatEntry, error) {
// skip total line
continue
} else {
return nil, fmt.Errorf("Invalid line found while parsing %s: %s", path, sc.Text())
return nil, fmt.Errorf("Invalid line found while parsing %s/%s: %s", dir, file, sc.Text())
}
}
@@ -158,73 +157,134 @@ func getBlkioStat(path string) ([]cgroups.BlkioStatEntry, error) {
}
func (s *BlkioGroup) GetStats(path string, stats *cgroups.Stats) error {
// Try to read CFQ stats available on all CFQ enabled kernels first
if blkioStats, err := getBlkioStat(filepath.Join(path, "blkio.io_serviced_recursive")); err == nil && blkioStats != nil {
return getCFQStats(path, stats)
type blkioStatInfo struct {
filename string
blkioStatEntriesPtr *[]cgroups.BlkioStatEntry
}
var bfqDebugStats = []blkioStatInfo{
{
filename: "blkio.bfq.sectors_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.SectorsRecursive,
},
{
filename: "blkio.bfq.io_service_time_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoServiceTimeRecursive,
},
{
filename: "blkio.bfq.io_wait_time_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoWaitTimeRecursive,
},
{
filename: "blkio.bfq.io_merged_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoMergedRecursive,
},
{
filename: "blkio.bfq.io_queued_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoQueuedRecursive,
},
{
filename: "blkio.bfq.time_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoTimeRecursive,
},
{
filename: "blkio.bfq.io_serviced_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoServicedRecursive,
},
{
filename: "blkio.bfq.io_service_bytes_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoServiceBytesRecursive,
},
}
var bfqStats = []blkioStatInfo{
{
filename: "blkio.bfq.io_serviced_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoServicedRecursive,
},
{
filename: "blkio.bfq.io_service_bytes_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoServiceBytesRecursive,
},
}
var cfqStats = []blkioStatInfo{
{
filename: "blkio.sectors_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.SectorsRecursive,
},
{
filename: "blkio.io_service_time_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoServiceTimeRecursive,
},
{
filename: "blkio.io_wait_time_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoWaitTimeRecursive,
},
{
filename: "blkio.io_merged_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoMergedRecursive,
},
{
filename: "blkio.io_queued_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoQueuedRecursive,
},
{
filename: "blkio.time_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoTimeRecursive,
},
{
filename: "blkio.io_serviced_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoServicedRecursive,
},
{
filename: "blkio.io_service_bytes_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoServiceBytesRecursive,
},
}
var throttleRecursiveStats = []blkioStatInfo{
{
filename: "blkio.throttle.io_serviced_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoServicedRecursive,
},
{
filename: "blkio.throttle.io_service_bytes_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoServiceBytesRecursive,
},
}
var baseStats = []blkioStatInfo{
{
filename: "blkio.throttle.io_serviced",
blkioStatEntriesPtr: &stats.BlkioStats.IoServicedRecursive,
},
{
filename: "blkio.throttle.io_service_bytes",
blkioStatEntriesPtr: &stats.BlkioStats.IoServiceBytesRecursive,
},
}
var orderedStats = [][]blkioStatInfo{
bfqDebugStats,
bfqStats,
cfqStats,
throttleRecursiveStats,
baseStats,
}
return getStats(path, stats) // Use generic stats as fallback
}
func getCFQStats(path string, stats *cgroups.Stats) error {
var blkioStats []cgroups.BlkioStatEntry
var err error
if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.sectors_recursive")); err != nil {
return err
for _, statGroup := range orderedStats {
for i, statInfo := range statGroup {
if blkioStats, err = getBlkioStat(path, statInfo.filename); err != nil || blkioStats == nil {
// if error occurs on first file, move to next group
if i == 0 {
break
}
return err
}
*statInfo.blkioStatEntriesPtr = blkioStats
//finish if all stats are gathered
if i == len(statGroup)-1 {
return nil
}
}
}
stats.BlkioStats.SectorsRecursive = blkioStats
if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.io_service_bytes_recursive")); err != nil {
return err
}
stats.BlkioStats.IoServiceBytesRecursive = blkioStats
if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.io_serviced_recursive")); err != nil {
return err
}
stats.BlkioStats.IoServicedRecursive = blkioStats
if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.io_queued_recursive")); err != nil {
return err
}
stats.BlkioStats.IoQueuedRecursive = blkioStats
if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.io_service_time_recursive")); err != nil {
return err
}
stats.BlkioStats.IoServiceTimeRecursive = blkioStats
if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.io_wait_time_recursive")); err != nil {
return err
}
stats.BlkioStats.IoWaitTimeRecursive = blkioStats
if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.io_merged_recursive")); err != nil {
return err
}
stats.BlkioStats.IoMergedRecursive = blkioStats
if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.time_recursive")); err != nil {
return err
}
stats.BlkioStats.IoTimeRecursive = blkioStats
return nil
}
func getStats(path string, stats *cgroups.Stats) error {
var blkioStats []cgroups.BlkioStatEntry
var err error
if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.throttle.io_service_bytes")); err != nil {
return err
}
stats.BlkioStats.IoServiceBytesRecursive = blkioStats
if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.throttle.io_serviced")); err != nil {
return err
}
stats.BlkioStats.IoServicedRecursive = blkioStats
return nil
}

View File

@@ -6,7 +6,6 @@ import (
"bufio"
"fmt"
"os"
"path/filepath"
"strconv"
"github.com/opencontainers/runc/libcontainer/cgroups"
@@ -87,7 +86,7 @@ func (s *CpuGroup) Set(path string, cgroup *configs.Cgroup) error {
}
func (s *CpuGroup) GetStats(path string, stats *cgroups.Stats) error {
f, err := os.Open(filepath.Join(path, "cpu.stat"))
f, err := fscommon.OpenFile(path, "cpu.stat", os.O_RDONLY)
if err != nil {
if os.IsNotExist(err) {
return nil

View File

@@ -5,7 +5,6 @@ package fs
import (
"bufio"
"fmt"
"io/ioutil"
"os"
"path/filepath"
"strconv"
@@ -83,8 +82,7 @@ func (s *CpuacctGroup) GetStats(path string, stats *cgroups.Stats) error {
// Returns user and kernel usage breakdown in nanoseconds.
func getCpuUsageBreakdown(path string) (uint64, uint64, error) {
userModeUsage := uint64(0)
kernelModeUsage := uint64(0)
var userModeUsage, kernelModeUsage uint64
const (
userField = "user"
systemField = "system"
@@ -93,11 +91,11 @@ func getCpuUsageBreakdown(path string) (uint64, uint64, error) {
// Expected format:
// user <usage in ticks>
// system <usage in ticks>
data, err := ioutil.ReadFile(filepath.Join(path, cgroupCpuacctStat))
data, err := fscommon.ReadFile(path, cgroupCpuacctStat)
if err != nil {
return 0, 0, err
}
fields := strings.Fields(string(data))
fields := strings.Fields(data)
if len(fields) < 4 {
return 0, 0, fmt.Errorf("failure - %s is expected to have at least 4 fields", filepath.Join(path, cgroupCpuacctStat))
}
@@ -119,11 +117,11 @@ func getCpuUsageBreakdown(path string) (uint64, uint64, error) {
func getPercpuUsage(path string) ([]uint64, error) {
percpuUsage := []uint64{}
data, err := ioutil.ReadFile(filepath.Join(path, "cpuacct.usage_percpu"))
data, err := fscommon.ReadFile(path, "cpuacct.usage_percpu")
if err != nil {
return percpuUsage, err
}
for _, value := range strings.Fields(string(data)) {
for _, value := range strings.Fields(data) {
value, err := strconv.ParseUint(value, 10, 64)
if err != nil {
return percpuUsage, fmt.Errorf("Unable to convert param value to uint64: %s", err)
@@ -137,7 +135,7 @@ func getPercpuUsageInModes(path string) ([]uint64, []uint64, error) {
usageKernelMode := []uint64{}
usageUserMode := []uint64{}
file, err := os.Open(filepath.Join(path, cgroupCpuacctUsageAll))
file, err := fscommon.OpenFile(path, cgroupCpuacctUsageAll, os.O_RDONLY)
if os.IsNotExist(err) {
return usageKernelMode, usageUserMode, nil
} else if err != nil {

View File

@@ -3,17 +3,17 @@
package fs
import (
"bytes"
"io/ioutil"
"fmt"
"os"
"path/filepath"
"strconv"
"strings"
"github.com/moby/sys/mountinfo"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"github.com/opencontainers/runc/libcontainer/configs"
libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils"
"github.com/pkg/errors"
"golang.org/x/sys/unix"
)
type CpusetGroup struct {
@@ -41,30 +41,107 @@ func (s *CpusetGroup) Set(path string, cgroup *configs.Cgroup) error {
return nil
}
func (s *CpusetGroup) GetStats(path string, stats *cgroups.Stats) error {
return nil
}
// Get the source mount point of directory passed in as argument.
func getMount(dir string) (string, error) {
mi, err := mountinfo.GetMounts(mountinfo.ParentsFilter(dir))
func getCpusetStat(path string, filename string) ([]uint16, error) {
var extracted []uint16
fileContent, err := fscommon.GetCgroupParamString(path, filename)
if err != nil {
return "", err
return extracted, err
}
if len(mi) < 1 {
return "", errors.Errorf("Can't find mount point of %s", dir)
if len(fileContent) == 0 {
return extracted, fmt.Errorf("%s found to be empty", filepath.Join(path, filename))
}
// find the longest mount point
var idx, maxlen int
for i := range mi {
if len(mi[i].Mountpoint) > maxlen {
maxlen = len(mi[i].Mountpoint)
idx = i
for _, s := range strings.Split(fileContent, ",") {
splitted := strings.SplitN(s, "-", 3)
switch len(splitted) {
case 3:
return extracted, fmt.Errorf("invalid values in %s", filepath.Join(path, filename))
case 2:
min, err := strconv.ParseUint(splitted[0], 10, 16)
if err != nil {
return extracted, err
}
max, err := strconv.ParseUint(splitted[1], 10, 16)
if err != nil {
return extracted, err
}
if min > max {
return extracted, fmt.Errorf("invalid values in %s", filepath.Join(path, filename))
}
for i := min; i <= max; i++ {
extracted = append(extracted, uint16(i))
}
case 1:
value, err := strconv.ParseUint(s, 10, 16)
if err != nil {
return extracted, err
}
extracted = append(extracted, uint16(value))
}
}
return mi[idx].Mountpoint, nil
return extracted, nil
}
func (s *CpusetGroup) GetStats(path string, stats *cgroups.Stats) error {
var err error
stats.CPUSetStats.CPUs, err = getCpusetStat(path, "cpuset.cpus")
if err != nil && !errors.Is(err, os.ErrNotExist) {
return err
}
stats.CPUSetStats.CPUExclusive, err = fscommon.GetCgroupParamUint(path, "cpuset.cpu_exclusive")
if err != nil && !errors.Is(err, os.ErrNotExist) {
return err
}
stats.CPUSetStats.Mems, err = getCpusetStat(path, "cpuset.mems")
if err != nil && !errors.Is(err, os.ErrNotExist) {
return err
}
stats.CPUSetStats.MemHardwall, err = fscommon.GetCgroupParamUint(path, "cpuset.mem_hardwall")
if err != nil && !errors.Is(err, os.ErrNotExist) {
return err
}
stats.CPUSetStats.MemExclusive, err = fscommon.GetCgroupParamUint(path, "cpuset.mem_exclusive")
if err != nil && !errors.Is(err, os.ErrNotExist) {
return err
}
stats.CPUSetStats.MemoryMigrate, err = fscommon.GetCgroupParamUint(path, "cpuset.memory_migrate")
if err != nil && !errors.Is(err, os.ErrNotExist) {
return err
}
stats.CPUSetStats.MemorySpreadPage, err = fscommon.GetCgroupParamUint(path, "cpuset.memory_spread_page")
if err != nil && !errors.Is(err, os.ErrNotExist) {
return err
}
stats.CPUSetStats.MemorySpreadSlab, err = fscommon.GetCgroupParamUint(path, "cpuset.memory_spread_slab")
if err != nil && !errors.Is(err, os.ErrNotExist) {
return err
}
stats.CPUSetStats.MemoryPressure, err = fscommon.GetCgroupParamUint(path, "cpuset.memory_pressure")
if err != nil && !errors.Is(err, os.ErrNotExist) {
return err
}
stats.CPUSetStats.SchedLoadBalance, err = fscommon.GetCgroupParamUint(path, "cpuset.sched_load_balance")
if err != nil && !errors.Is(err, os.ErrNotExist) {
return err
}
stats.CPUSetStats.SchedRelaxDomainLevel, err = fscommon.GetCgroupParamInt(path, "cpuset.sched_relax_domain_level")
if err != nil && !errors.Is(err, os.ErrNotExist) {
return err
}
return nil
}
func (s *CpusetGroup) ApplyDir(dir string, cgroup *configs.Cgroup, pid int) error {
@@ -73,18 +150,13 @@ func (s *CpusetGroup) ApplyDir(dir string, cgroup *configs.Cgroup, pid int) erro
if dir == "" {
return nil
}
root, err := getMount(dir)
if err != nil {
return err
}
root = filepath.Dir(root)
// 'ensureParent' start with parent because we don't want to
// explicitly inherit from parent, it could conflict with
// 'cpuset.cpu_exclusive'.
if err := s.ensureParent(filepath.Dir(dir), root); err != nil {
if err := cpusetEnsureParent(filepath.Dir(dir)); err != nil {
return err
}
if err := os.MkdirAll(dir, 0755); err != nil {
if err := os.Mkdir(dir, 0755); err != nil && !os.IsExist(err) {
return err
}
// We didn't inherit cpuset configs from parent, but we have
@@ -103,59 +175,61 @@ func (s *CpusetGroup) ApplyDir(dir string, cgroup *configs.Cgroup, pid int) erro
return cgroups.WriteCgroupProc(dir, pid)
}
func (s *CpusetGroup) getSubsystemSettings(parent string) (cpus []byte, mems []byte, err error) {
if cpus, err = ioutil.ReadFile(filepath.Join(parent, "cpuset.cpus")); err != nil {
func getCpusetSubsystemSettings(parent string) (cpus, mems string, err error) {
if cpus, err = fscommon.ReadFile(parent, "cpuset.cpus"); err != nil {
return
}
if mems, err = ioutil.ReadFile(filepath.Join(parent, "cpuset.mems")); err != nil {
if mems, err = fscommon.ReadFile(parent, "cpuset.mems"); err != nil {
return
}
return cpus, mems, nil
}
// ensureParent makes sure that the parent directory of current is created
// and populated with the proper cpus and mems files copied from
// it's parent.
func (s *CpusetGroup) ensureParent(current, root string) error {
// cpusetEnsureParent makes sure that the parent directories of current
// are created and populated with the proper cpus and mems files copied
// from their respective parent. It does that recursively, starting from
// the top of the cpuset hierarchy (i.e. cpuset cgroup mount point).
func cpusetEnsureParent(current string) error {
var st unix.Statfs_t
parent := filepath.Dir(current)
if libcontainerUtils.CleanPath(parent) == root {
err := unix.Statfs(parent, &st)
if err == nil && st.Type != unix.CGROUP_SUPER_MAGIC {
return nil
}
// Avoid infinite recursion.
if parent == current {
return errors.New("cpuset: cgroup parent path outside cgroup root")
// Treat non-existing directory as cgroupfs as it will be created,
// and the root cpuset directory obviously exists.
if err != nil && err != unix.ENOENT {
return &os.PathError{Op: "statfs", Path: parent, Err: err}
}
if err := s.ensureParent(parent, root); err != nil {
if err := cpusetEnsureParent(parent); err != nil {
return err
}
if err := os.MkdirAll(current, 0755); err != nil {
if err := os.Mkdir(current, 0755); err != nil && !os.IsExist(err) {
return err
}
return s.copyIfNeeded(current, parent)
return cpusetCopyIfNeeded(current, parent)
}
// copyIfNeeded copies the cpuset.cpus and cpuset.mems from the parent
// cpusetCopyIfNeeded copies the cpuset.cpus and cpuset.mems from the parent
// directory to the current directory if the file's contents are 0
func (s *CpusetGroup) copyIfNeeded(current, parent string) error {
var (
err error
currentCpus, currentMems []byte
parentCpus, parentMems []byte
)
if currentCpus, currentMems, err = s.getSubsystemSettings(current); err != nil {
func cpusetCopyIfNeeded(current, parent string) error {
currentCpus, currentMems, err := getCpusetSubsystemSettings(current)
if err != nil {
return err
}
if parentCpus, parentMems, err = s.getSubsystemSettings(parent); err != nil {
parentCpus, parentMems, err := getCpusetSubsystemSettings(parent)
if err != nil {
return err
}
if s.isEmpty(currentCpus) {
if isEmptyCpuset(currentCpus) {
if err := fscommon.WriteFile(current, "cpuset.cpus", string(parentCpus)); err != nil {
return err
}
}
if s.isEmpty(currentMems) {
if isEmptyCpuset(currentMems) {
if err := fscommon.WriteFile(current, "cpuset.mems", string(parentMems)); err != nil {
return err
}
@@ -163,13 +237,13 @@ func (s *CpusetGroup) copyIfNeeded(current, parent string) error {
return nil
}
func (s *CpusetGroup) isEmpty(b []byte) bool {
return len(bytes.Trim(b, "\n")) == 0
func isEmptyCpuset(str string) bool {
return str == "" || str == "\n"
}
func (s *CpusetGroup) ensureCpusAndMems(path string, cgroup *configs.Cgroup) error {
if err := s.Set(path, cgroup); err != nil {
return err
}
return s.copyIfNeeded(path, filepath.Dir(path))
return cpusetCopyIfNeeded(path, filepath.Dir(path))
}

View File

@@ -8,9 +8,10 @@ import (
"reflect"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/devices"
cgroupdevices "github.com/opencontainers/runc/libcontainer/cgroups/devices"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/devices"
"github.com/opencontainers/runc/libcontainer/system"
)
@@ -34,17 +35,17 @@ func (s *DevicesGroup) Apply(path string, d *cgroupData) error {
return join(path, d.pid)
}
func loadEmulator(path string) (*devices.Emulator, error) {
func loadEmulator(path string) (*cgroupdevices.Emulator, error) {
list, err := fscommon.ReadFile(path, "devices.list")
if err != nil {
return nil, err
}
return devices.EmulatorFromList(bytes.NewBufferString(list))
return cgroupdevices.EmulatorFromList(bytes.NewBufferString(list))
}
func buildEmulator(rules []*configs.DeviceRule) (*devices.Emulator, error) {
func buildEmulator(rules []*devices.Rule) (*cgroupdevices.Emulator, error) {
// This defaults to a white-list -- which is what we want!
emu := &devices.Emulator{}
emu := &cgroupdevices.Emulator{}
for _, rule := range rules {
if err := emu.Apply(*rule); err != nil {
return nil, err

View File

@@ -28,33 +28,54 @@ func (s *FreezerGroup) Apply(path string, d *cgroupData) error {
func (s *FreezerGroup) Set(path string, cgroup *configs.Cgroup) error {
switch cgroup.Resources.Freezer {
case configs.Frozen, configs.Thawed:
for {
// In case this loop does not exit because it doesn't get the expected
// state, let's write again this state, hoping it's going to be properly
// set this time. Otherwise, this loop could run infinitely, waiting for
// a state change that would never happen.
if err := fscommon.WriteFile(path, "freezer.state", string(cgroup.Resources.Freezer)); err != nil {
case configs.Frozen:
// As per older kernel docs (freezer-subsystem.txt before
// kernel commit ef9fe980c6fcc1821), if FREEZING is seen,
// userspace should either retry or thaw. While current
// kernel cgroup v1 docs no longer mention a need to retry,
// the kernel (tested on v5.4, Ubuntu 20.04) can't reliably
// freeze a cgroup while new processes keep appearing in it
// (either via fork/clone or by writing new PIDs to
// cgroup.procs).
//
// The number of retries below is chosen to have a decent
// chance to succeed even in the worst case scenario (runc
// pause/unpause with parallel runc exec).
//
// Adding any amount of sleep in between retries did not
// increase the chances of successful freeze.
for i := 0; i < 1000; i++ {
if err := fscommon.WriteFile(path, "freezer.state", string(configs.Frozen)); err != nil {
return err
}
state, err := s.GetState(path)
state, err := fscommon.ReadFile(path, "freezer.state")
if err != nil {
return err
}
if state == cgroup.Resources.Freezer {
break
state = strings.TrimSpace(state)
switch state {
case "FREEZING":
continue
case string(configs.Frozen):
return nil
default:
// should never happen
return fmt.Errorf("unexpected state %s while freezing", strings.TrimSpace(state))
}
time.Sleep(1 * time.Millisecond)
}
// Despite our best efforts, it got stuck in FREEZING.
// Leaving it in this state is bad and dangerous, so
// let's (try to) thaw it back and error out.
_ = fscommon.WriteFile(path, "freezer.state", string(configs.Thawed))
return errors.New("unable to freeze")
case configs.Thawed:
return fscommon.WriteFile(path, "freezer.state", string(configs.Thawed))
case configs.Undefined:
return nil
default:
return fmt.Errorf("Invalid argument '%s' to freezer.state", string(cgroup.Resources.Freezer))
}
return nil
}
func (s *FreezerGroup) GetStats(path string, stats *cgroups.Stats) error {

View File

@@ -3,11 +3,9 @@
package fs
import (
"bufio"
"fmt"
"os"
"path/filepath"
"strings"
"sync"
"github.com/opencontainers/runc/libcontainer/cgroups"
@@ -133,46 +131,19 @@ func getCgroupRoot() (string, error) {
return cgroupRoot, nil
}
// slow path: parse mountinfo, find the first mount where fs=cgroup
// (e.g. "/sys/fs/cgroup/memory"), use its parent.
f, err := os.Open("/proc/self/mountinfo")
// slow path: parse mountinfo
mi, err := cgroups.GetCgroupMounts(false)
if err != nil {
return "", err
}
defer f.Close()
var root string
scanner := bufio.NewScanner(f)
for scanner.Scan() {
text := scanner.Text()
fields := strings.Split(text, " ")
// Safe as mountinfo encodes mountpoints with spaces as \040.
index := strings.Index(text, " - ")
postSeparatorFields := strings.Fields(text[index+3:])
numPostFields := len(postSeparatorFields)
// This is an error as we can't detect if the mount is for "cgroup"
if numPostFields == 0 {
return "", fmt.Errorf("mountinfo: found no fields post '-' in %q", text)
}
if postSeparatorFields[0] == "cgroup" {
// Check that the mount is properly formatted.
if numPostFields < 3 {
return "", fmt.Errorf("Error found less than 3 fields post '-' in %q", text)
}
root = filepath.Dir(fields[4])
break
}
}
if err := scanner.Err(); err != nil {
return "", err
}
if root == "" {
if len(mi) < 1 {
return "", errors.New("no cgroup mount found in mountinfo")
}
// Get the first cgroup mount (e.g. "/sys/fs/cgroup/memory"),
// use its parent directory.
root := filepath.Dir(mi[0].Mountpoint)
if _, err := os.Stat(root); err != nil {
return "", err
}
@@ -218,28 +189,31 @@ func (m *manager) Apply(pid int) (err error) {
m.mu.Lock()
defer m.mu.Unlock()
var c = m.cgroups
d, err := getCgroupData(m.cgroups, pid)
if err != nil {
return err
c := m.cgroups
if c.Resources.Unified != nil {
return cgroups.ErrV1NoUnified
}
m.paths = make(map[string]string)
if c.Paths != nil {
cgMap, err := cgroups.ParseCgroupFile("/proc/self/cgroup")
if err != nil {
return err
}
for name, path := range c.Paths {
_, err := d.path(name)
if err != nil {
if cgroups.IsNotFound(err) {
continue
}
return err
// XXX(kolyshkin@): why this check is needed?
if _, ok := cgMap[name]; ok {
m.paths[name] = path
}
m.paths[name] = path
}
return cgroups.EnterPid(m.paths, pid)
}
d, err := getCgroupData(m.cgroups, pid)
if err != nil {
return err
}
for _, sys := range subsystems {
p, err := d.path(sys.Name())
if err != nil {
@@ -274,11 +248,7 @@ func (m *manager) Destroy() error {
}
m.mu.Lock()
defer m.mu.Unlock()
if err := cgroups.RemovePaths(m.paths); err != nil {
return err
}
m.paths = make(map[string]string)
return nil
return cgroups.RemovePaths(m.paths)
}
func (m *manager) Path(subsys string) string {
@@ -313,6 +283,9 @@ func (m *manager) Set(container *configs.Config) error {
if m.cgroups != nil && m.cgroups.Paths != nil {
return nil
}
if container.Cgroups.Resources.Unified != nil {
return cgroups.ErrV1NoUnified
}
m.mu.Lock()
defer m.mu.Unlock()
@@ -425,16 +398,6 @@ func join(path string, pid int) error {
return cgroups.WriteCgroupProc(path, pid)
}
func removePath(p string, err error) error {
if err != nil {
return err
}
if p != "" {
return os.RemoveAll(p)
}
return nil
}
func (m *manager) GetPaths() map[string]string {
m.mu.Lock()
defer m.mu.Unlock()

View File

@@ -5,7 +5,6 @@ package fs
import (
"fmt"
"strconv"
"strings"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
@@ -25,7 +24,7 @@ func (s *HugetlbGroup) Apply(path string, d *cgroupData) error {
func (s *HugetlbGroup) Set(path string, cgroup *configs.Cgroup) error {
for _, hugetlb := range cgroup.Resources.HugetlbLimit {
if err := fscommon.WriteFile(path, strings.Join([]string{"hugetlb", hugetlb.Pagesize, "limit_in_bytes"}, "."), strconv.FormatUint(hugetlb.Limit, 10)); err != nil {
if err := fscommon.WriteFile(path, "hugetlb."+hugetlb.Pagesize+".limit_in_bytes", strconv.FormatUint(hugetlb.Limit, 10)); err != nil {
return err
}
}
@@ -39,21 +38,21 @@ func (s *HugetlbGroup) GetStats(path string, stats *cgroups.Stats) error {
return nil
}
for _, pageSize := range HugePageSizes {
usage := strings.Join([]string{"hugetlb", pageSize, "usage_in_bytes"}, ".")
usage := "hugetlb." + pageSize + ".usage_in_bytes"
value, err := fscommon.GetCgroupParamUint(path, usage)
if err != nil {
return fmt.Errorf("failed to parse %s - %v", usage, err)
}
hugetlbStats.Usage = value
maxUsage := strings.Join([]string{"hugetlb", pageSize, "max_usage_in_bytes"}, ".")
maxUsage := "hugetlb." + pageSize + ".max_usage_in_bytes"
value, err = fscommon.GetCgroupParamUint(path, maxUsage)
if err != nil {
return fmt.Errorf("failed to parse %s - %v", maxUsage, err)
}
hugetlbStats.MaxUsage = value
failcnt := strings.Join([]string{"hugetlb", pageSize, "failcnt"}, ".")
failcnt := "hugetlb." + pageSize + ".failcnt"
value, err = fscommon.GetCgroupParamUint(path, failcnt)
if err != nil {
return fmt.Errorf("failed to parse %s - %v", failcnt, err)

View File

@@ -5,11 +5,11 @@ package fs
import (
"errors"
"fmt"
"io/ioutil"
"path/filepath"
"strconv"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"golang.org/x/sys/unix"
)
@@ -42,7 +42,7 @@ func setKernelMemory(path string, kernelMemoryLimit int64) error {
// doesn't support it we *must* error out.
return errors.New("kernel memory accounting not supported by this kernel")
}
if err := ioutil.WriteFile(filepath.Join(path, cgroupKernelMemoryLimit), []byte(strconv.FormatInt(kernelMemoryLimit, 10)), 0700); err != nil {
if err := fscommon.WriteFile(path, cgroupKernelMemoryLimit, strconv.FormatInt(kernelMemoryLimit, 10)); err != nil {
// Check if the error number returned by the syscall is "EBUSY"
// The EBUSY signal is returned on attempts to write to the
// memory.kmem.limit_in_bytes file if the cgroup has children or
@@ -50,7 +50,7 @@ func setKernelMemory(path string, kernelMemoryLimit int64) error {
if errors.Is(err, unix.EBUSY) {
return fmt.Errorf("failed to set %s, because either tasks have already joined this cgroup or it has children", cgroupKernelMemoryLimit)
}
return fmt.Errorf("failed to write %v to %v: %v", kernelMemoryLimit, cgroupKernelMemoryLimit, err)
return err
}
return nil
}

View File

@@ -7,7 +7,6 @@ import (
"fmt"
"math"
"os"
"path"
"path/filepath"
"strconv"
"strings"
@@ -18,16 +17,8 @@ import (
)
const (
numaNodeSymbol = "N"
numaStatColumnSeparator = " "
numaStatKeyValueSeparator = "="
numaStatMaxColumns = math.MaxUint8 + 1
numaStatValueIndex = 1
numaStatTypeIndex = 0
numaStatColumnSliceLength = 2
cgroupMemorySwapLimit = "memory.memsw.limit_in_bytes"
cgroupMemoryLimit = "memory.limit_in_bytes"
cgroupMemoryPagesByNuma = "memory.numa_stat"
cgroupMemorySwapLimit = "memory.memsw.limit_in_bytes"
cgroupMemoryLimit = "memory.limit_in_bytes"
)
type MemoryGroup struct {
@@ -160,7 +151,7 @@ func (s *MemoryGroup) Set(path string, cgroup *configs.Cgroup) error {
func (s *MemoryGroup) GetStats(path string, stats *cgroups.Stats) error {
// Set stats from memory.stat.
statsFile, err := os.Open(filepath.Join(path, "memory.stat"))
statsFile, err := fscommon.OpenFile(path, "memory.stat", os.O_RDONLY)
if err != nil {
if os.IsNotExist(err) {
return nil
@@ -200,8 +191,7 @@ func (s *MemoryGroup) GetStats(path string, stats *cgroups.Stats) error {
}
stats.MemoryStats.KernelTCPUsage = kernelTCPUsage
useHierarchy := strings.Join([]string{"memory", "use_hierarchy"}, ".")
value, err := fscommon.GetCgroupParamUint(path, useHierarchy)
value, err := fscommon.GetCgroupParamUint(path, "memory.use_hierarchy")
if err != nil {
return err
}
@@ -233,12 +223,14 @@ func getMemoryData(path, name string) (cgroups.MemoryData, error) {
moduleName := "memory"
if name != "" {
moduleName = strings.Join([]string{"memory", name}, ".")
moduleName = "memory." + name
}
usage := strings.Join([]string{moduleName, "usage_in_bytes"}, ".")
maxUsage := strings.Join([]string{moduleName, "max_usage_in_bytes"}, ".")
failcnt := strings.Join([]string{moduleName, "failcnt"}, ".")
limit := strings.Join([]string{moduleName, "limit_in_bytes"}, ".")
var (
usage = moduleName + ".usage_in_bytes"
maxUsage = moduleName + ".max_usage_in_bytes"
failcnt = moduleName + ".failcnt"
limit = moduleName + ".limit_in_bytes"
)
value, err := fscommon.GetCgroupParamUint(path, usage)
if err != nil {
@@ -277,47 +269,81 @@ func getMemoryData(path, name string) (cgroups.MemoryData, error) {
}
func getPageUsageByNUMA(cgroupPath string) (cgroups.PageUsageByNUMA, error) {
const (
maxColumns = math.MaxUint8 + 1
filename = "memory.numa_stat"
)
stats := cgroups.PageUsageByNUMA{}
file, err := os.Open(path.Join(cgroupPath, cgroupMemoryPagesByNuma))
file, err := fscommon.OpenFile(cgroupPath, filename, os.O_RDONLY)
if os.IsNotExist(err) {
return stats, nil
} else if err != nil {
return stats, err
}
// File format is documented in linux/Documentation/cgroup-v1/memory.txt
// and it looks like this:
//
// total=<total pages> N0=<node 0 pages> N1=<node 1 pages> ...
// file=<total file pages> N0=<node 0 pages> N1=<node 1 pages> ...
// anon=<total anon pages> N0=<node 0 pages> N1=<node 1 pages> ...
// unevictable=<total anon pages> N0=<node 0 pages> N1=<node 1 pages> ...
// hierarchical_<counter>=<counter pages> N0=<node 0 pages> N1=<node 1 pages> ...
scanner := bufio.NewScanner(file)
for scanner.Scan() {
var statsType string
statsByType := cgroups.PageStats{Nodes: map[uint8]uint64{}}
columns := strings.SplitN(scanner.Text(), numaStatColumnSeparator, numaStatMaxColumns)
var field *cgroups.PageStats
for _, column := range columns {
pagesByNode := strings.SplitN(column, numaStatKeyValueSeparator, numaStatColumnSliceLength)
line := scanner.Text()
columns := strings.SplitN(line, " ", maxColumns)
for i, column := range columns {
byNode := strings.SplitN(column, "=", 2)
// Some custom kernels have non-standard fields, like
// numa_locality 0 0 0 0 0 0 0 0 0 0
// numa_exectime 0
if len(byNode) < 2 {
if i == 0 {
// Ignore/skip those.
break
} else {
// The first column was already validated,
// so be strict to the rest.
return stats, fmt.Errorf("malformed line %q in %s",
line, filename)
}
}
key, val := byNode[0], byNode[1]
if i == 0 { // First column: key is name, val is total.
field = getNUMAField(&stats, key)
if field == nil { // unknown field (new kernel?)
break
}
field.Total, err = strconv.ParseUint(val, 0, 64)
if err != nil {
return stats, err
}
field.Nodes = map[uint8]uint64{}
} else { // Subsequent columns: key is N<id>, val is usage.
if len(key) < 2 || key[0] != 'N' {
// This is definitely an error.
return stats, fmt.Errorf("malformed line %q in %s",
line, filename)
}
if strings.HasPrefix(pagesByNode[numaStatTypeIndex], numaNodeSymbol) {
nodeID, err := strconv.ParseUint(pagesByNode[numaStatTypeIndex][1:], 10, 8)
n, err := strconv.ParseUint(key[1:], 10, 8)
if err != nil {
return cgroups.PageUsageByNUMA{}, err
}
statsByType.Nodes[uint8(nodeID)], err = strconv.ParseUint(pagesByNode[numaStatValueIndex], 0, 64)
if err != nil {
return cgroups.PageUsageByNUMA{}, err
}
} else {
statsByType.Total, err = strconv.ParseUint(pagesByNode[numaStatValueIndex], 0, 64)
usage, err := strconv.ParseUint(val, 10, 64)
if err != nil {
return cgroups.PageUsageByNUMA{}, err
}
statsType = pagesByNode[numaStatTypeIndex]
field.Nodes[uint8(n)] = usage
}
err := addNUMAStatsByType(&stats, statsByType, statsType)
if err != nil {
return cgroups.PageUsageByNUMA{}, err
}
}
}
err = scanner.Err()
@@ -328,26 +354,24 @@ func getPageUsageByNUMA(cgroupPath string) (cgroups.PageUsageByNUMA, error) {
return stats, nil
}
func addNUMAStatsByType(stats *cgroups.PageUsageByNUMA, byTypeStats cgroups.PageStats, statsType string) error {
switch statsType {
func getNUMAField(stats *cgroups.PageUsageByNUMA, name string) *cgroups.PageStats {
switch name {
case "total":
stats.Total = byTypeStats
return &stats.Total
case "file":
stats.File = byTypeStats
return &stats.File
case "anon":
stats.Anon = byTypeStats
return &stats.Anon
case "unevictable":
stats.Unevictable = byTypeStats
return &stats.Unevictable
case "hierarchical_total":
stats.Hierarchical.Total = byTypeStats
return &stats.Hierarchical.Total
case "hierarchical_file":
stats.Hierarchical.File = byTypeStats
return &stats.Hierarchical.File
case "hierarchical_anon":
stats.Hierarchical.Anon = byTypeStats
return &stats.Hierarchical.Anon
case "hierarchical_unevictable":
stats.Hierarchical.Unevictable = byTypeStats
default:
return fmt.Errorf("unsupported NUMA page type found: %s", statsType)
return &stats.Hierarchical.Unevictable
}
return nil
}

View File

@@ -19,7 +19,7 @@ func (s *NameGroup) Name() string {
func (s *NameGroup) Apply(path string, d *cgroupData) error {
if s.Join {
// ignore errors if the named cgroup does not exist
join(path, d.pid)
_ = join(path, d.pid)
}
return nil
}

View File

@@ -5,7 +5,6 @@ package fs2
import (
"bufio"
"os"
"path/filepath"
"strconv"
"github.com/opencontainers/runc/libcontainer/cgroups"
@@ -50,7 +49,7 @@ func setCpu(dirPath string, cgroup *configs.Cgroup) error {
return nil
}
func statCpu(dirPath string, stats *cgroups.Stats) error {
f, err := os.Open(filepath.Join(dirPath, "cpu.stat"))
f, err := fscommon.OpenFile(dirPath, "cpu.stat", os.O_RDONLY)
if err != nil {
return err
}

View File

@@ -1,19 +1,17 @@
package fs2
import (
"bytes"
"fmt"
"io/ioutil"
"os"
"path/filepath"
"strings"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"github.com/opencontainers/runc/libcontainer/configs"
)
func supportedControllers(cgroup *configs.Cgroup) ([]byte, error) {
const file = UnifiedMountpoint + "/cgroup.controllers"
return ioutil.ReadFile(file)
func supportedControllers(cgroup *configs.Cgroup) (string, error) {
return fscommon.ReadFile(UnifiedMountpoint, "/cgroup.controllers")
}
// needAnyControllers returns whether we enable some supported controllers or not,
@@ -31,7 +29,7 @@ func needAnyControllers(cgroup *configs.Cgroup) (bool, error) {
return false, err
}
avail := make(map[string]struct{})
for _, ctr := range strings.Fields(string(content)) {
for _, ctr := range strings.Fields(content) {
avail[ctr] = struct{}{}
}
@@ -81,8 +79,12 @@ func CreateCgroupPath(path string, c *configs.Cgroup) (Err error) {
return err
}
ctrs := bytes.Fields(content)
res := append([]byte("+"), bytes.Join(ctrs, []byte(" +"))...)
const (
cgTypeFile = "cgroup.type"
cgStCtlFile = "cgroup.subtree_control"
)
ctrs := strings.Fields(content)
res := "+" + strings.Join(ctrs, " +")
elements := strings.Split(path, "/")
elements = elements[3:]
@@ -103,9 +105,9 @@ func CreateCgroupPath(path string, c *configs.Cgroup) (Err error) {
}
}()
}
cgTypeFile := filepath.Join(current, "cgroup.type")
cgType, _ := ioutil.ReadFile(cgTypeFile)
switch strings.TrimSpace(string(cgType)) {
cgType, _ := fscommon.ReadFile(current, cgTypeFile)
cgType = strings.TrimSpace(cgType)
switch cgType {
// If the cgroup is in an invalid mode (usually this means there's an internal
// process in the cgroup tree, because we created a cgroup under an
// already-populated-by-other-processes cgroup), then we have to error out if
@@ -120,7 +122,7 @@ func CreateCgroupPath(path string, c *configs.Cgroup) (Err error) {
// since that means we're a properly delegated cgroup subtree) but in
// this case there's not much we can do and it's better than giving an
// error.
_ = ioutil.WriteFile(cgTypeFile, []byte("threaded"), 0644)
_ = fscommon.WriteFile(current, cgTypeFile, "threaded")
}
// If the cgroup is in (threaded) or (domain threaded) mode, we can only use thread-aware controllers
// (and you cannot usually take a cgroup out of threaded mode).
@@ -128,18 +130,17 @@ func CreateCgroupPath(path string, c *configs.Cgroup) (Err error) {
fallthrough
case "threaded":
if containsDomainController(c) {
return fmt.Errorf("cannot enter cgroupv2 %q with domain controllers -- it is in %s mode", current, strings.TrimSpace(string(cgType)))
return fmt.Errorf("cannot enter cgroupv2 %q with domain controllers -- it is in %s mode", current, cgType)
}
}
}
// enable all supported controllers
if i < len(elements)-1 {
file := filepath.Join(current, "cgroup.subtree_control")
if err := ioutil.WriteFile(file, res, 0644); err != nil {
if err := fscommon.WriteFile(current, cgStCtlFile, res); err != nil {
// try write one by one
allCtrs := bytes.Split(res, []byte(" "))
allCtrs := strings.Split(res, " ")
for _, ctr := range allCtrs {
_ = ioutil.WriteFile(file, ctr, 0644)
_ = fscommon.WriteFile(current, cgStCtlFile, ctr)
}
}
// Some controllers might not be enabled when rootless or containerized,

View File

@@ -6,11 +6,12 @@ import (
"github.com/opencontainers/runc/libcontainer/cgroups/ebpf"
"github.com/opencontainers/runc/libcontainer/cgroups/ebpf/devicefilter"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/devices"
"github.com/pkg/errors"
"golang.org/x/sys/unix"
)
func isRWM(perms configs.DevicePermissions) bool {
func isRWM(perms devices.Permissions) bool {
var r, w, m bool
for _, perm := range perms {
switch perm {
@@ -61,7 +62,7 @@ func setDevices(dirPath string, cgroup *configs.Cgroup) error {
//
// The real issue is that BPF_F_ALLOW_MULTI makes it hard to have a
// race-free blacklist because it acts as a whitelist by default, and
// having a deny-everything program cannot be overriden by other
// having a deny-everything program cannot be overridden by other
// programs. You could temporarily insert a deny-everything program
// but that would result in spurrious failures during updates.
if _, err := ebpf.LoadAttachCgroupDeviceFilter(insts, license, dirFD); err != nil {

View File

@@ -19,7 +19,7 @@ func setFreezer(dirPath string, state configs.FreezerState) error {
// freeze the container (since without the freezer cgroup, that's a
// no-op).
if state == configs.Undefined || state == configs.Thawed {
err = nil
return nil
}
return errors.Wrap(err, "freezer not supported")
}

View File

@@ -3,15 +3,14 @@
package fs2
import (
"io/ioutil"
"fmt"
"os"
"path/filepath"
"strings"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/pkg/errors"
"golang.org/x/sys/unix"
)
type manager struct {
@@ -52,15 +51,14 @@ func (m *manager) getControllers() error {
return nil
}
file := filepath.Join(m.dirPath, "cgroup.controllers")
data, err := ioutil.ReadFile(file)
data, err := fscommon.ReadFile(m.dirPath, "cgroup.controllers")
if err != nil {
if m.rootless && m.config.Path == "" {
return nil
}
return err
}
fields := strings.Fields(string(data))
fields := strings.Fields(data)
m.controllers = make(map[string]struct{}, len(fields))
for _, c := range fields {
m.controllers[c] = struct{}{}
@@ -157,45 +155,8 @@ func (m *manager) Freeze(state configs.FreezerState) error {
return nil
}
func rmdir(path string) error {
err := unix.Rmdir(path)
if err == nil || err == unix.ENOENT {
return nil
}
return &os.PathError{Op: "rmdir", Path: path, Err: err}
}
// removeCgroupPath aims to remove cgroup path recursively
// Because there may be subcgroups in it.
func removeCgroupPath(path string) error {
// try the fast path first
if err := rmdir(path); err == nil {
return nil
}
infos, err := ioutil.ReadDir(path)
if err != nil {
if os.IsNotExist(err) {
err = nil
}
return err
}
for _, info := range infos {
if info.IsDir() {
// We should remove subcgroups dir first
if err = removeCgroupPath(filepath.Join(path, info.Name())); err != nil {
break
}
}
}
if err == nil {
err = rmdir(path)
}
return err
}
func (m *manager) Destroy() error {
return removeCgroupPath(m.dirPath)
return cgroups.RemovePath(m.dirPath)
}
func (m *manager) Path(_ string) string {
@@ -245,10 +206,40 @@ func (m *manager) Set(container *configs.Config) error {
if err := setFreezer(m.dirPath, container.Cgroups.Freezer); err != nil {
return err
}
if err := m.setUnified(container.Cgroups.Unified); err != nil {
return err
}
m.config = container.Cgroups
return nil
}
func (m *manager) setUnified(res map[string]string) error {
for k, v := range res {
if strings.Contains(k, "/") {
return fmt.Errorf("unified resource %q must be a file name (no slashes)", k)
}
if err := fscommon.WriteFile(m.dirPath, k, v); err != nil {
errC := errors.Cause(err)
// Check for both EPERM and ENOENT since O_CREAT is used by WriteFile.
if errors.Is(errC, os.ErrPermission) || errors.Is(errC, os.ErrNotExist) {
// Check if a controller is available,
// to give more specific error if not.
sk := strings.SplitN(k, ".", 2)
if len(sk) != 2 {
return fmt.Errorf("unified resource %q must be in the form CONTROLLER.PARAMETER", k)
}
c := sk[0]
if _, ok := m.controllers[c]; !ok && c != "cgroup" {
return fmt.Errorf("unified resource %q can't be set: controller %q not available", k, c)
}
}
return errors.Wrapf(err, "can't set unified resource %q", k)
}
}
return nil
}
func (m *manager) GetPaths() map[string]string {
paths := make(map[string]string, 1)
paths[""] = m.dirPath

View File

@@ -3,10 +3,7 @@
package fs2
import (
"io/ioutil"
"path/filepath"
"strconv"
"strings"
"github.com/pkg/errors"
@@ -24,7 +21,7 @@ func setHugeTlb(dirPath string, cgroup *configs.Cgroup) error {
return nil
}
for _, hugetlb := range cgroup.Resources.HugetlbLimit {
if err := fscommon.WriteFile(dirPath, strings.Join([]string{"hugetlb", hugetlb.Pagesize, "max"}, "."), strconv.FormatUint(hugetlb.Limit, 10)); err != nil {
if err := fscommon.WriteFile(dirPath, "hugetlb."+hugetlb.Pagesize+".max", strconv.FormatUint(hugetlb.Limit, 10)); err != nil {
return err
}
}
@@ -40,22 +37,20 @@ func statHugeTlb(dirPath string, stats *cgroups.Stats) error {
hugetlbStats := cgroups.HugetlbStats{}
for _, pagesize := range hugePageSizes {
usage := strings.Join([]string{"hugetlb", pagesize, "current"}, ".")
value, err := fscommon.GetCgroupParamUint(dirPath, usage)
value, err := fscommon.GetCgroupParamUint(dirPath, "hugetlb."+pagesize+".current")
if err != nil {
return errors.Wrapf(err, "failed to parse hugetlb.%s.current file", pagesize)
return err
}
hugetlbStats.Usage = value
fileName := strings.Join([]string{"hugetlb", pagesize, "events"}, ".")
filePath := filepath.Join(dirPath, fileName)
contents, err := ioutil.ReadFile(filePath)
fileName := "hugetlb." + pagesize + ".events"
contents, err := fscommon.ReadFile(dirPath, fileName)
if err != nil {
return errors.Wrapf(err, "failed to parse hugetlb.%s.events file", pagesize)
return errors.Wrap(err, "failed to read stats")
}
_, value, err = fscommon.GetCgroupParamKeyValue(string(contents))
_, value, err = fscommon.GetCgroupParamKeyValue(contents)
if err != nil {
return errors.Wrapf(err, "failed to parse hugetlb.%s.events file", pagesize)
return errors.Wrap(err, "failed to parse "+fileName)
}
hugetlbStats.Failcnt = value

View File

@@ -5,7 +5,6 @@ package fs2
import (
"bufio"
"os"
"path/filepath"
"strconv"
"strings"
@@ -60,8 +59,7 @@ func setIo(dirPath string, cgroup *configs.Cgroup) error {
func readCgroup2MapFile(dirPath string, name string) (map[string][]string, error) {
ret := map[string][]string{}
p := filepath.Join(dirPath, name)
f, err := os.Open(p)
f, err := fscommon.OpenFile(dirPath, name, os.O_RDONLY)
if err != nil {
return nil, err
}

View File

@@ -5,9 +5,7 @@ package fs2
import (
"bufio"
"os"
"path/filepath"
"strconv"
"strings"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
@@ -76,7 +74,7 @@ func setMemory(dirPath string, cgroup *configs.Cgroup) error {
func statMemory(dirPath string, stats *cgroups.Stats) error {
// Set stats from memory.stat.
statsFile, err := os.Open(filepath.Join(dirPath, "memory.stat"))
statsFile, err := fscommon.OpenFile(dirPath, "memory.stat", os.O_RDONLY)
if err != nil {
return err
}
@@ -112,10 +110,10 @@ func getMemoryDataV2(path, name string) (cgroups.MemoryData, error) {
moduleName := "memory"
if name != "" {
moduleName = strings.Join([]string{"memory", name}, ".")
moduleName = "memory." + name
}
usage := strings.Join([]string{moduleName, "current"}, ".")
limit := strings.Join([]string{moduleName, "max"}, ".")
usage := moduleName + ".current"
limit := moduleName + ".max"
value, err := fscommon.GetCgroupParamUint(path, usage)
if err != nil {

View File

@@ -3,7 +3,6 @@
package fs2
import (
"io/ioutil"
"path/filepath"
"strings"
@@ -34,15 +33,15 @@ func setPids(dirPath string, cgroup *configs.Cgroup) error {
func statPidsWithoutController(dirPath string, stats *cgroups.Stats) error {
// if the controller is not enabled, let's read PIDS from cgroups.procs
// (or threads if cgroup.threads is enabled)
contents, err := ioutil.ReadFile(filepath.Join(dirPath, "cgroup.procs"))
contents, err := fscommon.ReadFile(dirPath, "cgroup.procs")
if errors.Is(err, unix.ENOTSUP) {
contents, err = ioutil.ReadFile(filepath.Join(dirPath, "cgroup.threads"))
contents, err = fscommon.ReadFile(dirPath, "cgroup.threads")
}
if err != nil {
return err
}
pids := make(map[string]string)
for _, i := range strings.Split(string(contents), "\n") {
for _, i := range strings.Split(contents, "\n") {
if i != "" {
pids[i] = i
}

View File

@@ -3,46 +3,47 @@
package fscommon
import (
"io/ioutil"
"bytes"
"os"
securejoin "github.com/cyphar/filepath-securejoin"
"github.com/pkg/errors"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
)
// WriteFile writes data to a cgroup file in dir.
// It is supposed to be used for cgroup files only.
func WriteFile(dir, file, data string) error {
if dir == "" {
return errors.Errorf("no directory specified for %s", file)
}
path, err := securejoin.SecureJoin(dir, file)
fd, err := OpenFile(dir, file, unix.O_WRONLY)
if err != nil {
return err
}
if err := retryingWriteFile(path, []byte(data), 0700); err != nil {
return errors.Wrapf(err, "failed to write %q to %q", data, path)
defer fd.Close()
if err := retryingWriteFile(fd, data); err != nil {
return errors.Wrapf(err, "failed to write %q", data)
}
return nil
}
// ReadFile reads data from a cgroup file in dir.
// It is supposed to be used for cgroup files only.
func ReadFile(dir, file string) (string, error) {
if dir == "" {
return "", errors.Errorf("no directory specified for %s", file)
}
path, err := securejoin.SecureJoin(dir, file)
fd, err := OpenFile(dir, file, unix.O_RDONLY)
if err != nil {
return "", err
}
data, err := ioutil.ReadFile(path)
return string(data), err
defer fd.Close()
var buf bytes.Buffer
_, err = buf.ReadFrom(fd)
return buf.String(), err
}
func retryingWriteFile(filename string, data []byte, perm os.FileMode) error {
func retryingWriteFile(fd *os.File, data string) error {
for {
err := ioutil.WriteFile(filename, data, perm)
_, err := fd.Write([]byte(data))
if errors.Is(err, unix.EINTR) {
logrus.Infof("interrupted while writing %s to %s", string(data), filename)
logrus.Infof("interrupted while writing %s to %s", data, fd.Name())
continue
}
return err

View File

@@ -0,0 +1,103 @@
package fscommon
import (
"os"
"strings"
"sync"
securejoin "github.com/cyphar/filepath-securejoin"
"github.com/pkg/errors"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
)
const (
cgroupfsDir = "/sys/fs/cgroup"
cgroupfsPrefix = cgroupfsDir + "/"
)
var (
// Set to true by fs unit tests
TestMode bool
cgroupFd int = -1
prepOnce sync.Once
prepErr error
resolveFlags uint64
)
func prepareOpenat2() error {
prepOnce.Do(func() {
fd, err := unix.Openat2(-1, cgroupfsDir, &unix.OpenHow{
Flags: unix.O_DIRECTORY | unix.O_PATH})
if err != nil {
prepErr = &os.PathError{Op: "openat2", Path: cgroupfsDir, Err: err}
if err != unix.ENOSYS {
logrus.Warnf("falling back to securejoin: %s", prepErr)
} else {
logrus.Debug("openat2 not available, falling back to securejoin")
}
return
}
var st unix.Statfs_t
if err = unix.Fstatfs(fd, &st); err != nil {
prepErr = &os.PathError{Op: "statfs", Path: cgroupfsDir, Err: err}
logrus.Warnf("falling back to securejoin: %s", prepErr)
return
}
cgroupFd = fd
resolveFlags = unix.RESOLVE_BENEATH | unix.RESOLVE_NO_MAGICLINKS
if st.Type == unix.CGROUP2_SUPER_MAGIC {
// cgroupv2 has a single mountpoint and no "cpu,cpuacct" symlinks
resolveFlags |= unix.RESOLVE_NO_XDEV | unix.RESOLVE_NO_SYMLINKS
}
})
return prepErr
}
// OpenFile opens a cgroup file in a given dir with given flags.
// It is supposed to be used for cgroup files only.
func OpenFile(dir, file string, flags int) (*os.File, error) {
if dir == "" {
return nil, errors.Errorf("no directory specified for %s", file)
}
mode := os.FileMode(0)
if TestMode && flags&os.O_WRONLY != 0 {
// "emulate" cgroup fs for unit tests
flags |= os.O_TRUNC | os.O_CREATE
mode = 0o600
}
reldir := strings.TrimPrefix(dir, cgroupfsPrefix)
if len(reldir) == len(dir) { // non-standard path, old system?
return openWithSecureJoin(dir, file, flags, mode)
}
if prepareOpenat2() != nil {
return openWithSecureJoin(dir, file, flags, mode)
}
relname := reldir + "/" + file
fd, err := unix.Openat2(cgroupFd, relname,
&unix.OpenHow{
Resolve: resolveFlags,
Flags: uint64(flags) | unix.O_CLOEXEC,
Mode: uint64(mode),
})
if err != nil {
return nil, &os.PathError{Op: "openat2", Path: dir + "/" + file, Err: err}
}
return os.NewFile(uintptr(fd), cgroupfsPrefix+relname), nil
}
func openWithSecureJoin(dir, file string, flags int, mode os.FileMode) (*os.File, error) {
path, err := securejoin.SecureJoin(dir, file)
if err != nil {
return nil, err
}
return os.OpenFile(path, flags, mode)
}

View File

@@ -5,9 +5,7 @@ package fscommon
import (
"errors"
"fmt"
"io/ioutil"
"math"
"path/filepath"
"strconv"
"strings"
)
@@ -16,8 +14,9 @@ var (
ErrNotValidFormat = errors.New("line is not a valid key value format")
)
// Saturates negative values at zero and returns a uint64.
// Due to kernel bugs, some of the memory cgroup stats can be negative.
// ParseUint converts a string to an uint64 integer.
// Negative values are returned at zero as, due to kernel bugs,
// some of the memory cgroup stats can be negative.
func ParseUint(s string, base, bitSize int) (uint64, error) {
value, err := strconv.ParseUint(s, base, bitSize)
if err != nil {
@@ -36,15 +35,16 @@ func ParseUint(s string, base, bitSize int) (uint64, error) {
return value, nil
}
// Parses a cgroup param and returns as name, value
// i.e. "io_service_bytes 1234" will return as io_service_bytes, 1234
// GetCgroupParamKeyValue parses a space-separated "name value" kind of cgroup
// parameter and returns its components. For example, "io_service_bytes 1234"
// will return as "io_service_bytes", 1234.
func GetCgroupParamKeyValue(t string) (string, uint64, error) {
parts := strings.Fields(t)
switch len(parts) {
case 2:
value, err := ParseUint(parts[1], 10, 64)
if err != nil {
return "", 0, fmt.Errorf("unable to convert param value (%q) to uint64: %v", parts[1], err)
return "", 0, fmt.Errorf("unable to convert to uint64: %v", err)
}
return parts[0], value, nil
@@ -53,31 +53,50 @@ func GetCgroupParamKeyValue(t string) (string, uint64, error) {
}
}
// Gets a single uint64 value from the specified cgroup file.
func GetCgroupParamUint(cgroupPath, cgroupFile string) (uint64, error) {
fileName := filepath.Join(cgroupPath, cgroupFile)
contents, err := ioutil.ReadFile(fileName)
// GetCgroupParamUint reads a single uint64 value from the specified cgroup file.
// If the value read is "max", the math.MaxUint64 is returned.
func GetCgroupParamUint(path, file string) (uint64, error) {
contents, err := GetCgroupParamString(path, file)
if err != nil {
return 0, err
}
trimmed := strings.TrimSpace(string(contents))
if trimmed == "max" {
contents = strings.TrimSpace(contents)
if contents == "max" {
return math.MaxUint64, nil
}
res, err := ParseUint(trimmed, 10, 64)
res, err := ParseUint(contents, 10, 64)
if err != nil {
return res, fmt.Errorf("unable to parse %q as a uint from Cgroup file %q", string(contents), fileName)
return res, fmt.Errorf("unable to parse file %q", path+"/"+file)
}
return res, nil
}
// Gets a string value from the specified cgroup file
func GetCgroupParamString(cgroupPath, cgroupFile string) (string, error) {
contents, err := ioutil.ReadFile(filepath.Join(cgroupPath, cgroupFile))
// GetCgroupParamInt reads a single int64 value from specified cgroup file.
// If the value read is "max", the math.MaxInt64 is returned.
func GetCgroupParamInt(path, file string) (int64, error) {
contents, err := ReadFile(path, file)
if err != nil {
return 0, err
}
contents = strings.TrimSpace(contents)
if contents == "max" {
return math.MaxInt64, nil
}
res, err := strconv.ParseInt(contents, 10, 64)
if err != nil {
return res, fmt.Errorf("unable to parse %q as a int from Cgroup file %q", contents, path+"/"+file)
}
return res, nil
}
// GetCgroupParamString reads a string from the specified cgroup file.
func GetCgroupParamString(path, file string) (string, error) {
contents, err := ReadFile(path, file)
if err != nil {
return "", err
}
return strings.TrimSpace(string(contents)), nil
return strings.TrimSpace(contents), nil
}

View File

@@ -39,6 +39,33 @@ type CpuStats struct {
ThrottlingData ThrottlingData `json:"throttling_data,omitempty"`
}
type CPUSetStats struct {
// List of the physical numbers of the CPUs on which processes
// in that cpuset are allowed to execute
CPUs []uint16 `json:"cpus,omitempty"`
// cpu_exclusive flag
CPUExclusive uint64 `json:"cpu_exclusive"`
// List of memory nodes on which processes in that cpuset
// are allowed to allocate memory
Mems []uint16 `json:"mems,omitempty"`
// mem_hardwall flag
MemHardwall uint64 `json:"mem_hardwall"`
// mem_exclusive flag
MemExclusive uint64 `json:"mem_exclusive"`
// memory_migrate flag
MemoryMigrate uint64 `json:"memory_migrate"`
// memory_spread page flag
MemorySpreadPage uint64 `json:"memory_spread_page"`
// memory_spread slab flag
MemorySpreadSlab uint64 `json:"memory_spread_slab"`
// memory_pressure
MemoryPressure uint64 `json:"memory_pressure"`
// sched_load balance flag
SchedLoadBalance uint64 `json:"sched_load_balance"`
// sched_relax_domain_level
SchedRelaxDomainLevel int64 `json:"sched_relax_domain_level"`
}
type MemoryData struct {
Usage uint64 `json:"usage,omitempty"`
MaxUsage uint64 `json:"max_usage,omitempty"`
@@ -121,6 +148,7 @@ type HugetlbStats struct {
type Stats struct {
CpuStats CpuStats `json:"cpu_stats,omitempty"`
CPUSetStats CPUSetStats `json:"cpuset_stats,omitempty"`
MemoryStats MemoryStats `json:"memory_stats,omitempty"`
PidsStats PidsStats `json:"pids_stats,omitempty"`
BlkioStats BlkioStats `json:"blkio_stats,omitempty"`

View File

@@ -13,12 +13,20 @@ import (
systemdDbus "github.com/coreos/go-systemd/v22/dbus"
dbus "github.com/godbus/dbus/v5"
"github.com/opencontainers/runc/libcontainer/cgroups/devices"
cgroupdevices "github.com/opencontainers/runc/libcontainer/cgroups/devices"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/devices"
"github.com/pkg/errors"
"github.com/sirupsen/logrus"
)
const (
// Default kernel value for cpu quota period is 100000 us (100 ms), same for v1 and v2.
// v1: https://www.kernel.org/doc/html/latest/scheduler/sched-bwc.html and
// v2: https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html
defCPUQuotaPeriod = uint64(100000)
)
var (
connOnce sync.Once
connDbus *systemdDbus.Conn
@@ -26,7 +34,6 @@ var (
versionOnce sync.Once
version int
versionErr error
isRunningSystemdOnce sync.Once
isRunningSystemd bool
@@ -81,11 +88,11 @@ func ExpandSlice(slice string) (string, error) {
return path, nil
}
func groupPrefix(ruleType configs.DeviceType) (string, error) {
func groupPrefix(ruleType devices.Type) (string, error) {
switch ruleType {
case configs.BlockDevice:
case devices.BlockDevice:
return "block-", nil
case configs.CharDevice:
case devices.CharDevice:
return "char-", nil
default:
return "", errors.Errorf("device type %v has no group prefix", ruleType)
@@ -93,10 +100,10 @@ func groupPrefix(ruleType configs.DeviceType) (string, error) {
}
// findDeviceGroup tries to find the device group name (as listed in
// /proc/devices) with the type prefixed as requried for DeviceAllow, for a
// /proc/devices) with the type prefixed as required for DeviceAllow, for a
// given (type, major) combination. If more than one device group exists, an
// arbitrary one is chosen.
func findDeviceGroup(ruleType configs.DeviceType, ruleMajor int64) (string, error) {
func findDeviceGroup(ruleType devices.Type, ruleMajor int64) (string, error) {
fh, err := os.Open("/proc/devices")
if err != nil {
return "", err
@@ -109,7 +116,7 @@ func findDeviceGroup(ruleType configs.DeviceType, ruleMajor int64) (string, erro
}
scanner := bufio.NewScanner(fh)
var currentType configs.DeviceType
var currentType devices.Type
for scanner.Scan() {
// We need to strip spaces because the first number is column-aligned.
line := strings.TrimSpace(scanner.Text())
@@ -117,10 +124,10 @@ func findDeviceGroup(ruleType configs.DeviceType, ruleMajor int64) (string, erro
// Handle the "header" lines.
switch line {
case "Block devices:":
currentType = configs.BlockDevice
currentType = devices.BlockDevice
continue
case "Character devices:":
currentType = configs.CharDevice
currentType = devices.CharDevice
continue
case "":
continue
@@ -156,7 +163,7 @@ func findDeviceGroup(ruleType configs.DeviceType, ruleMajor int64) (string, erro
// generateDeviceProperties takes the configured device rules and generates a
// corresponding set of systemd properties to configure the devices correctly.
func generateDeviceProperties(rules []*configs.DeviceRule) ([]systemdDbus.Property, error) {
func generateDeviceProperties(rules []*devices.Rule) ([]systemdDbus.Property, error) {
// DeviceAllow is the type "a(ss)" which means we need a temporary struct
// to represent it in Go.
type deviceAllowEntry struct {
@@ -172,7 +179,7 @@ func generateDeviceProperties(rules []*configs.DeviceRule) ([]systemdDbus.Proper
}
// Figure out the set of rules.
configEmu := &devices.Emulator{}
configEmu := &cgroupdevices.Emulator{}
for _, rule := range rules {
if err := configEmu.Apply(*rule); err != nil {
return nil, errors.Wrap(err, "apply rule for systemd")
@@ -199,7 +206,7 @@ func generateDeviceProperties(rules []*configs.DeviceRule) ([]systemdDbus.Proper
// Now generate the set of rules we actually need to apply. Unlike the
// normal devices cgroup, in "strict" mode systemd defaults to a deny-all
// whitelist which is the default for devices.Emulator.
baseEmu := &devices.Emulator{}
baseEmu := &cgroupdevices.Emulator{}
finalRules, err := baseEmu.Transition(configEmu)
if err != nil {
return nil, errors.Wrap(err, "get simplified rules for systemd")
@@ -211,7 +218,7 @@ func generateDeviceProperties(rules []*configs.DeviceRule) ([]systemdDbus.Proper
return nil, errors.Errorf("[internal error] cannot add deny rule to systemd DeviceAllow list: %v", *rule)
}
switch rule.Type {
case configs.BlockDevice, configs.CharDevice:
case devices.BlockDevice, devices.CharDevice:
default:
// Should never happen.
return nil, errors.Errorf("invalid device type for DeviceAllow: %v", rule.Type)
@@ -243,9 +250,9 @@ func generateDeviceProperties(rules []*configs.DeviceRule) ([]systemdDbus.Proper
// so we'll give a warning in that case (note that the fallback code
// will insert any rules systemd couldn't handle). What amazing fun.
if rule.Major == configs.Wildcard {
if rule.Major == devices.Wildcard {
// "_ *:n _" rules aren't supported by systemd.
if rule.Minor != configs.Wildcard {
if rule.Minor != devices.Wildcard {
logrus.Warnf("systemd doesn't support '*:n' device rules -- temporarily ignoring rule: %v", *rule)
continue
}
@@ -256,7 +263,7 @@ func generateDeviceProperties(rules []*configs.DeviceRule) ([]systemdDbus.Proper
return nil, err
}
entry.Path = prefix + "*"
} else if rule.Minor == configs.Wildcard {
} else if rule.Minor == devices.Wildcard {
// "_ n:* _" rules require a device group from /proc/devices.
group, err := findDeviceGroup(rule.Type, rule.Major)
if err != nil {
@@ -271,9 +278,9 @@ func generateDeviceProperties(rules []*configs.DeviceRule) ([]systemdDbus.Proper
} else {
// "_ n:m _" rules are just a path in /dev/{block,char}/.
switch rule.Type {
case configs.BlockDevice:
case devices.BlockDevice:
entry.Path = fmt.Sprintf("/dev/block/%d:%d", rule.Major, rule.Minor)
case configs.CharDevice:
case devices.CharDevice:
entry.Path = fmt.Sprintf("/dev/char/%d:%d", rule.Major, rule.Minor)
}
}
@@ -307,7 +314,7 @@ func newProp(name string, units interface{}) systemdDbus.Property {
func getUnitName(c *configs.Cgroup) string {
// by default, we create a scope unless the user explicitly asks for a slice.
if !strings.HasSuffix(c.Name, ".slice") {
return fmt.Sprintf("%s-%s.scope", c.ScopePrefix, c.Name)
return c.ScopePrefix + "-" + c.Name + ".scope"
}
return c.Name
}
@@ -325,6 +332,9 @@ func isUnitExists(err error) bool {
func startUnit(dbusConnection *systemdDbus.Conn, unitName string, properties []systemdDbus.Property) error {
statusChan := make(chan string, 1)
if _, err := dbusConnection.StartTransientUnit(unitName, "replace", properties, statusChan); err == nil {
timeout := time.NewTimer(30 * time.Second)
defer timeout.Stop()
select {
case s := <-statusChan:
close(statusChan)
@@ -333,8 +343,9 @@ func startUnit(dbusConnection *systemdDbus.Conn, unitName string, properties []s
dbusConnection.ResetFailedUnit(unitName)
return errors.Errorf("error creating systemd unit `%s`: got `%s`", unitName, s)
}
case <-time.After(time.Second):
logrus.Warnf("Timed out while waiting for StartTransientUnit(%s) completion signal from dbus. Continuing...", unitName)
case <-timeout.C:
dbusConnection.ResetFailedUnit(unitName)
return errors.New("Timeout waiting for systemd to create " + unitName)
}
} else if !isUnitExists(err) {
return err
@@ -360,20 +371,20 @@ func stopUnit(dbusConnection *systemdDbus.Conn, unitName string) error {
return nil
}
func systemdVersion(conn *systemdDbus.Conn) (int, error) {
func systemdVersion(conn *systemdDbus.Conn) int {
versionOnce.Do(func() {
version = -1
verStr, err := conn.GetManagerProperty("Version")
if err != nil {
versionErr = err
return
if err == nil {
version, err = systemdVersionAtoi(verStr)
}
version, versionErr = systemdVersionAtoi(verStr)
return
if err != nil {
logrus.WithError(err).Error("unable to get systemd version")
}
})
return version, versionErr
return version
}
func systemdVersionAtoi(verStr string) (int, error) {
@@ -394,12 +405,13 @@ func systemdVersionAtoi(verStr string) (int, error) {
func addCpuQuota(conn *systemdDbus.Conn, properties *[]systemdDbus.Property, quota int64, period uint64) {
if period != 0 {
// systemd only supports CPUQuotaPeriodUSec since v242
sdVer, err := systemdVersion(conn)
if err != nil {
logrus.Warnf("systemdVersion: %s", err)
} else if sdVer >= 242 {
sdVer := systemdVersion(conn)
if sdVer >= 242 {
*properties = append(*properties,
newProp("CPUQuotaPeriodUSec", period))
} else {
logrus.Debugf("systemd v%d is too old to support CPUQuotaPeriodSec "+
" (setting will still be applied to cgroupfs)", sdVer)
}
}
if quota != 0 || period != 0 {
@@ -407,10 +419,8 @@ func addCpuQuota(conn *systemdDbus.Conn, properties *[]systemdDbus.Property, quo
cpuQuotaPerSecUSec := uint64(math.MaxUint64)
if quota > 0 {
if period == 0 {
// assume the default kernel value of 100000 us (100 ms), same for v1 and v2.
// v1: https://www.kernel.org/doc/html/latest/scheduler/sched-bwc.html and
// v2: https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html
period = 100000
// assume the default
period = defCPUQuotaPeriod
}
// systemd converts CPUQuotaPerSecUSec (microseconds per CPU second) to CPUQuota
// (integer percentage of CPU) internally. This means that if a fractional percent of
@@ -425,3 +435,37 @@ func addCpuQuota(conn *systemdDbus.Conn, properties *[]systemdDbus.Property, quo
newProp("CPUQuotaPerSecUSec", cpuQuotaPerSecUSec))
}
}
func addCpuset(conn *systemdDbus.Conn, props *[]systemdDbus.Property, cpus, mems string) error {
if cpus == "" && mems == "" {
return nil
}
// systemd only supports AllowedCPUs/AllowedMemoryNodes since v244
sdVer := systemdVersion(conn)
if sdVer < 244 {
logrus.Debugf("systemd v%d is too old to support AllowedCPUs/AllowedMemoryNodes"+
" (settings will still be applied to cgroupfs)", sdVer)
return nil
}
if cpus != "" {
bits, err := rangeToBits(cpus)
if err != nil {
return fmt.Errorf("resources.CPU.Cpus=%q conversion error: %w",
cpus, err)
}
*props = append(*props,
newProp("AllowedCPUs", bits))
}
if mems != "" {
bits, err := rangeToBits(mems)
if err != nil {
return fmt.Errorf("resources.CPU.Mems=%q conversion error: %w",
mems, err)
}
*props = append(*props,
newProp("AllowedMemoryNodes", bits))
}
return nil
}

View File

@@ -0,0 +1,67 @@
package systemd
import (
"encoding/binary"
"strconv"
"strings"
"github.com/pkg/errors"
"github.com/willf/bitset"
)
// rangeToBits converts a text representation of a CPU mask (as written to
// or read from cgroups' cpuset.* files, e.g. "1,3-5") to a slice of bytes
// with the corresponding bits set (as consumed by systemd over dbus as
// AllowedCPUs/AllowedMemoryNodes unit property value).
func rangeToBits(str string) ([]byte, error) {
bits := &bitset.BitSet{}
for _, r := range strings.Split(str, ",") {
// allow extra spaces around
r = strings.TrimSpace(r)
// allow empty elements (extra commas)
if r == "" {
continue
}
ranges := strings.SplitN(r, "-", 2)
if len(ranges) > 1 {
start, err := strconv.ParseUint(ranges[0], 10, 32)
if err != nil {
return nil, err
}
end, err := strconv.ParseUint(ranges[1], 10, 32)
if err != nil {
return nil, err
}
if start > end {
return nil, errors.New("invalid range: " + r)
}
for i := uint(start); i <= uint(end); i++ {
bits.Set(i)
}
} else {
val, err := strconv.ParseUint(ranges[0], 10, 32)
if err != nil {
return nil, err
}
bits.Set(uint(val))
}
}
val := bits.Bytes()
if len(val) == 0 {
// do not allow empty values
return nil, errors.New("empty value")
}
ret := make([]byte, len(val)*8)
for i := range val {
// bitset uses BigEndian internally
binary.BigEndian.PutUint64(ret[i*8:], val[len(val)-1-i])
}
// remove upper all-zero bytes
for ret[0] == 0 {
ret = ret[1:]
}
return ret, nil
}

View File

@@ -57,7 +57,7 @@ func DetectUID() (int, error) {
}
b, err := exec.Command("busctl", "--user", "--no-pager", "status").CombinedOutput()
if err != nil {
return -1, errors.Wrap(err, "could not execute `busctl --user --no-pager status`")
return -1, errors.Wrapf(err, "could not execute `busctl --user --no-pager status`: %q", string(b))
}
scanner := bufio.NewScanner(bytes.NewReader(b))
for scanner.Scan() {
@@ -102,5 +102,5 @@ func DetectUserDbusSessionBusAddress() (string, error) {
return strings.TrimPrefix(s, "DBUS_SESSION_BUS_ADDRESS="), nil
}
}
return "", errors.New("could not detect DBUS_SESSION_BUS_ADDRESS from `systemctl --user --no-pager show-environment`")
return "", errors.New("could not detect DBUS_SESSION_BUS_ADDRESS from `systemctl --user --no-pager show-environment`. Make sure you have installed the dbus-user-session or dbus-daemon package and then run: `systemctl --user start dbus`")
}

View File

@@ -4,7 +4,6 @@ package systemd
import (
"errors"
"io/ioutil"
"os"
"path/filepath"
"strings"
@@ -13,6 +12,7 @@ import (
systemdDbus "github.com/coreos/go-systemd/v22/dbus"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fs"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/sirupsen/logrus"
)
@@ -90,6 +90,11 @@ func genV1ResourcesProperties(c *configs.Cgroup, conn *systemdDbus.Conn) ([]syst
newProp("TasksMax", uint64(r.PidsLimit)))
}
err = addCpuset(conn, &properties, r.CpusetCpus, r.CpusetMems)
if err != nil {
return nil, err
}
return properties, nil
}
@@ -101,20 +106,23 @@ func (m *legacyManager) Apply(pid int) error {
properties []systemdDbus.Property
)
if c.Resources.Unified != nil {
return cgroups.ErrV1NoUnified
}
m.mu.Lock()
defer m.mu.Unlock()
if c.Paths != nil {
paths := make(map[string]string)
cgMap, err := cgroups.ParseCgroupFile("/proc/self/cgroup")
if err != nil {
return err
}
// XXX(kolyshkin@): why this check is needed?
for name, path := range c.Paths {
_, err := getSubsystemPath(m.cgroups, name)
if err != nil {
// Don't fail if a cgroup hierarchy was not found, just skip this subsystem
if cgroups.IsNotFound(err) {
continue
}
return err
if _, ok := cgMap[name]; ok {
paths[name] = path
}
paths[name] = path
}
m.paths = paths
return cgroups.EnterPid(m.paths, pid)
@@ -179,14 +187,16 @@ func (m *legacyManager) Apply(pid int) error {
return err
}
if err := joinCgroups(c, pid); err != nil {
return err
}
paths := make(map[string]string)
for _, s := range legacySubsystems {
subsystemPath, err := getSubsystemPath(m.cgroups, s.Name())
if err != nil {
// Even if it's `not found` error, we'll return err
// because devices cgroup is hard requirement for
// container security.
if s.Name() == "devices" {
return err
}
// Don't fail if a cgroup hierarchy was not found, just skip this subsystem
if cgroups.IsNotFound(err) {
continue
@@ -196,6 +206,11 @@ func (m *legacyManager) Apply(pid int) error {
paths[s.Name()] = subsystemPath
}
m.paths = paths
if err := m.joinCgroups(pid); err != nil {
return err
}
return nil
}
@@ -212,17 +227,14 @@ func (m *legacyManager) Destroy() error {
}
unitName := getUnitName(m.cgroups)
err = stopUnit(dbusConnection, unitName)
stopErr := stopUnit(dbusConnection, unitName)
// Both on success and on error, cleanup all the cgroups we are aware of.
// Some of them were created directly by Apply() and are not managed by systemd.
if err := cgroups.RemovePaths(m.paths); err != nil {
return err
}
if err != nil {
return err
}
m.paths = make(map[string]string)
return nil
return stopErr
}
func (m *legacyManager) Path(subsys string) string {
@@ -231,48 +243,25 @@ func (m *legacyManager) Path(subsys string) string {
return m.paths[subsys]
}
func join(c *configs.Cgroup, subsystem string, pid int) (string, error) {
path, err := getSubsystemPath(c, subsystem)
if err != nil {
return "", err
}
if err := os.MkdirAll(path, 0755); err != nil {
return "", err
}
if err := cgroups.WriteCgroupProc(path, pid); err != nil {
return "", err
}
return path, nil
}
func joinCgroups(c *configs.Cgroup, pid int) error {
func (m *legacyManager) joinCgroups(pid int) error {
for _, sys := range legacySubsystems {
name := sys.Name()
switch name {
case "name=systemd":
// let systemd handle this
case "cpuset":
path, err := getSubsystemPath(c, name)
if err != nil && !cgroups.IsNotFound(err) {
return err
}
s := &fs.CpusetGroup{}
if err := s.ApplyDir(path, c, pid); err != nil {
return err
}
default:
_, err := join(c, name, pid)
if err != nil {
// Even if it's `not found` error, we'll return err
// because devices cgroup is hard requirement for
// container security.
if name == "devices" {
if path, ok := m.paths[name]; ok {
s := &fs.CpusetGroup{}
if err := s.ApplyDir(path, m.cgroups, pid); err != nil {
return err
}
// For other subsystems, omit the `not found` error
// because they are optional.
if !cgroups.IsNotFound(err) {
}
default:
if path, ok := m.paths[name]; ok {
if err := os.MkdirAll(path, 0755); err != nil {
return err
}
if err := cgroups.WriteCgroupProc(path, pid); err != nil {
return err
}
}
@@ -283,7 +272,7 @@ func joinCgroups(c *configs.Cgroup, pid int) error {
}
func getSubsystemPath(c *configs.Cgroup, subsystem string) (string, error) {
mountpoint, err := cgroups.FindCgroupMountpoint(c.Path, subsystem)
mountpoint, err := cgroups.FindCgroupMountpoint("", subsystem)
if err != nil {
return "", err
}
@@ -309,15 +298,14 @@ func getSubsystemPath(c *configs.Cgroup, subsystem string) (string, error) {
}
func (m *legacyManager) Freeze(state configs.FreezerState) error {
path, err := getSubsystemPath(m.cgroups, "freezer")
if err != nil {
return err
path, ok := m.paths["freezer"]
if !ok {
return errSubsystemDoesNotExist
}
prevState := m.cgroups.Resources.Freezer
m.cgroups.Resources.Freezer = state
freezer := &fs.FreezerGroup{}
err = freezer.Set(path, m.cgroups)
if err != nil {
if err := freezer.Set(path, m.cgroups); err != nil {
m.cgroups.Resources.Freezer = prevState
return err
}
@@ -325,17 +313,17 @@ func (m *legacyManager) Freeze(state configs.FreezerState) error {
}
func (m *legacyManager) GetPids() ([]int, error) {
path, err := getSubsystemPath(m.cgroups, "devices")
if err != nil {
return nil, err
path, ok := m.paths["devices"]
if !ok {
return nil, errSubsystemDoesNotExist
}
return cgroups.GetPids(path)
}
func (m *legacyManager) GetAllPids() ([]int, error) {
path, err := getSubsystemPath(m.cgroups, "devices")
if err != nil {
return nil, err
path, ok := m.paths["devices"]
if !ok {
return nil, errSubsystemDoesNotExist
}
return cgroups.GetAllPids(path)
}
@@ -363,6 +351,9 @@ func (m *legacyManager) Set(container *configs.Config) error {
if m.cgroups.Paths != nil {
return nil
}
if container.Cgroups.Resources.Unified != nil {
return cgroups.ErrV1NoUnified
}
dbusConnection, err := getDbusConnection(false)
if err != nil {
return err
@@ -406,9 +397,9 @@ func (m *legacyManager) Set(container *configs.Config) error {
for _, sys := range legacySubsystems {
// Get the subsystem path, but don't error out for not found cgroups.
path, err := getSubsystemPath(container.Cgroups, sys.Name())
if err != nil && !cgroups.IsNotFound(err) {
return err
path, ok := m.paths[sys.Name()]
if !ok {
continue
}
if err := sys.Set(path, container.Cgroups); err != nil {
return err
@@ -420,7 +411,10 @@ func (m *legacyManager) Set(container *configs.Config) error {
func enableKmem(c *configs.Cgroup) error {
path, err := getSubsystemPath(c, "memory")
if err != nil && !cgroups.IsNotFound(err) {
if err != nil {
if cgroups.IsNotFound(err) {
return nil
}
return err
}
@@ -429,7 +423,7 @@ func enableKmem(c *configs.Cgroup) error {
}
// do not try to enable the kernel memory if we already have
// tasks in the cgroup.
content, err := ioutil.ReadFile(filepath.Join(path, "tasks"))
content, err := fscommon.ReadFile(path, "tasks")
if err != nil {
return err
}
@@ -450,9 +444,9 @@ func (m *legacyManager) GetCgroups() (*configs.Cgroup, error) {
}
func (m *legacyManager) GetFreezerState() (configs.FreezerState, error) {
path, err := getSubsystemPath(m.cgroups, "freezer")
if err != nil && !cgroups.IsNotFound(err) {
return configs.Undefined, err
path, ok := m.paths["freezer"]
if !ok {
return configs.Undefined, nil
}
freezer := &fs.FreezerGroup{}
return freezer.GetState(path)

View File

@@ -3,6 +3,8 @@
package systemd
import (
"fmt"
"math"
"os"
"path/filepath"
"strconv"
@@ -34,6 +36,133 @@ func NewUnifiedManager(config *configs.Cgroup, path string, rootless bool) cgrou
}
}
// unifiedResToSystemdProps tries to convert from Cgroup.Resources.Unified
// key/value map (where key is cgroupfs file name) to systemd unit properties.
// This is on a best-effort basis, so the properties that are not known
// (to this function and/or systemd) are ignored (but logged with "debug"
// log level).
//
// For the list of keys, see https://www.kernel.org/doc/Documentation/cgroup-v2.txt
//
// For the list of systemd unit properties, see systemd.resource-control(5).
func unifiedResToSystemdProps(conn *systemdDbus.Conn, res map[string]string) (props []systemdDbus.Property, _ error) {
var err error
for k, v := range res {
if strings.Contains(k, "/") {
return nil, fmt.Errorf("unified resource %q must be a file name (no slashes)", k)
}
sk := strings.SplitN(k, ".", 2)
if len(sk) != 2 {
return nil, fmt.Errorf("unified resource %q must be in the form CONTROLLER.PARAMETER", k)
}
// Kernel is quite forgiving to extra whitespace
// around the value, and so should we.
v = strings.TrimSpace(v)
// Please keep cases in alphabetical order.
switch k {
case "cpu.max":
// value: quota [period]
quota := int64(0) // 0 means "unlimited" for addCpuQuota, if period is set
period := defCPUQuotaPeriod
sv := strings.Fields(v)
if len(sv) < 1 || len(sv) > 2 {
return nil, fmt.Errorf("unified resource %q value invalid: %q", k, v)
}
// quota
if sv[0] != "max" {
quota, err = strconv.ParseInt(sv[0], 10, 64)
if err != nil {
return nil, fmt.Errorf("unified resource %q period value conversion error: %w", k, err)
}
}
// period
if len(sv) == 2 {
period, err = strconv.ParseUint(sv[1], 10, 64)
if err != nil {
return nil, fmt.Errorf("unified resource %q quota value conversion error: %w", k, err)
}
}
addCpuQuota(conn, &props, quota, period)
case "cpu.weight":
num, err := strconv.ParseUint(v, 10, 64)
if err != nil {
return nil, fmt.Errorf("unified resource %q value conversion error: %w", k, err)
}
props = append(props,
newProp("CPUWeight", num))
case "cpuset.cpus", "cpuset.mems":
bits, err := rangeToBits(v)
if err != nil {
return nil, fmt.Errorf("unified resource %q=%q conversion error: %w", k, v, err)
}
m := map[string]string{
"cpuset.cpus": "AllowedCPUs",
"cpuset.mems": "AllowedMemoryNodes",
}
// systemd only supports these properties since v244
sdVer := systemdVersion(conn)
if sdVer >= 244 {
props = append(props,
newProp(m[k], bits))
} else {
logrus.Debugf("systemd v%d is too old to support %s"+
" (setting will still be applied to cgroupfs)",
sdVer, m[k])
}
case "memory.high", "memory.low", "memory.min", "memory.max", "memory.swap.max":
num := uint64(math.MaxUint64)
if v != "max" {
num, err = strconv.ParseUint(v, 10, 64)
if err != nil {
return nil, fmt.Errorf("unified resource %q value conversion error: %w", k, err)
}
}
m := map[string]string{
"memory.high": "MemoryHigh",
"memory.low": "MemoryLow",
"memory.min": "MemoryMin",
"memory.max": "MemoryMax",
"memory.swap.max": "MemorySwapMax",
}
props = append(props,
newProp(m[k], num))
case "pids.max":
num := uint64(math.MaxUint64)
if v != "max" {
var err error
num, err = strconv.ParseUint(v, 10, 64)
if err != nil {
return nil, fmt.Errorf("unified resource %q value conversion error: %w", k, err)
}
}
props = append(props,
newProp("TasksAccounting", true),
newProp("TasksMax", num))
case "memory.oom.group":
// Setting this to 1 is roughly equivalent to OOMPolicy=kill
// (as per systemd.service(5) and
// https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html),
// but it's not clear what to do if it is unset or set
// to 0 in runc update, as there are two other possible
// values for OOMPolicy (continue/stop).
fallthrough
default:
// Ignore the unknown resource here -- will still be
// applied in Set which calls fs2.Set.
logrus.Debugf("don't know how to convert unified resource %q=%q to systemd unit property; skipping (will still be applied to cgroupfs)", k, v)
}
}
return props, nil
}
func genV2ResourcesProperties(c *configs.Cgroup, conn *systemdDbus.Conn) ([]systemdDbus.Property, error) {
var properties []systemdDbus.Property
r := c.Resources
@@ -80,8 +209,22 @@ func genV2ResourcesProperties(c *configs.Cgroup, conn *systemdDbus.Conn) ([]syst
newProp("TasksMax", uint64(r.PidsLimit)))
}
err = addCpuset(conn, &properties, r.CpusetCpus, r.CpusetMems)
if err != nil {
return nil, err
}
// ignore r.KernelMemory
// convert Resources.Unified map to systemd properties
if r.Unified != nil {
unifiedProps, err := unifiedResToSystemdProps(conn, r.Unified)
if err != nil {
return nil, err
}
properties = append(properties, unifiedProps...)
}
return properties, nil
}

View File

@@ -15,7 +15,9 @@ import (
"sync"
"time"
units "github.com/docker/go-units"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"github.com/opencontainers/runc/libcontainer/system"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
)
@@ -29,19 +31,19 @@ var (
isUnified bool
)
// HugePageSizeUnitList is a list of the units used by the linux kernel when
// naming the HugePage control files.
// https://www.kernel.org/doc/Documentation/cgroup-v1/hugetlb.txt
// TODO Since the kernel only use KB, MB and GB; TB and PB should be removed,
// depends on https://github.com/docker/go-units/commit/a09cd47f892041a4fac473133d181f5aea6fa393
var HugePageSizeUnitList = []string{"B", "KB", "MB", "GB", "TB", "PB"}
// IsCgroup2UnifiedMode returns whether we are running in cgroup v2 unified mode.
func IsCgroup2UnifiedMode() bool {
isUnifiedOnce.Do(func() {
var st unix.Statfs_t
if err := unix.Statfs(unifiedMountpoint, &st); err != nil {
panic("cannot statfs cgroup root")
err := unix.Statfs(unifiedMountpoint, &st)
if err != nil {
if os.IsNotExist(err) && system.RunningInUserNS() {
// ignore the "not found" error if running in userns
logrus.WithError(err).Debugf("%s missing, assuming cgroup v1", unifiedMountpoint)
isUnified = false
return
}
panic(fmt.Sprintf("cannot statfs cgroup root: %s", err))
}
isUnified = st.Type == unix.CGROUP2_SUPER_MAGIC
})
@@ -86,11 +88,11 @@ func GetAllSubsystems() ([]string, error) {
// - freezer: implemented in kernel 5.2
// We assume these are always available, as it is hard to detect availability.
pseudo := []string{"devices", "freezer"}
data, err := ioutil.ReadFile("/sys/fs/cgroup/cgroup.controllers")
data, err := fscommon.ReadFile("/sys/fs/cgroup", "cgroup.controllers")
if err != nil {
return nil, err
}
subsystems := append(pseudo, strings.Fields(string(data))...)
subsystems := append(pseudo, strings.Fields(data)...)
return subsystems, nil
}
f, err := os.Open("/proc/cgroups")
@@ -207,20 +209,66 @@ func EnterPid(cgroupPaths map[string]string, pid int) error {
return nil
}
func rmdir(path string) error {
err := unix.Rmdir(path)
if err == nil || err == unix.ENOENT {
return nil
}
return &os.PathError{Op: "rmdir", Path: path, Err: err}
}
// RemovePath aims to remove cgroup path. It does so recursively,
// by removing any subdirectories (sub-cgroups) first.
func RemovePath(path string) error {
// try the fast path first
if err := rmdir(path); err == nil {
return nil
}
infos, err := ioutil.ReadDir(path)
if err != nil {
if os.IsNotExist(err) {
err = nil
}
return err
}
for _, info := range infos {
if info.IsDir() {
// We should remove subcgroups dir first
if err = RemovePath(filepath.Join(path, info.Name())); err != nil {
break
}
}
}
if err == nil {
err = rmdir(path)
}
return err
}
// RemovePaths iterates over the provided paths removing them.
// We trying to remove all paths five times with increasing delay between tries.
// If after all there are not removed cgroups - appropriate error will be
// returned.
func RemovePaths(paths map[string]string) (err error) {
const retries = 5
delay := 10 * time.Millisecond
for i := 0; i < 5; i++ {
for i := 0; i < retries; i++ {
if i != 0 {
time.Sleep(delay)
delay *= 2
}
for s, p := range paths {
os.RemoveAll(p)
// TODO: here probably should be logging
if err := RemovePath(p); err != nil {
// do not log intermediate iterations
switch i {
case 0:
logrus.WithError(err).Warnf("Failed to remove cgroup (will retry)")
case retries - 1:
logrus.WithError(err).Error("Failed to remove cgroup")
}
}
_, err := os.Stat(p)
// We need this strange way of checking cgroups existence because
// RemoveAll almost always returns error, even on already removed
@@ -230,6 +278,8 @@ func RemovePaths(paths map[string]string) (err error) {
}
}
if len(paths) == 0 {
//nolint:ineffassign,staticcheck // done to help garbage collecting: opencontainers/runc#2506
paths = make(map[string]string)
return nil
}
}
@@ -237,27 +287,50 @@ func RemovePaths(paths map[string]string) (err error) {
}
func GetHugePageSize() ([]string, error) {
files, err := ioutil.ReadDir("/sys/kernel/mm/hugepages")
dir, err := os.OpenFile("/sys/kernel/mm/hugepages", unix.O_DIRECTORY|unix.O_RDONLY, 0)
if err != nil {
return []string{}, err
return nil, err
}
var fileNames []string
for _, st := range files {
fileNames = append(fileNames, st.Name())
files, err := dir.Readdirnames(0)
dir.Close()
if err != nil {
return nil, err
}
return getHugePageSizeFromFilenames(fileNames)
return getHugePageSizeFromFilenames(files)
}
func getHugePageSizeFromFilenames(fileNames []string) ([]string, error) {
var pageSizes []string
for _, fileName := range fileNames {
nameArray := strings.Split(fileName, "-")
pageSize, err := units.RAMInBytes(nameArray[1])
if err != nil {
return []string{}, err
pageSizes := make([]string, 0, len(fileNames))
for _, file := range fileNames {
// example: hugepages-1048576kB
val := strings.TrimPrefix(file, "hugepages-")
if len(val) == len(file) {
// unexpected file name: no prefix found
continue
}
sizeString := units.CustomSize("%g%s", float64(pageSize), 1024.0, HugePageSizeUnitList)
pageSizes = append(pageSizes, sizeString)
// The suffix is always "kB" (as of Linux 5.9)
eLen := len(val) - 2
val = strings.TrimSuffix(val, "kB")
if len(val) != eLen {
logrus.Warnf("GetHugePageSize: %s: invalid filename suffix (expected \"kB\")", file)
continue
}
size, err := strconv.Atoi(val)
if err != nil {
return nil, err
}
// Model after https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/mm/hugetlb_cgroup.c?id=eff48ddeab782e35e58ccc8853f7386bbae9dec4#n574
// but in our case the size is in KB already.
if size >= (1 << 20) {
val = strconv.Itoa(size>>20) + "GB"
} else if size >= (1 << 10) {
val = strconv.Itoa(size>>10) + "MB"
} else {
val += "KB"
}
pageSizes = append(pageSizes, val)
}
return pageSizes, nil
@@ -303,14 +376,14 @@ func WriteCgroupProc(dir string, pid int) error {
return nil
}
cgroupProcessesFile, err := os.OpenFile(filepath.Join(dir, CgroupProcesses), os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0700)
file, err := fscommon.OpenFile(dir, CgroupProcesses, os.O_WRONLY)
if err != nil {
return fmt.Errorf("failed to write %v to %v: %v", pid, CgroupProcesses, err)
}
defer cgroupProcessesFile.Close()
defer file.Close()
for i := 0; i < 5; i++ {
_, err = cgroupProcessesFile.WriteString(strconv.Itoa(pid))
_, err = file.WriteString(strconv.Itoa(pid))
if err == nil {
return nil
}

View File

@@ -1,16 +1,16 @@
package cgroups
import (
"bufio"
"errors"
"fmt"
"io"
"os"
"path/filepath"
"strings"
"sync"
"syscall"
securejoin "github.com/cyphar/filepath-securejoin"
"github.com/moby/sys/mountinfo"
"golang.org/x/sys/unix"
)
@@ -23,7 +23,12 @@ const (
)
var (
errUnified = errors.New("not implemented for cgroup v2 unified hierarchy")
errUnified = errors.New("not implemented for cgroup v2 unified hierarchy")
ErrV1NoUnified = errors.New("invalid configuration: cannot use unified on cgroup v1")
readMountinfoOnce sync.Once
readMountinfoErr error
cgroupMountinfo []*mountinfo.Info
)
type NotFoundError struct {
@@ -90,6 +95,21 @@ func tryDefaultPath(cgroupPath, subsystem string) string {
return path
}
// readCgroupMountinfo returns a list of cgroup v1 mounts (i.e. the ones
// with fstype of "cgroup") for the current running process.
//
// The results are cached (to avoid re-reading mountinfo which is relatively
// expensive), so it is assumed that cgroup mounts are not being changed.
func readCgroupMountinfo() ([]*mountinfo.Info, error) {
readMountinfoOnce.Do(func() {
cgroupMountinfo, readMountinfoErr = mountinfo.GetMounts(
mountinfo.FSTypeFilter("cgroup"),
)
})
return cgroupMountinfo, readMountinfoErr
}
// https://www.kernel.org/doc/Documentation/cgroup-v1/cgroups.txt
func FindCgroupMountpoint(cgroupPath, subsystem string) (string, error) {
if IsCgroup2UnifiedMode() {
@@ -110,56 +130,28 @@ func FindCgroupMountpointAndRoot(cgroupPath, subsystem string) (string, string,
return "", "", errUnified
}
// Avoid parsing mountinfo by checking if subsystem is valid/available.
if !isSubsystemAvailable(subsystem) {
return "", "", NewNotFoundError(subsystem)
}
f, err := os.Open("/proc/self/mountinfo")
mi, err := readCgroupMountinfo()
if err != nil {
return "", "", err
}
defer f.Close()
return findCgroupMountpointAndRootFromReader(f, cgroupPath, subsystem)
return findCgroupMountpointAndRootFromMI(mi, cgroupPath, subsystem)
}
func findCgroupMountpointAndRootFromReader(reader io.Reader, cgroupPath, subsystem string) (string, string, error) {
scanner := bufio.NewScanner(reader)
for scanner.Scan() {
txt := scanner.Text()
fields := strings.Fields(txt)
if len(fields) < 9 {
continue
}
if strings.HasPrefix(fields[4], cgroupPath) {
for _, opt := range strings.Split(fields[len(fields)-1], ",") {
func findCgroupMountpointAndRootFromMI(mounts []*mountinfo.Info, cgroupPath, subsystem string) (string, string, error) {
for _, mi := range mounts {
if strings.HasPrefix(mi.Mountpoint, cgroupPath) {
for _, opt := range strings.Split(mi.VFSOptions, ",") {
if opt == subsystem {
return fields[4], fields[3], nil
return mi.Mountpoint, mi.Root, nil
}
}
}
}
if err := scanner.Err(); err != nil {
return "", "", err
}
return "", "", NewNotFoundError(subsystem)
}
func isSubsystemAvailable(subsystem string) bool {
if IsCgroup2UnifiedMode() {
panic("don't call isSubsystemAvailable from cgroupv2 code")
}
cgroups, err := ParseCgroupFile("/proc/self/cgroup")
if err != nil {
return false
}
_, avail := cgroups[subsystem]
return avail
}
func (m Mount) GetOwnCgroup(cgroups map[string]string) (string, error) {
if len(m.Subsystems) == 0 {
return "", fmt.Errorf("no subsystem for mount")
@@ -168,25 +160,15 @@ func (m Mount) GetOwnCgroup(cgroups map[string]string) (string, error) {
return getControllerPath(m.Subsystems[0], cgroups)
}
func getCgroupMountsHelper(ss map[string]bool, mi io.Reader, all bool) ([]Mount, error) {
func getCgroupMountsHelper(ss map[string]bool, mounts []*mountinfo.Info, all bool) ([]Mount, error) {
res := make([]Mount, 0, len(ss))
scanner := bufio.NewScanner(mi)
numFound := 0
for scanner.Scan() && numFound < len(ss) {
txt := scanner.Text()
sepIdx := strings.Index(txt, " - ")
if sepIdx == -1 {
return nil, fmt.Errorf("invalid mountinfo format")
}
if txt[sepIdx+3:sepIdx+10] == "cgroup2" || txt[sepIdx+3:sepIdx+9] != "cgroup" {
continue
}
fields := strings.Split(txt, " ")
for _, mi := range mounts {
m := Mount{
Mountpoint: fields[4],
Root: fields[3],
Mountpoint: mi.Mountpoint,
Root: mi.Root,
}
for _, opt := range strings.Split(fields[len(fields)-1], ",") {
for _, opt := range strings.Split(mi.VFSOptions, ",") {
seen, known := ss[opt]
if !known || (!all && seen) {
continue
@@ -199,19 +181,18 @@ func getCgroupMountsHelper(ss map[string]bool, mi io.Reader, all bool) ([]Mount,
if len(m.Subsystems) > 0 || all {
res = append(res, m)
}
}
if err := scanner.Err(); err != nil {
return nil, err
if !all && numFound >= len(ss) {
break
}
}
return res, nil
}
func getCgroupMountsV1(all bool) ([]Mount, error) {
f, err := os.Open("/proc/self/mountinfo")
mi, err := readCgroupMountinfo()
if err != nil {
return nil, err
}
defer f.Close()
allSubsystems, err := ParseCgroupFile("/proc/self/cgroup")
if err != nil {
@@ -222,7 +203,8 @@ func getCgroupMountsV1(all bool) ([]Mount, error) {
for s := range allSubsystems {
allMap[s] = false
}
return getCgroupMountsHelper(allMap, f, all)
return getCgroupMountsHelper(allMap, mi, all)
}
// GetOwnCgroup returns the relative path to the cgroup docker is running in.