deps: update runc to 1.1.0

This updates vendored runc/libcontainer to 1.1.0,
and google/cadvisor to a version updated to runc 1.1.0
(google/cadvisor#3048).

Changes in vendor are generated by (roughly):

        ./hack/pin-dependency.sh github.com/google/cadvisor v0.44.0
        ./hack/pin-dependency.sh github.com/opencontainers/runc v1.1.0
        ./hack/update-vendor.sh
        ./hack/lint-dependencies.sh # And follow all its recommendations.
        ./hack/update-vendor.sh
        ./hack/update-internal-modules.sh
        ./hack/lint-dependencies.sh # Re-check everything again.

Co-Authored-By: Kir Kolyshkin <kolyshkin@gmail.com>
This commit is contained in:
Elana Hashman
2022-03-28 11:32:04 -07:00
parent 41830a1f79
commit 07af1bab70
245 changed files with 6520 additions and 5250 deletions

View File

@@ -3,7 +3,6 @@ package apparmor
import (
"errors"
"fmt"
"io/ioutil"
"os"
"sync"
@@ -19,7 +18,7 @@ var (
func isEnabled() bool {
checkAppArmor.Do(func() {
if _, err := os.Stat("/sys/kernel/security/apparmor"); err == nil {
buf, err := ioutil.ReadFile("/sys/module/apparmor/parameters/enabled")
buf, err := os.ReadFile("/sys/module/apparmor/parameters/enabled")
appArmorEnabled = err == nil && len(buf) > 1 && buf[0] == 'Y'
}
})
@@ -52,7 +51,7 @@ func setProcAttr(attr, value string) error {
// changeOnExec reimplements aa_change_onexec from libapparmor in Go
func changeOnExec(name string) error {
if err := setProcAttr("exec", "exec "+name); err != nil {
return fmt.Errorf("apparmor failed to apply profile: %s", err)
return fmt.Errorf("apparmor failed to apply profile: %w", err)
}
return nil
}

View File

@@ -1,3 +1,4 @@
//go:build !linux
// +build !linux
package apparmor

View File

@@ -1,3 +1,4 @@
//go:build linux
// +build linux
package capabilities
@@ -34,6 +35,17 @@ func init() {
}
}
// KnownCapabilities returns the list of the known capabilities.
// Used by `runc features`.
func KnownCapabilities() []string {
list := capability.List()
res := make([]string, len(list))
for i, c := range list {
res[i] = "CAP_" + strings.ToUpper(c.String())
}
return res
}
// New creates a new Caps from the given Capabilities config. Unknown Capabilities
// or Capabilities that are unavailable in the current environment are ignored,
// printing a warning instead.

View File

@@ -1,3 +1,4 @@
//go:build !linux
// +build !linux
package capabilities

View File

@@ -1,5 +1,3 @@
// +build linux
package cgroups
import (

View File

@@ -1,3 +0,0 @@
// +build !linux
package cgroups

View File

@@ -1,5 +1,3 @@
// +build linux
// SPDX-License-Identifier: Apache-2.0
/*
* Copyright (C) 2020 Aleksa Sarai <cyphar@cyphar.com>
@@ -22,14 +20,13 @@ package devices
import (
"bufio"
"fmt"
"io"
"regexp"
"sort"
"strconv"
"strings"
"github.com/opencontainers/runc/libcontainer/devices"
"github.com/pkg/errors"
)
// deviceMeta is a Rule without the Allow or Permissions fields, and no
@@ -79,19 +76,21 @@ func (e *Emulator) IsAllowAll() bool {
return e.IsBlacklist() && len(e.rules) == 0
}
var devicesListRegexp = regexp.MustCompile(`^([abc])\s+(\d+|\*):(\d+|\*)\s+([rwm]+)$`)
func parseLine(line string) (*deviceRule, error) {
matches := devicesListRegexp.FindStringSubmatch(line)
if matches == nil {
return nil, errors.Errorf("line doesn't match devices.list format")
// Input: node major:minor perms.
fields := strings.FieldsFunc(line, func(r rune) bool {
return r == ' ' || r == ':'
})
if len(fields) != 4 {
return nil, fmt.Errorf("malformed devices.list rule %s", line)
}
var (
rule deviceRule
node = matches[1]
major = matches[2]
minor = matches[3]
perms = matches[4]
node = fields[0]
major = fields[1]
minor = fields[2]
perms = fields[3]
)
// Parse the node type.
@@ -107,8 +106,7 @@ func parseLine(line string) (*deviceRule, error) {
case "c":
rule.meta.node = devices.CharDevice
default:
// Should never happen!
return nil, errors.Errorf("unknown device type %q", node)
return nil, fmt.Errorf("unknown device type %q", node)
}
// Parse the major number.
@@ -117,7 +115,7 @@ func parseLine(line string) (*deviceRule, error) {
} else {
val, err := strconv.ParseUint(major, 10, 32)
if err != nil {
return nil, errors.Wrap(err, "parse major number")
return nil, fmt.Errorf("invalid major number: %w", err)
}
rule.meta.major = int64(val)
}
@@ -128,7 +126,7 @@ func parseLine(line string) (*deviceRule, error) {
} else {
val, err := strconv.ParseUint(minor, 10, 32)
if err != nil {
return nil, errors.Wrap(err, "parse minor number")
return nil, fmt.Errorf("invalid minor number: %w", err)
}
rule.meta.minor = int64(val)
}
@@ -136,13 +134,12 @@ func parseLine(line string) (*deviceRule, error) {
// Parse the access permissions.
rule.perms = devices.Permissions(perms)
if !rule.perms.IsValid() || rule.perms.IsEmpty() {
// Should never happen!
return nil, errors.Errorf("parse access mode: contained unknown modes or is empty: %q", perms)
return nil, fmt.Errorf("parse access mode: contained unknown modes or is empty: %q", perms)
}
return &rule, nil
}
func (e *Emulator) addRule(rule deviceRule) error {
func (e *Emulator) addRule(rule deviceRule) error { //nolint:unparam
if e.rules == nil {
e.rules = make(map[deviceMeta]devices.Permissions)
}
@@ -180,7 +177,7 @@ func (e *Emulator) rmRule(rule deviceRule) error {
// Only give an error if the set of permissions overlap.
partialPerms := e.rules[partialMeta]
if !partialPerms.Intersection(rule.perms).IsEmpty() {
return errors.Errorf("requested rule [%v %v] not supported by devices cgroupv1 (cannot punch hole in existing wildcard rule [%v %v])", rule.meta, rule.perms, partialMeta, partialPerms)
return fmt.Errorf("requested rule [%v %v] not supported by devices cgroupv1 (cannot punch hole in existing wildcard rule [%v %v])", rule.meta, rule.perms, partialMeta, partialPerms)
}
}
@@ -212,9 +209,9 @@ func (e *Emulator) allow(rule *deviceRule) error {
var err error
if e.defaultAllow {
err = errors.Wrap(e.rmRule(*rule), "remove 'deny' exception")
err = wrapErr(e.rmRule(*rule), "unable to remove 'deny' exception")
} else {
err = errors.Wrap(e.addRule(*rule), "add 'allow' exception")
err = wrapErr(e.addRule(*rule), "unable to add 'allow' exception")
}
return err
}
@@ -232,16 +229,16 @@ func (e *Emulator) deny(rule *deviceRule) error {
var err error
if e.defaultAllow {
err = errors.Wrap(e.addRule(*rule), "add 'deny' exception")
err = wrapErr(e.addRule(*rule), "unable to add 'deny' exception")
} else {
err = errors.Wrap(e.rmRule(*rule), "remove 'allow' exception")
err = wrapErr(e.rmRule(*rule), "unable to remove 'allow' exception")
}
return err
}
func (e *Emulator) Apply(rule devices.Rule) error {
if !rule.Type.CanCgroup() {
return errors.Errorf("cannot add rule [%#v] with non-cgroup type %q", rule, rule.Type)
return fmt.Errorf("cannot add rule [%#v] with non-cgroup type %q", rule, rule.Type)
}
innerRule := &deviceRule{
@@ -283,17 +280,17 @@ func EmulatorFromList(list io.Reader) (*Emulator, error) {
line := s.Text()
deviceRule, err := parseLine(line)
if err != nil {
return nil, errors.Wrapf(err, "parsing line %q", line)
return nil, fmt.Errorf("error parsing line %q: %w", line, err)
}
// "devices.list" is an allow list. Note that this means that in
// black-list mode, we have no idea what rules are in play. As a
// result, we need to be very careful in Transition().
if err := e.allow(deviceRule); err != nil {
return nil, errors.Wrapf(err, "adding devices.list rule")
return nil, fmt.Errorf("error adding devices.list rule: %w", err)
}
}
if err := s.Err(); err != nil {
return nil, errors.Wrap(err, "reading devices.list lines")
return nil, fmt.Errorf("error reading devices.list lines: %w", err)
}
return e, nil
}
@@ -305,7 +302,7 @@ func EmulatorFromList(list io.Reader) (*Emulator, error) {
// necessary.
//
// This function is the sole reason for all of Emulator -- to allow us
// to figure out how to update a containers' cgroups without causing spurrious
// to figure out how to update a containers' cgroups without causing spurious
// device errors (if possible).
func (source *Emulator) Transition(target *Emulator) ([]*devices.Rule, error) {
var transitionRules []*devices.Rule
@@ -380,3 +377,10 @@ func (e *Emulator) Rules() ([]*devices.Rule, error) {
defaultCgroup := &Emulator{defaultAllow: false}
return defaultCgroup.Transition(e)
}
func wrapErr(err error, text string) error {
if err == nil {
return nil
}
return fmt.Errorf(text+": %w", err)
}

View File

@@ -7,13 +7,14 @@
package devicefilter
import (
"errors"
"fmt"
"math"
"strconv"
"github.com/cilium/ebpf/asm"
devicesemulator "github.com/opencontainers/runc/libcontainer/cgroups/devices"
"github.com/opencontainers/runc/libcontainer/devices"
"github.com/pkg/errors"
"golang.org/x/sys/unix"
)
@@ -51,21 +52,20 @@ func DeviceFilter(rules []*devices.Rule) (asm.Instructions, string, error) {
// only be one (at most) at the very start to instruct cgroupv1 to
// go into allow-list mode. However we do double-check this here.
if idx != 0 || rule.Allow != emu.IsBlacklist() {
return nil, "", errors.Errorf("[internal error] emulated cgroupv2 devices ruleset had bad wildcard at idx %v (%s)", idx, rule.CgroupString())
return nil, "", fmt.Errorf("[internal error] emulated cgroupv2 devices ruleset had bad wildcard at idx %v (%s)", idx, rule.CgroupString())
}
continue
}
if rule.Allow == p.defaultAllow {
// There should be no rules which have an action equal to the
// default action, the emulator removes those.
return nil, "", errors.Errorf("[internal error] emulated cgroupv2 devices ruleset had no-op rule at idx %v (%s)", idx, rule.CgroupString())
return nil, "", fmt.Errorf("[internal error] emulated cgroupv2 devices ruleset had no-op rule at idx %v (%s)", idx, rule.CgroupString())
}
if err := p.appendRule(rule); err != nil {
return nil, "", err
}
}
insts, err := p.finalize()
return insts, license, err
return p.finalize(), license, nil
}
type program struct {
@@ -118,13 +118,13 @@ func (p *program) appendRule(rule *devices.Rule) error {
bpfType = int32(unix.BPF_DEVCG_DEV_BLOCK)
default:
// We do not permit 'a', nor any other types we don't know about.
return errors.Errorf("invalid type %q", string(rule.Type))
return fmt.Errorf("invalid type %q", string(rule.Type))
}
if rule.Major > math.MaxUint32 {
return errors.Errorf("invalid major %d", rule.Major)
return fmt.Errorf("invalid major %d", rule.Major)
}
if rule.Minor > math.MaxUint32 {
return errors.Errorf("invalid minor %d", rule.Major)
return fmt.Errorf("invalid minor %d", rule.Major)
}
hasMajor := rule.Major >= 0 // if not specified in OCI json, major is set to -1
hasMinor := rule.Minor >= 0
@@ -138,7 +138,7 @@ func (p *program) appendRule(rule *devices.Rule) error {
case 'm':
bpfAccess |= unix.BPF_DEVCG_ACC_MKNOD
default:
return errors.Errorf("unknown device access %v", r)
return fmt.Errorf("unknown device access %v", r)
}
}
// If the access is rwm, skip the check.
@@ -180,7 +180,7 @@ func (p *program) appendRule(rule *devices.Rule) error {
return nil
}
func (p *program) finalize() (asm.Instructions, error) {
func (p *program) finalize() asm.Instructions {
var v int32
if p.defaultAllow {
v = 1
@@ -192,7 +192,7 @@ func (p *program) finalize() (asm.Instructions, error) {
asm.Return(),
)
p.blockID = -1
return p.insts, nil
return p.insts
}
func acceptBlock(accept bool) asm.Instructions {

View File

@@ -1,6 +1,7 @@
package ebpf
import (
"errors"
"fmt"
"os"
"runtime"
@@ -10,7 +11,6 @@ import (
"github.com/cilium/ebpf"
"github.com/cilium/ebpf/asm"
"github.com/cilium/ebpf/link"
"github.com/pkg/errors"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
)
@@ -134,7 +134,7 @@ func haveBpfProgReplace() bool {
// not supported
return
}
// attach_flags test succeded.
// attach_flags test succeeded.
if !errors.Is(err, unix.EBADF) {
logrus.Debugf("checking for BPF_F_REPLACE: got unexpected (not EBADF or EINVAL) error: %v", err)
}

View File

@@ -2,20 +2,27 @@ package cgroups
import (
"bytes"
"errors"
"fmt"
"os"
"path"
"strconv"
"strings"
"sync"
"github.com/pkg/errors"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
)
// OpenFile opens a cgroup file in a given dir with given flags.
// It is supposed to be used for cgroup files only.
// It is supposed to be used for cgroup files only, and returns
// an error if the file is not a cgroup file.
//
// Arguments dir and file are joined together to form an absolute path
// to a file being opened.
func OpenFile(dir, file string, flags int) (*os.File, error) {
if dir == "" {
return nil, errors.Errorf("no directory specified for %s", file)
return nil, fmt.Errorf("no directory specified for %s", file)
}
return openFile(dir, file, flags)
}
@@ -43,7 +50,8 @@ func WriteFile(dir, file, data string) error {
}
defer fd.Close()
if err := retryingWriteFile(fd, data); err != nil {
return errors.Wrapf(err, "failed to write %q", data)
// Having data in the error message helps in debugging.
return fmt.Errorf("failed to write %q: %w", data, err)
}
return nil
}
@@ -81,7 +89,7 @@ func prepareOpenat2() error {
})
if err != nil {
prepErr = &os.PathError{Op: "openat2", Path: cgroupfsDir, Err: err}
if err != unix.ENOSYS {
if err != unix.ENOSYS { //nolint:errorlint // unix errors are bare
logrus.Warnf("falling back to securejoin: %s", prepErr)
} else {
logrus.Debug("openat2 not available, falling back to securejoin")
@@ -107,8 +115,6 @@ func prepareOpenat2() error {
return prepErr
}
// OpenFile opens a cgroup file in a given dir with given flags.
// It is supposed to be used for cgroup files only.
func openFile(dir, file string, flags int) (*os.File, error) {
mode := os.FileMode(0)
if TestMode && flags&os.O_WRONLY != 0 {
@@ -116,34 +122,52 @@ func openFile(dir, file string, flags int) (*os.File, error) {
flags |= os.O_TRUNC | os.O_CREATE
mode = 0o600
}
path := path.Join(dir, file)
if prepareOpenat2() != nil {
return openFallback(dir, file, flags, mode)
return openFallback(path, flags, mode)
}
reldir := strings.TrimPrefix(dir, cgroupfsPrefix)
if len(reldir) == len(dir) { // non-standard path, old system?
return openFallback(dir, file, flags, mode)
relPath := strings.TrimPrefix(path, cgroupfsPrefix)
if len(relPath) == len(path) { // non-standard path, old system?
return openFallback(path, flags, mode)
}
relname := reldir + "/" + file
fd, err := unix.Openat2(cgroupFd, relname,
fd, err := unix.Openat2(cgroupFd, relPath,
&unix.OpenHow{
Resolve: resolveFlags,
Flags: uint64(flags) | unix.O_CLOEXEC,
Mode: uint64(mode),
})
if err != nil {
return nil, &os.PathError{Op: "openat2", Path: dir + "/" + file, Err: err}
err = &os.PathError{Op: "openat2", Path: path, Err: err}
// Check if cgroupFd is still opened to cgroupfsDir
// (happens when this package is incorrectly used
// across the chroot/pivot_root/mntns boundary, or
// when /sys/fs/cgroup is remounted).
//
// TODO: if such usage will ever be common, amend this
// to reopen cgroupFd and retry openat2.
fdStr := strconv.Itoa(cgroupFd)
fdDest, _ := os.Readlink("/proc/self/fd/" + fdStr)
if fdDest != cgroupfsDir {
// Wrap the error so it is clear that cgroupFd
// is opened to an unexpected/wrong directory.
err = fmt.Errorf("cgroupFd %s unexpectedly opened to %s != %s: %w",
fdStr, fdDest, cgroupfsDir, err)
}
return nil, err
}
return os.NewFile(uintptr(fd), cgroupfsPrefix+relname), nil
return os.NewFile(uintptr(fd), path), nil
}
var errNotCgroupfs = errors.New("not a cgroup file")
// openFallback is used when openat2(2) is not available. It checks the opened
// Can be changed by unit tests.
var openFallback = openAndCheck
// openAndCheck is used when openat2(2) is not available. It checks the opened
// file is on cgroupfs, returning an error otherwise.
func openFallback(dir, file string, flags int, mode os.FileMode) (*os.File, error) {
path := dir + "/" + file
func openAndCheck(path string, flags int, mode os.FileMode) (*os.File, error) {
fd, err := os.OpenFile(path, flags, mode)
if err != nil {
return nil, err

View File

@@ -1,10 +1,7 @@
// +build linux
package fs
import (
"bufio"
"fmt"
"os"
"path/filepath"
"strconv"
@@ -23,8 +20,8 @@ func (s *BlkioGroup) Name() string {
return "blkio"
}
func (s *BlkioGroup) Apply(path string, d *cgroupData) error {
return join(path, d.pid)
func (s *BlkioGroup) Apply(path string, _ *configs.Resources, pid int) error {
return apply(path, pid)
}
func (s *BlkioGroup) Set(path string, r *configs.Resources) error {
@@ -131,19 +128,19 @@ func getBlkioStat(dir, file string) ([]cgroups.BlkioStatEntry, error) {
// skip total line
continue
} else {
return nil, fmt.Errorf("Invalid line found while parsing %s/%s: %s", dir, file, sc.Text())
return nil, malformedLine(dir, file, sc.Text())
}
}
v, err := strconv.ParseUint(fields[0], 10, 64)
if err != nil {
return nil, err
return nil, &parseError{Path: dir, File: file, Err: err}
}
major := v
v, err = strconv.ParseUint(fields[1], 10, 64)
if err != nil {
return nil, err
return nil, &parseError{Path: dir, File: file, Err: err}
}
minor := v
@@ -155,10 +152,13 @@ func getBlkioStat(dir, file string) ([]cgroups.BlkioStatEntry, error) {
}
v, err = strconv.ParseUint(fields[valueField], 10, 64)
if err != nil {
return nil, err
return nil, &parseError{Path: dir, File: file, Err: err}
}
blkioStats = append(blkioStats, cgroups.BlkioStatEntry{Major: major, Minor: minor, Op: op, Value: v})
}
if err := sc.Err(); err != nil {
return nil, &parseError{Path: dir, File: file, Err: err}
}
return blkioStats, nil
}

View File

@@ -1,5 +1,3 @@
// +build linux
package fs
import (
@@ -21,24 +19,19 @@ func (s *CpuGroup) Name() string {
return "cpu"
}
func (s *CpuGroup) Apply(path string, d *cgroupData) error {
// This might happen if we have no cpu cgroup mounted.
// Just do nothing and don't fail.
if path == "" {
return nil
}
func (s *CpuGroup) Apply(path string, r *configs.Resources, pid int) error {
if err := os.MkdirAll(path, 0o755); err != nil {
return err
}
// We should set the real-Time group scheduling settings before moving
// in the process because if the process is already in SCHED_RR mode
// and no RT bandwidth is set, adding it will fail.
if err := s.SetRtSched(path, d.config.Resources); err != nil {
if err := s.SetRtSched(path, r); err != nil {
return err
}
// Since we are not using join(), we need to place the pid
// into the procs file unlike other subsystems.
return cgroups.WriteCgroupProc(path, d.pid)
// Since we are not using apply(), we need to place the pid
// into the procs file.
return cgroups.WriteCgroupProc(path, pid)
}
func (s *CpuGroup) SetRtSched(path string, r *configs.Resources) error {
@@ -105,7 +98,8 @@ func (s *CpuGroup) Set(path string, r *configs.Resources) error {
}
func (s *CpuGroup) GetStats(path string, stats *cgroups.Stats) error {
f, err := cgroups.OpenFile(path, "cpu.stat", os.O_RDONLY)
const file = "cpu.stat"
f, err := cgroups.OpenFile(path, file, os.O_RDONLY)
if err != nil {
if os.IsNotExist(err) {
return nil
@@ -118,7 +112,7 @@ func (s *CpuGroup) GetStats(path string, stats *cgroups.Stats) error {
for sc.Scan() {
t, v, err := fscommon.ParseKeyValue(sc.Text())
if err != nil {
return err
return &parseError{Path: path, File: file, Err: err}
}
switch t {
case "nr_periods":

View File

@@ -1,12 +1,8 @@
// +build linux
package fs
import (
"bufio"
"fmt"
"os"
"path/filepath"
"strconv"
"strings"
@@ -38,8 +34,8 @@ func (s *CpuacctGroup) Name() string {
return "cpuacct"
}
func (s *CpuacctGroup) Apply(path string, d *cgroupData) error {
return join(path, d.pid)
func (s *CpuacctGroup) Apply(path string, _ *configs.Resources, pid int) error {
return apply(path, pid)
}
func (s *CpuacctGroup) Set(_ string, _ *configs.Resources) error {
@@ -85,45 +81,43 @@ func getCpuUsageBreakdown(path string) (uint64, uint64, error) {
const (
userField = "user"
systemField = "system"
file = cgroupCpuacctStat
)
// Expected format:
// user <usage in ticks>
// system <usage in ticks>
data, err := cgroups.ReadFile(path, cgroupCpuacctStat)
data, err := cgroups.ReadFile(path, file)
if err != nil {
return 0, 0, err
}
// TODO: use strings.SplitN instead.
fields := strings.Fields(data)
if len(fields) < 4 {
return 0, 0, fmt.Errorf("failure - %s is expected to have at least 4 fields", filepath.Join(path, cgroupCpuacctStat))
}
if fields[0] != userField {
return 0, 0, fmt.Errorf("unexpected field %q in %q, expected %q", fields[0], cgroupCpuacctStat, userField)
}
if fields[2] != systemField {
return 0, 0, fmt.Errorf("unexpected field %q in %q, expected %q", fields[2], cgroupCpuacctStat, systemField)
if len(fields) < 4 || fields[0] != userField || fields[2] != systemField {
return 0, 0, malformedLine(path, file, data)
}
if userModeUsage, err = strconv.ParseUint(fields[1], 10, 64); err != nil {
return 0, 0, err
return 0, 0, &parseError{Path: path, File: file, Err: err}
}
if kernelModeUsage, err = strconv.ParseUint(fields[3], 10, 64); err != nil {
return 0, 0, err
return 0, 0, &parseError{Path: path, File: file, Err: err}
}
return (userModeUsage * nanosecondsInSecond) / clockTicks, (kernelModeUsage * nanosecondsInSecond) / clockTicks, nil
}
func getPercpuUsage(path string) ([]uint64, error) {
const file = "cpuacct.usage_percpu"
percpuUsage := []uint64{}
data, err := cgroups.ReadFile(path, "cpuacct.usage_percpu")
data, err := cgroups.ReadFile(path, file)
if err != nil {
return percpuUsage, err
}
// TODO: use strings.SplitN instead.
for _, value := range strings.Fields(data) {
value, err := strconv.ParseUint(value, 10, 64)
if err != nil {
return percpuUsage, fmt.Errorf("Unable to convert param value to uint64: %s", err)
return percpuUsage, &parseError{Path: path, File: file, Err: err}
}
percpuUsage = append(percpuUsage, value)
}
@@ -133,16 +127,17 @@ func getPercpuUsage(path string) ([]uint64, error) {
func getPercpuUsageInModes(path string) ([]uint64, []uint64, error) {
usageKernelMode := []uint64{}
usageUserMode := []uint64{}
const file = cgroupCpuacctUsageAll
file, err := cgroups.OpenFile(path, cgroupCpuacctUsageAll, os.O_RDONLY)
fd, err := cgroups.OpenFile(path, file, os.O_RDONLY)
if os.IsNotExist(err) {
return usageKernelMode, usageUserMode, nil
} else if err != nil {
return nil, nil, err
}
defer file.Close()
defer fd.Close()
scanner := bufio.NewScanner(file)
scanner := bufio.NewScanner(fd)
scanner.Scan() // skipping header line
for scanner.Scan() {
@@ -153,19 +148,18 @@ func getPercpuUsageInModes(path string) ([]uint64, []uint64, error) {
usageInKernelMode, err := strconv.ParseUint(lineFields[kernelModeColumn], 10, 64)
if err != nil {
return nil, nil, fmt.Errorf("Unable to convert CPU usage in kernel mode to uint64: %s", err)
return nil, nil, &parseError{Path: path, File: file, Err: err}
}
usageKernelMode = append(usageKernelMode, usageInKernelMode)
usageInUserMode, err := strconv.ParseUint(lineFields[userModeColumn], 10, 64)
if err != nil {
return nil, nil, fmt.Errorf("Unable to convert CPU usage in user mode to uint64: %s", err)
return nil, nil, &parseError{Path: path, File: file, Err: err}
}
usageUserMode = append(usageUserMode, usageInUserMode)
}
if err := scanner.Err(); err != nil {
return nil, nil, fmt.Errorf("Problem in reading %s line by line, %s", cgroupCpuacctUsageAll, err)
return nil, nil, &parseError{Path: path, File: file, Err: err}
}
return usageKernelMode, usageUserMode, nil

View File

@@ -1,19 +1,17 @@
// +build linux
package fs
import (
"fmt"
"errors"
"os"
"path/filepath"
"strconv"
"strings"
"golang.org/x/sys/unix"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/pkg/errors"
"golang.org/x/sys/unix"
)
type CpusetGroup struct{}
@@ -22,8 +20,8 @@ func (s *CpusetGroup) Name() string {
return "cpuset"
}
func (s *CpusetGroup) Apply(path string, d *cgroupData) error {
return s.ApplyDir(path, d.config.Resources, d.pid)
func (s *CpusetGroup) Apply(path string, r *configs.Resources, pid int) error {
return s.ApplyDir(path, r, pid)
}
func (s *CpusetGroup) Set(path string, r *configs.Resources) error {
@@ -40,32 +38,32 @@ func (s *CpusetGroup) Set(path string, r *configs.Resources) error {
return nil
}
func getCpusetStat(path string, filename string) ([]uint16, error) {
func getCpusetStat(path string, file string) ([]uint16, error) {
var extracted []uint16
fileContent, err := fscommon.GetCgroupParamString(path, filename)
fileContent, err := fscommon.GetCgroupParamString(path, file)
if err != nil {
return extracted, err
}
if len(fileContent) == 0 {
return extracted, fmt.Errorf("%s found to be empty", filepath.Join(path, filename))
return extracted, &parseError{Path: path, File: file, Err: errors.New("empty file")}
}
for _, s := range strings.Split(fileContent, ",") {
splitted := strings.SplitN(s, "-", 3)
switch len(splitted) {
sp := strings.SplitN(s, "-", 3)
switch len(sp) {
case 3:
return extracted, fmt.Errorf("invalid values in %s", filepath.Join(path, filename))
return extracted, &parseError{Path: path, File: file, Err: errors.New("extra dash")}
case 2:
min, err := strconv.ParseUint(splitted[0], 10, 16)
min, err := strconv.ParseUint(sp[0], 10, 16)
if err != nil {
return extracted, err
return extracted, &parseError{Path: path, File: file, Err: err}
}
max, err := strconv.ParseUint(splitted[1], 10, 16)
max, err := strconv.ParseUint(sp[1], 10, 16)
if err != nil {
return extracted, err
return extracted, &parseError{Path: path, File: file, Err: err}
}
if min > max {
return extracted, fmt.Errorf("invalid values in %s", filepath.Join(path, filename))
return extracted, &parseError{Path: path, File: file, Err: errors.New("invalid values, min > max")}
}
for i := min; i <= max; i++ {
extracted = append(extracted, uint16(i))
@@ -73,7 +71,7 @@ func getCpusetStat(path string, filename string) ([]uint16, error) {
case 1:
value, err := strconv.ParseUint(s, 10, 16)
if err != nil {
return extracted, err
return extracted, &parseError{Path: path, File: file, Err: err}
}
extracted = append(extracted, uint16(value))
}
@@ -168,9 +166,8 @@ func (s *CpusetGroup) ApplyDir(dir string, r *configs.Resources, pid int) error
if err := s.ensureCpusAndMems(dir, r); err != nil {
return err
}
// because we are not using d.join we need to place the pid into the procs file
// unlike the other subsystems
// Since we are not using apply(), we need to place the pid
// into the procs file.
return cgroups.WriteCgroupProc(dir, pid)
}
@@ -198,7 +195,7 @@ func cpusetEnsureParent(current string) error {
}
// Treat non-existing directory as cgroupfs as it will be created,
// and the root cpuset directory obviously exists.
if err != nil && err != unix.ENOENT {
if err != nil && err != unix.ENOENT { //nolint:errorlint // unix errors are bare
return &os.PathError{Op: "statfs", Path: parent, Err: err}
}
@@ -224,12 +221,12 @@ func cpusetCopyIfNeeded(current, parent string) error {
}
if isEmptyCpuset(currentCpus) {
if err := cgroups.WriteFile(current, "cpuset.cpus", string(parentCpus)); err != nil {
if err := cgroups.WriteFile(current, "cpuset.cpus", parentCpus); err != nil {
return err
}
}
if isEmptyCpuset(currentMems) {
if err := cgroups.WriteFile(current, "cpuset.mems", string(parentMems)); err != nil {
if err := cgroups.WriteFile(current, "cpuset.mems", parentMems); err != nil {
return err
}
}

View File

@@ -1,5 +1,3 @@
// +build linux
package fs
import (
@@ -15,15 +13,15 @@ import (
)
type DevicesGroup struct {
testingSkipFinalCheck bool
TestingSkipFinalCheck bool
}
func (s *DevicesGroup) Name() string {
return "devices"
}
func (s *DevicesGroup) Apply(path string, d *cgroupData) error {
if d.config.SkipDevices {
func (s *DevicesGroup) Apply(path string, r *configs.Resources, pid int) error {
if r.SkipDevices {
return nil
}
if path == "" {
@@ -31,7 +29,8 @@ func (s *DevicesGroup) Apply(path string, d *cgroupData) error {
// is a hard requirement for container's security.
return errSubsystemDoesNotExist
}
return join(path, d.pid)
return apply(path, pid)
}
func loadEmulator(path string) (*cgroupdevices.Emulator, error) {
@@ -91,7 +90,7 @@ func (s *DevicesGroup) Set(path string, r *configs.Resources) error {
//
// This safety-check is skipped for the unit tests because we cannot
// currently mock devices.list correctly.
if !s.testingSkipFinalCheck {
if !s.TestingSkipFinalCheck {
currentAfter, err := loadEmulator(path)
if err != nil {
return err

View File

@@ -0,0 +1,15 @@
package fs
import (
"fmt"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
)
type parseError = fscommon.ParseError
// malformedLine is used by all cgroupfs file parsers that expect a line
// in a particular format but get some garbage instead.
func malformedLine(path, file, line string) error {
return &parseError{Path: path, File: file, Err: fmt.Errorf("malformed line: %s", line)}
}

View File

@@ -1,5 +1,3 @@
// +build linux
package fs
import (
@@ -21,8 +19,8 @@ func (s *FreezerGroup) Name() string {
return "freezer"
}
func (s *FreezerGroup) Apply(path string, d *cgroupData) error {
return join(path, d.pid)
func (s *FreezerGroup) Apply(path string, _ *configs.Resources, pid int) error {
return apply(path, pid)
}
func (s *FreezerGroup) Set(path string, r *configs.Resources) (Err error) {

View File

@@ -1,165 +1,86 @@
// +build linux
package fs
import (
"errors"
"fmt"
"os"
"path/filepath"
"sync"
"golang.org/x/sys/unix"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"github.com/opencontainers/runc/libcontainer/configs"
libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils"
"github.com/pkg/errors"
"golang.org/x/sys/unix"
)
var (
subsystems = []subsystem{
&CpusetGroup{},
&DevicesGroup{},
&MemoryGroup{},
&CpuGroup{},
&CpuacctGroup{},
&PidsGroup{},
&BlkioGroup{},
&HugetlbGroup{},
&NetClsGroup{},
&NetPrioGroup{},
&PerfEventGroup{},
&FreezerGroup{},
&NameGroup{GroupName: "name=systemd", Join: true},
}
HugePageSizes, _ = cgroups.GetHugePageSize()
)
var subsystems = []subsystem{
&CpusetGroup{},
&DevicesGroup{},
&MemoryGroup{},
&CpuGroup{},
&CpuacctGroup{},
&PidsGroup{},
&BlkioGroup{},
&HugetlbGroup{},
&NetClsGroup{},
&NetPrioGroup{},
&PerfEventGroup{},
&FreezerGroup{},
&RdmaGroup{},
&NameGroup{GroupName: "name=systemd", Join: true},
}
var errSubsystemDoesNotExist = errors.New("cgroup: subsystem does not exist")
func init() {
// If using cgroups-hybrid mode then add a "" controller indicating
// it should join the cgroups v2.
if cgroups.IsCgroup2HybridMode() {
subsystems = append(subsystems, &NameGroup{GroupName: "", Join: true})
}
}
type subsystem interface {
// Name returns the name of the subsystem.
Name() string
// Returns the stats, as 'stats', corresponding to the cgroup under 'path'.
// GetStats fills in the stats for the subsystem.
GetStats(path string, stats *cgroups.Stats) error
// Creates and joins the cgroup represented by 'cgroupData'.
Apply(path string, c *cgroupData) error
// Apply creates and joins a cgroup, adding pid into it. Some
// subsystems use resources to pre-configure the cgroup parents
// before creating or joining it.
Apply(path string, r *configs.Resources, pid int) error
// Set sets the cgroup resources.
Set(path string, r *configs.Resources) error
}
type manager struct {
mu sync.Mutex
cgroups *configs.Cgroup
rootless bool // ignore permission-related errors
paths map[string]string
mu sync.Mutex
cgroups *configs.Cgroup
paths map[string]string
}
func NewManager(cg *configs.Cgroup, paths map[string]string, rootless bool) cgroups.Manager {
func NewManager(cg *configs.Cgroup, paths map[string]string) (cgroups.Manager, error) {
// Some v1 controllers (cpu, cpuset, and devices) expect
// cgroups.Resources to not be nil in Apply.
if cg.Resources == nil {
return nil, errors.New("cgroup v1 manager needs configs.Resources to be set during manager creation")
}
if cg.Resources.Unified != nil {
return nil, cgroups.ErrV1NoUnified
}
if paths == nil {
var err error
paths, err = initPaths(cg)
if err != nil {
return nil, err
}
}
return &manager{
cgroups: cg,
paths: paths,
rootless: rootless,
}
}
// The absolute path to the root of the cgroup hierarchies.
var (
cgroupRootLock sync.Mutex
cgroupRoot string
)
const defaultCgroupRoot = "/sys/fs/cgroup"
func tryDefaultCgroupRoot() string {
var st, pst unix.Stat_t
// (1) it should be a directory...
err := unix.Lstat(defaultCgroupRoot, &st)
if err != nil || st.Mode&unix.S_IFDIR == 0 {
return ""
}
// (2) ... and a mount point ...
err = unix.Lstat(filepath.Dir(defaultCgroupRoot), &pst)
if err != nil {
return ""
}
if st.Dev == pst.Dev {
// parent dir has the same dev -- not a mount point
return ""
}
// (3) ... of 'tmpfs' fs type.
var fst unix.Statfs_t
err = unix.Statfs(defaultCgroupRoot, &fst)
if err != nil || fst.Type != unix.TMPFS_MAGIC {
return ""
}
// (4) it should have at least 1 entry ...
dir, err := os.Open(defaultCgroupRoot)
if err != nil {
return ""
}
names, err := dir.Readdirnames(1)
if err != nil {
return ""
}
if len(names) < 1 {
return ""
}
// ... which is a cgroup mount point.
err = unix.Statfs(filepath.Join(defaultCgroupRoot, names[0]), &fst)
if err != nil || fst.Type != unix.CGROUP_SUPER_MAGIC {
return ""
}
return defaultCgroupRoot
}
// Gets the cgroupRoot.
func getCgroupRoot() (string, error) {
cgroupRootLock.Lock()
defer cgroupRootLock.Unlock()
if cgroupRoot != "" {
return cgroupRoot, nil
}
// fast path
cgroupRoot = tryDefaultCgroupRoot()
if cgroupRoot != "" {
return cgroupRoot, nil
}
// slow path: parse mountinfo
mi, err := cgroups.GetCgroupMounts(false)
if err != nil {
return "", err
}
if len(mi) < 1 {
return "", errors.New("no cgroup mount found in mountinfo")
}
// Get the first cgroup mount (e.g. "/sys/fs/cgroup/memory"),
// use its parent directory.
root := filepath.Dir(mi[0].Mountpoint)
if _, err := os.Stat(root); err != nil {
return "", err
}
cgroupRoot = root
return cgroupRoot, nil
}
type cgroupData struct {
root string
innerPath string
config *configs.Cgroup
pid int
cgroups: cg,
paths: paths,
}, nil
}
// isIgnorableError returns whether err is a permission error (in the loose
@@ -171,8 +92,6 @@ func isIgnorableError(rootless bool, err error) bool {
if !rootless {
return false
}
// TODO: rm errors.Cause once we switch to %w everywhere
err = errors.Cause(err)
// Is it an ordinary EPERM?
if errors.Is(err, os.ErrPermission) {
return true
@@ -186,56 +105,30 @@ func isIgnorableError(rootless bool, err error) bool {
}
func (m *manager) Apply(pid int) (err error) {
if m.cgroups == nil {
return nil
}
m.mu.Lock()
defer m.mu.Unlock()
c := m.cgroups
if c.Resources.Unified != nil {
return cgroups.ErrV1NoUnified
}
m.paths = make(map[string]string)
if c.Paths != nil {
cgMap, err := cgroups.ParseCgroupFile("/proc/self/cgroup")
if err != nil {
return err
}
for name, path := range c.Paths {
// XXX(kolyshkin@): why this check is needed?
if _, ok := cgMap[name]; ok {
m.paths[name] = path
}
}
return cgroups.EnterPid(m.paths, pid)
}
d, err := getCgroupData(m.cgroups, pid)
if err != nil {
return err
}
for _, sys := range subsystems {
p, err := d.path(sys.Name())
if err != nil {
// The non-presence of the devices subsystem is
// considered fatal for security reasons.
if cgroups.IsNotFound(err) && (c.SkipDevices || sys.Name() != "devices") {
continue
}
return err
name := sys.Name()
p, ok := m.paths[name]
if !ok {
continue
}
m.paths[sys.Name()] = p
if err := sys.Apply(p, d); err != nil {
if err := sys.Apply(p, c.Resources, pid); err != nil {
// In the case of rootless (including euid=0 in userns), where an
// explicit cgroup path hasn't been set, we don't bail on error in
// case of permission problems. Cases where limits have been set
// (and we couldn't create our own cgroup) are handled by Set.
if isIgnorableError(m.rootless, err) && m.cgroups.Path == "" {
delete(m.paths, sys.Name())
// case of permission problems here, but do delete the path from
// the m.paths map, since it is either non-existent and could not
// be created, or the pid could not be added to it.
//
// Cases where limits for the subsystem have been set are handled
// later by Set, which fails with a friendly error (see
// if path == "" in Set).
if isIgnorableError(c.Rootless, err) && c.Path == "" {
delete(m.paths, name)
continue
}
return err
@@ -246,9 +139,6 @@ func (m *manager) Apply(pid int) (err error) {
}
func (m *manager) Destroy() error {
if m.cgroups == nil || m.cgroups.Paths != nil {
return nil
}
m.mu.Lock()
defer m.mu.Unlock()
return cgroups.RemovePaths(m.paths)
@@ -281,11 +171,6 @@ func (m *manager) Set(r *configs.Resources) error {
return nil
}
// If Paths are set, then we are just joining cgroups paths
// and there is no need to set any values.
if m.cgroups != nil && m.cgroups.Paths != nil {
return nil
}
if r.Unified != nil {
return cgroups.ErrV1NoUnified
}
@@ -295,10 +180,11 @@ func (m *manager) Set(r *configs.Resources) error {
for _, sys := range subsystems {
path := m.paths[sys.Name()]
if err := sys.Set(path, r); err != nil {
if m.rootless && sys.Name() == "devices" {
// When rootless is true, errors from the device subsystem
// are ignored, as it is really not expected to work.
if m.cgroups.Rootless && sys.Name() == "devices" {
continue
}
// When m.rootless is true, errors from the device subsystem are ignored because it is really not expected to work.
// However, errors from other subsystems are not ignored.
// see @test "runc create (rootless + limits + no cgrouppath + no permission) fails with informative error"
if path == "" {
@@ -317,7 +203,7 @@ func (m *manager) Set(r *configs.Resources) error {
// provided
func (m *manager) Freeze(state configs.FreezerState) error {
path := m.Path("freezer")
if m.cgroups == nil || path == "" {
if path == "" {
return errors.New("cannot toggle freezer: cgroups not configured for container")
}
@@ -339,68 +225,6 @@ func (m *manager) GetAllPids() ([]int, error) {
return cgroups.GetAllPids(m.Path("devices"))
}
func getCgroupData(c *configs.Cgroup, pid int) (*cgroupData, error) {
root, err := getCgroupRoot()
if err != nil {
return nil, err
}
if (c.Name != "" || c.Parent != "") && c.Path != "" {
return nil, errors.New("cgroup: either Path or Name and Parent should be used")
}
// XXX: Do not remove this code. Path safety is important! -- cyphar
cgPath := libcontainerUtils.CleanPath(c.Path)
cgParent := libcontainerUtils.CleanPath(c.Parent)
cgName := libcontainerUtils.CleanPath(c.Name)
innerPath := cgPath
if innerPath == "" {
innerPath = filepath.Join(cgParent, cgName)
}
return &cgroupData{
root: root,
innerPath: innerPath,
config: c,
pid: pid,
}, nil
}
func (raw *cgroupData) path(subsystem string) (string, error) {
// If the cgroup name/path is absolute do not look relative to the cgroup of the init process.
if filepath.IsAbs(raw.innerPath) {
mnt, err := cgroups.FindCgroupMountpoint(raw.root, subsystem)
// If we didn't mount the subsystem, there is no point we make the path.
if err != nil {
return "", err
}
// Sometimes subsystems can be mounted together as 'cpu,cpuacct'.
return filepath.Join(raw.root, filepath.Base(mnt), raw.innerPath), nil
}
// Use GetOwnCgroupPath instead of GetInitCgroupPath, because the creating
// process could in container and shared pid namespace with host, and
// /proc/1/cgroup could point to whole other world of cgroups.
parentPath, err := cgroups.GetOwnCgroupPath(subsystem)
if err != nil {
return "", err
}
return filepath.Join(parentPath, raw.innerPath), nil
}
func join(path string, pid int) error {
if path == "" {
return nil
}
if err := os.MkdirAll(path, 0o755); err != nil {
return err
}
return cgroups.WriteCgroupProc(path, pid)
}
func (m *manager) GetPaths() map[string]string {
m.mu.Lock()
defer m.mu.Unlock()
@@ -432,7 +256,7 @@ func OOMKillCount(path string) (uint64, error) {
func (m *manager) OOMKillCount() (uint64, error) {
c, err := OOMKillCount(m.Path("memory"))
// Ignore ENOENT when rootless as it couldn't create cgroup.
if err != nil && m.rootless && os.IsNotExist(err) {
if err != nil && m.cgroups.Rootless && os.IsNotExist(err) {
err = nil
}

View File

@@ -1,9 +1,6 @@
// +build linux
package fs
import (
"fmt"
"strconv"
"github.com/opencontainers/runc/libcontainer/cgroups"
@@ -17,8 +14,8 @@ func (s *HugetlbGroup) Name() string {
return "hugetlb"
}
func (s *HugetlbGroup) Apply(path string, d *cgroupData) error {
return join(path, d.pid)
func (s *HugetlbGroup) Apply(path string, _ *configs.Resources, pid int) error {
return apply(path, pid)
}
func (s *HugetlbGroup) Set(path string, r *configs.Resources) error {
@@ -32,29 +29,29 @@ func (s *HugetlbGroup) Set(path string, r *configs.Resources) error {
}
func (s *HugetlbGroup) GetStats(path string, stats *cgroups.Stats) error {
hugetlbStats := cgroups.HugetlbStats{}
if !cgroups.PathExists(path) {
return nil
}
for _, pageSize := range HugePageSizes {
hugetlbStats := cgroups.HugetlbStats{}
for _, pageSize := range cgroups.HugePageSizes() {
usage := "hugetlb." + pageSize + ".usage_in_bytes"
value, err := fscommon.GetCgroupParamUint(path, usage)
if err != nil {
return fmt.Errorf("failed to parse %s - %v", usage, err)
return err
}
hugetlbStats.Usage = value
maxUsage := "hugetlb." + pageSize + ".max_usage_in_bytes"
value, err = fscommon.GetCgroupParamUint(path, maxUsage)
if err != nil {
return fmt.Errorf("failed to parse %s - %v", maxUsage, err)
return err
}
hugetlbStats.MaxUsage = value
failcnt := "hugetlb." + pageSize + ".failcnt"
value, err = fscommon.GetCgroupParamUint(path, failcnt)
if err != nil {
return fmt.Errorf("failed to parse %s - %v", failcnt, err)
return err
}
hugetlbStats.Failcnt = value

View File

@@ -1,9 +1,8 @@
// +build linux
package fs
import (
"bufio"
"errors"
"fmt"
"math"
"os"
@@ -11,11 +10,11 @@ import (
"strconv"
"strings"
"golang.org/x/sys/unix"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/pkg/errors"
"golang.org/x/sys/unix"
)
const (
@@ -31,8 +30,8 @@ func (s *MemoryGroup) Name() string {
return "memory"
}
func (s *MemoryGroup) Apply(path string, d *cgroupData) (err error) {
return join(path, d.pid)
func (s *MemoryGroup) Apply(path string, _ *configs.Resources, pid int) error {
return apply(path, pid)
}
func setMemory(path string, val int64) error {
@@ -56,7 +55,7 @@ func setMemory(path string, val int64) error {
return err
}
return errors.Errorf("unable to set memory limit to %d (current usage: %d, peak usage: %d)", val, usage, max)
return fmt.Errorf("unable to set memory limit to %d (current usage: %d, peak usage: %d)", val, usage, max)
}
func setSwap(path string, val int64) error {
@@ -134,15 +133,15 @@ func (s *MemoryGroup) Set(path string, r *configs.Resources) error {
return err
}
} else {
return fmt.Errorf("invalid value:%d. valid memory swappiness range is 0-100", *r.MemorySwappiness)
return fmt.Errorf("invalid memory swappiness value: %d (valid range is 0-100)", *r.MemorySwappiness)
}
return nil
}
func (s *MemoryGroup) GetStats(path string, stats *cgroups.Stats) error {
// Set stats from memory.stat.
statsFile, err := cgroups.OpenFile(path, "memory.stat", os.O_RDONLY)
const file = "memory.stat"
statsFile, err := cgroups.OpenFile(path, file, os.O_RDONLY)
if err != nil {
if os.IsNotExist(err) {
return nil
@@ -155,7 +154,7 @@ func (s *MemoryGroup) GetStats(path string, stats *cgroups.Stats) error {
for sc.Scan() {
t, v, err := fscommon.ParseKeyValue(sc.Text())
if err != nil {
return fmt.Errorf("failed to parse memory.stat (%q) - %v", sc.Text(), err)
return &parseError{Path: path, File: file, Err: err}
}
stats.MemoryStats.Stats[t] = v
}
@@ -220,42 +219,42 @@ func getMemoryData(path, name string) (cgroups.MemoryData, error) {
// are optional in the kernel.
return cgroups.MemoryData{}, nil
}
return cgroups.MemoryData{}, fmt.Errorf("failed to parse %s - %v", usage, err)
return cgroups.MemoryData{}, err
}
memoryData.Usage = value
value, err = fscommon.GetCgroupParamUint(path, maxUsage)
if err != nil {
return cgroups.MemoryData{}, fmt.Errorf("failed to parse %s - %v", maxUsage, err)
return cgroups.MemoryData{}, err
}
memoryData.MaxUsage = value
value, err = fscommon.GetCgroupParamUint(path, failcnt)
if err != nil {
return cgroups.MemoryData{}, fmt.Errorf("failed to parse %s - %v", failcnt, err)
return cgroups.MemoryData{}, err
}
memoryData.Failcnt = value
value, err = fscommon.GetCgroupParamUint(path, limit)
if err != nil {
return cgroups.MemoryData{}, fmt.Errorf("failed to parse %s - %v", limit, err)
return cgroups.MemoryData{}, err
}
memoryData.Limit = value
return memoryData, nil
}
func getPageUsageByNUMA(cgroupPath string) (cgroups.PageUsageByNUMA, error) {
func getPageUsageByNUMA(path string) (cgroups.PageUsageByNUMA, error) {
const (
maxColumns = math.MaxUint8 + 1
filename = "memory.numa_stat"
file = "memory.numa_stat"
)
stats := cgroups.PageUsageByNUMA{}
file, err := cgroups.OpenFile(cgroupPath, filename, os.O_RDONLY)
fd, err := cgroups.OpenFile(path, file, os.O_RDONLY)
if os.IsNotExist(err) {
return stats, nil
} else if err != nil {
return stats, err
}
defer file.Close()
defer fd.Close()
// File format is documented in linux/Documentation/cgroup-v1/memory.txt
// and it looks like this:
@@ -266,7 +265,7 @@ func getPageUsageByNUMA(cgroupPath string) (cgroups.PageUsageByNUMA, error) {
// unevictable=<total anon pages> N0=<node 0 pages> N1=<node 1 pages> ...
// hierarchical_<counter>=<counter pages> N0=<node 0 pages> N1=<node 1 pages> ...
scanner := bufio.NewScanner(file)
scanner := bufio.NewScanner(fd)
for scanner.Scan() {
var field *cgroups.PageStats
@@ -284,8 +283,7 @@ func getPageUsageByNUMA(cgroupPath string) (cgroups.PageUsageByNUMA, error) {
} else {
// The first column was already validated,
// so be strict to the rest.
return stats, fmt.Errorf("malformed line %q in %s",
line, filename)
return stats, malformedLine(path, file, line)
}
}
key, val := byNode[0], byNode[1]
@@ -296,24 +294,23 @@ func getPageUsageByNUMA(cgroupPath string) (cgroups.PageUsageByNUMA, error) {
}
field.Total, err = strconv.ParseUint(val, 0, 64)
if err != nil {
return stats, err
return stats, &parseError{Path: path, File: file, Err: err}
}
field.Nodes = map[uint8]uint64{}
} else { // Subsequent columns: key is N<id>, val is usage.
if len(key) < 2 || key[0] != 'N' {
// This is definitely an error.
return stats, fmt.Errorf("malformed line %q in %s",
line, filename)
return stats, malformedLine(path, file, line)
}
n, err := strconv.ParseUint(key[1:], 10, 8)
if err != nil {
return cgroups.PageUsageByNUMA{}, err
return stats, &parseError{Path: path, File: file, Err: err}
}
usage, err := strconv.ParseUint(val, 10, 64)
if err != nil {
return cgroups.PageUsageByNUMA{}, err
return stats, &parseError{Path: path, File: file, Err: err}
}
field.Nodes[uint8(n)] = usage
@@ -321,9 +318,8 @@ func getPageUsageByNUMA(cgroupPath string) (cgroups.PageUsageByNUMA, error) {
}
}
err = scanner.Err()
if err != nil {
return cgroups.PageUsageByNUMA{}, err
if err := scanner.Err(); err != nil {
return cgroups.PageUsageByNUMA{}, &parseError{Path: path, File: file, Err: err}
}
return stats, nil

View File

@@ -1,5 +1,3 @@
// +build linux
package fs
import (
@@ -16,10 +14,10 @@ func (s *NameGroup) Name() string {
return s.GroupName
}
func (s *NameGroup) Apply(path string, d *cgroupData) error {
func (s *NameGroup) Apply(path string, _ *configs.Resources, pid int) error {
if s.Join {
// ignore errors if the named cgroup does not exist
_ = join(path, d.pid)
// Ignore errors if the named cgroup does not exist.
_ = apply(path, pid)
}
return nil
}

View File

@@ -1,5 +1,3 @@
// +build linux
package fs
import (
@@ -15,8 +13,8 @@ func (s *NetClsGroup) Name() string {
return "net_cls"
}
func (s *NetClsGroup) Apply(path string, d *cgroupData) error {
return join(path, d.pid)
func (s *NetClsGroup) Apply(path string, _ *configs.Resources, pid int) error {
return apply(path, pid)
}
func (s *NetClsGroup) Set(path string, r *configs.Resources) error {

View File

@@ -1,5 +1,3 @@
// +build linux
package fs
import (
@@ -13,8 +11,8 @@ func (s *NetPrioGroup) Name() string {
return "net_prio"
}
func (s *NetPrioGroup) Apply(path string, d *cgroupData) error {
return join(path, d.pid)
func (s *NetPrioGroup) Apply(path string, _ *configs.Resources, pid int) error {
return apply(path, pid)
}
func (s *NetPrioGroup) Set(path string, r *configs.Resources) error {

View File

@@ -0,0 +1,186 @@
package fs
import (
"errors"
"os"
"path/filepath"
"sync"
"golang.org/x/sys/unix"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/utils"
)
// The absolute path to the root of the cgroup hierarchies.
var (
cgroupRootLock sync.Mutex
cgroupRoot string
)
const defaultCgroupRoot = "/sys/fs/cgroup"
func initPaths(cg *configs.Cgroup) (map[string]string, error) {
root, err := rootPath()
if err != nil {
return nil, err
}
inner, err := innerPath(cg)
if err != nil {
return nil, err
}
paths := make(map[string]string)
for _, sys := range subsystems {
name := sys.Name()
path, err := subsysPath(root, inner, name)
if err != nil {
// The non-presence of the devices subsystem
// is considered fatal for security reasons.
if cgroups.IsNotFound(err) && (cg.SkipDevices || name != "devices") {
continue
}
return nil, err
}
paths[name] = path
}
return paths, nil
}
func tryDefaultCgroupRoot() string {
var st, pst unix.Stat_t
// (1) it should be a directory...
err := unix.Lstat(defaultCgroupRoot, &st)
if err != nil || st.Mode&unix.S_IFDIR == 0 {
return ""
}
// (2) ... and a mount point ...
err = unix.Lstat(filepath.Dir(defaultCgroupRoot), &pst)
if err != nil {
return ""
}
if st.Dev == pst.Dev {
// parent dir has the same dev -- not a mount point
return ""
}
// (3) ... of 'tmpfs' fs type.
var fst unix.Statfs_t
err = unix.Statfs(defaultCgroupRoot, &fst)
if err != nil || fst.Type != unix.TMPFS_MAGIC {
return ""
}
// (4) it should have at least 1 entry ...
dir, err := os.Open(defaultCgroupRoot)
if err != nil {
return ""
}
names, err := dir.Readdirnames(1)
if err != nil {
return ""
}
if len(names) < 1 {
return ""
}
// ... which is a cgroup mount point.
err = unix.Statfs(filepath.Join(defaultCgroupRoot, names[0]), &fst)
if err != nil || fst.Type != unix.CGROUP_SUPER_MAGIC {
return ""
}
return defaultCgroupRoot
}
// rootPath finds and returns path to the root of the cgroup hierarchies.
func rootPath() (string, error) {
cgroupRootLock.Lock()
defer cgroupRootLock.Unlock()
if cgroupRoot != "" {
return cgroupRoot, nil
}
// fast path
cgroupRoot = tryDefaultCgroupRoot()
if cgroupRoot != "" {
return cgroupRoot, nil
}
// slow path: parse mountinfo
mi, err := cgroups.GetCgroupMounts(false)
if err != nil {
return "", err
}
if len(mi) < 1 {
return "", errors.New("no cgroup mount found in mountinfo")
}
// Get the first cgroup mount (e.g. "/sys/fs/cgroup/memory"),
// use its parent directory.
root := filepath.Dir(mi[0].Mountpoint)
if _, err := os.Stat(root); err != nil {
return "", err
}
cgroupRoot = root
return cgroupRoot, nil
}
func innerPath(c *configs.Cgroup) (string, error) {
if (c.Name != "" || c.Parent != "") && c.Path != "" {
return "", errors.New("cgroup: either Path or Name and Parent should be used")
}
// XXX: Do not remove CleanPath. Path safety is important! -- cyphar
innerPath := utils.CleanPath(c.Path)
if innerPath == "" {
cgParent := utils.CleanPath(c.Parent)
cgName := utils.CleanPath(c.Name)
innerPath = filepath.Join(cgParent, cgName)
}
return innerPath, nil
}
func subsysPath(root, inner, subsystem string) (string, error) {
// If the cgroup name/path is absolute do not look relative to the cgroup of the init process.
if filepath.IsAbs(inner) {
mnt, err := cgroups.FindCgroupMountpoint(root, subsystem)
// If we didn't mount the subsystem, there is no point we make the path.
if err != nil {
return "", err
}
// Sometimes subsystems can be mounted together as 'cpu,cpuacct'.
return filepath.Join(root, filepath.Base(mnt), inner), nil
}
// Use GetOwnCgroupPath instead of GetInitCgroupPath, because the creating
// process could in container and shared pid namespace with host, and
// /proc/1/cgroup could point to whole other world of cgroups.
parentPath, err := cgroups.GetOwnCgroupPath(subsystem)
if err != nil {
return "", err
}
return filepath.Join(parentPath, inner), nil
}
func apply(path string, pid int) error {
if path == "" {
return nil
}
if err := os.MkdirAll(path, 0o755); err != nil {
return err
}
return cgroups.WriteCgroupProc(path, pid)
}

View File

@@ -1,5 +1,3 @@
// +build linux
package fs
import (
@@ -13,8 +11,8 @@ func (s *PerfEventGroup) Name() string {
return "perf_event"
}
func (s *PerfEventGroup) Apply(path string, d *cgroupData) error {
return join(path, d.pid)
func (s *PerfEventGroup) Apply(path string, _ *configs.Resources, pid int) error {
return apply(path, pid)
}
func (s *PerfEventGroup) Set(_ string, _ *configs.Resources) error {

View File

@@ -1,10 +1,7 @@
// +build linux
package fs
import (
"fmt"
"path/filepath"
"math"
"strconv"
"github.com/opencontainers/runc/libcontainer/cgroups"
@@ -18,8 +15,8 @@ func (s *PidsGroup) Name() string {
return "pids"
}
func (s *PidsGroup) Apply(path string, d *cgroupData) error {
return join(path, d.pid)
func (s *PidsGroup) Apply(path string, _ *configs.Resources, pid int) error {
return apply(path, pid)
}
func (s *PidsGroup) Set(path string, r *configs.Resources) error {
@@ -45,21 +42,18 @@ func (s *PidsGroup) GetStats(path string, stats *cgroups.Stats) error {
}
current, err := fscommon.GetCgroupParamUint(path, "pids.current")
if err != nil {
return fmt.Errorf("failed to parse pids.current - %s", err)
return err
}
maxString, err := fscommon.GetCgroupParamString(path, "pids.max")
max, err := fscommon.GetCgroupParamUint(path, "pids.max")
if err != nil {
return fmt.Errorf("failed to parse pids.max - %s", err)
return err
}
// Default if pids.max == "max" is 0 -- which represents "no limit".
var max uint64
if maxString != "max" {
max, err = fscommon.ParseUint(maxString, 10, 64)
if err != nil {
return fmt.Errorf("failed to parse pids.max - unable to parse %q as a uint from Cgroup file %q", maxString, filepath.Join(path, "pids.max"))
}
// If no limit is set, read from pids.max returns "max", which is
// converted to MaxUint64 by GetCgroupParamUint. Historically, we
// represent "no limit" for pids as 0, thus this conversion.
if max == math.MaxUint64 {
max = 0
}
stats.PidsStats.Current = current

View File

@@ -0,0 +1,25 @@
package fs
import (
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"github.com/opencontainers/runc/libcontainer/configs"
)
type RdmaGroup struct{}
func (s *RdmaGroup) Name() string {
return "rdma"
}
func (s *RdmaGroup) Apply(path string, _ *configs.Resources, pid int) error {
return apply(path, pid)
}
func (s *RdmaGroup) Set(path string, r *configs.Resources) error {
return fscommon.RdmaSet(path, r)
}
func (s *RdmaGroup) GetStats(path string, stats *cgroups.Stats) error {
return fscommon.RdmaGetStats(path, stats)
}

View File

@@ -1,3 +0,0 @@
// +build !linux
package fs

View File

@@ -1,5 +1,3 @@
// +build linux
package fs2
import (
@@ -49,7 +47,8 @@ func setCpu(dirPath string, r *configs.Resources) error {
}
func statCpu(dirPath string, stats *cgroups.Stats) error {
f, err := cgroups.OpenFile(dirPath, "cpu.stat", os.O_RDONLY)
const file = "cpu.stat"
f, err := cgroups.OpenFile(dirPath, file, os.O_RDONLY)
if err != nil {
return err
}
@@ -59,7 +58,7 @@ func statCpu(dirPath string, stats *cgroups.Stats) error {
for sc.Scan() {
t, v, err := fscommon.ParseKeyValue(sc.Text())
if err != nil {
return err
return &parseError{Path: dirPath, File: file, Err: err}
}
switch t {
case "usage_usec":
@@ -81,5 +80,8 @@ func statCpu(dirPath string, stats *cgroups.Stats) error {
stats.CpuStats.ThrottlingData.ThrottledTime = v * 1000
}
}
if err := sc.Err(); err != nil {
return &parseError{Path: dirPath, File: file, Err: err}
}
return nil
}

View File

@@ -1,5 +1,3 @@
// +build linux
package fs2
import (

View File

@@ -18,41 +18,37 @@ package fs2
import (
"bufio"
"errors"
"fmt"
"io"
"os"
"path/filepath"
"strings"
"github.com/opencontainers/runc/libcontainer/configs"
libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils"
"github.com/pkg/errors"
"github.com/opencontainers/runc/libcontainer/utils"
)
const UnifiedMountpoint = "/sys/fs/cgroup"
func defaultDirPath(c *configs.Cgroup) (string, error) {
if (c.Name != "" || c.Parent != "") && c.Path != "" {
return "", errors.Errorf("cgroup: either Path or Name and Parent should be used, got %+v", c)
}
if len(c.Paths) != 0 {
// never set by specconv
return "", errors.Errorf("cgroup: Paths is unsupported, use Path, got %+v", c)
return "", fmt.Errorf("cgroup: either Path or Name and Parent should be used, got %+v", c)
}
// XXX: Do not remove this code. Path safety is important! -- cyphar
cgPath := libcontainerUtils.CleanPath(c.Path)
cgParent := libcontainerUtils.CleanPath(c.Parent)
cgName := libcontainerUtils.CleanPath(c.Name)
return _defaultDirPath(UnifiedMountpoint, cgPath, cgParent, cgName)
return _defaultDirPath(UnifiedMountpoint, c.Path, c.Parent, c.Name)
}
func _defaultDirPath(root, cgPath, cgParent, cgName string) (string, error) {
if (cgName != "" || cgParent != "") && cgPath != "" {
return "", errors.New("cgroup: either Path or Name and Parent should be used")
}
innerPath := cgPath
// XXX: Do not remove CleanPath. Path safety is important! -- cyphar
innerPath := utils.CleanPath(cgPath)
if innerPath == "" {
cgParent := utils.CleanPath(cgParent)
cgName := utils.CleanPath(cgName)
innerPath = filepath.Join(cgParent, cgName)
}
if filepath.IsAbs(innerPath) {
@@ -89,7 +85,7 @@ func parseCgroupFromReader(r io.Reader) (string, error) {
parts = strings.SplitN(text, ":", 3)
)
if len(parts) < 3 {
return "", errors.Errorf("invalid cgroup entry: %q", text)
return "", fmt.Errorf("invalid cgroup entry: %q", text)
}
// text is like "0::/user.slice/user-1001.slice/session-1.scope"
if parts[0] == "0" && parts[1] == "" {

View File

@@ -1,16 +1,15 @@
// +build linux
package fs2
import (
"fmt"
"golang.org/x/sys/unix"
"github.com/opencontainers/runc/libcontainer/cgroups/ebpf"
"github.com/opencontainers/runc/libcontainer/cgroups/ebpf/devicefilter"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/devices"
"github.com/opencontainers/runc/libcontainer/userns"
"github.com/pkg/errors"
"golang.org/x/sys/unix"
)
func isRWM(perms devices.Permissions) bool {
@@ -64,7 +63,7 @@ func setDevices(dirPath string, r *configs.Resources) error {
}
dirFD, err := unix.Open(dirPath, unix.O_DIRECTORY|unix.O_RDONLY, 0o600)
if err != nil {
return errors.Errorf("cannot get dir FD for %s", dirPath)
return fmt.Errorf("cannot get dir FD for %s", dirPath)
}
defer unix.Close(dirFD)
if _, err := ebpf.LoadAttachCgroupDeviceFilter(insts, license, dirFD); err != nil {

View File

@@ -1,19 +1,17 @@
// +build linux
package fs2
import (
"bufio"
stdErrors "errors"
"errors"
"fmt"
"os"
"strings"
"time"
"golang.org/x/sys/unix"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/pkg/errors"
"golang.org/x/sys/unix"
)
func setFreezer(dirPath string, state configs.FreezerState) error {
@@ -26,7 +24,7 @@ func setFreezer(dirPath string, state configs.FreezerState) error {
case configs.Thawed:
stateStr = "0"
default:
return errors.Errorf("invalid freezer state %q requested", state)
return fmt.Errorf("invalid freezer state %q requested", state)
}
fd, err := cgroups.OpenFile(dirPath, "cgroup.freeze", unix.O_RDWR)
@@ -37,7 +35,7 @@ func setFreezer(dirPath string, state configs.FreezerState) error {
if state != configs.Frozen {
return nil
}
return errors.Wrap(err, "freezer not supported")
return fmt.Errorf("freezer not supported: %w", err)
}
defer fd.Close()
@@ -48,7 +46,7 @@ func setFreezer(dirPath string, state configs.FreezerState) error {
if actualState, err := readFreezer(dirPath, fd); err != nil {
return err
} else if actualState != state {
return errors.Errorf(`expected "cgroup.freeze" to be in state %q but was in %q`, state, actualState)
return fmt.Errorf(`expected "cgroup.freeze" to be in state %q but was in %q`, state, actualState)
}
return nil
}
@@ -58,7 +56,7 @@ func getFreezer(dirPath string) (configs.FreezerState, error) {
if err != nil {
// If the kernel is too old, then we just treat the freezer as being in
// an "undefined" state.
if os.IsNotExist(err) || stdErrors.Is(err, unix.ENODEV) {
if os.IsNotExist(err) || errors.Is(err, unix.ENODEV) {
err = nil
}
return configs.Undefined, err
@@ -82,7 +80,7 @@ func readFreezer(dirPath string, fd *os.File) (configs.FreezerState, error) {
case "1\n":
return waitFrozen(dirPath)
default:
return configs.Undefined, errors.Errorf(`unknown "cgroup.freeze" state: %q`, state)
return configs.Undefined, fmt.Errorf(`unknown "cgroup.freeze" state: %q`, state)
}
}

View File

@@ -1,8 +1,7 @@
// +build linux
package fs2
import (
"errors"
"fmt"
"os"
"strings"
@@ -10,9 +9,10 @@ import (
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/pkg/errors"
)
type parseError = fscommon.ParseError
type manager struct {
config *configs.Cgroup
// dirPath is like "/sys/fs/cgroup/user.slice/user-1001.slice/session-1.scope"
@@ -20,16 +20,12 @@ type manager struct {
// controllers is content of "cgroup.controllers" file.
// excludes pseudo-controllers ("devices" and "freezer").
controllers map[string]struct{}
rootless bool
}
// NewManager creates a manager for cgroup v2 unified hierarchy.
// dirPath is like "/sys/fs/cgroup/user.slice/user-1001.slice/session-1.scope".
// If dirPath is empty, it is automatically set using config.
func NewManager(config *configs.Cgroup, dirPath string, rootless bool) (cgroups.Manager, error) {
if config == nil {
config = &configs.Cgroup{}
}
func NewManager(config *configs.Cgroup, dirPath string) (cgroups.Manager, error) {
if dirPath == "" {
var err error
dirPath, err = defaultDirPath(config)
@@ -39,9 +35,8 @@ func NewManager(config *configs.Cgroup, dirPath string, rootless bool) (cgroups.
}
m := &manager{
config: config,
dirPath: dirPath,
rootless: rootless,
config: config,
dirPath: dirPath,
}
return m, nil
}
@@ -53,7 +48,7 @@ func (m *manager) getControllers() error {
data, err := cgroups.ReadFile(m.dirPath, "cgroup.controllers")
if err != nil {
if m.rootless && m.config.Path == "" {
if m.config.Rootless && m.config.Path == "" {
return nil
}
return err
@@ -73,12 +68,12 @@ func (m *manager) Apply(pid int) error {
// - "runc create (no limits + no cgrouppath + no permission) succeeds"
// - "runc create (rootless + no limits + cgrouppath + no permission) fails with permission error"
// - "runc create (rootless + limits + no cgrouppath + no permission) fails with informative error"
if m.rootless {
if m.config.Rootless {
if m.config.Path == "" {
if blNeed, nErr := needAnyControllers(m.config.Resources); nErr == nil && !blNeed {
return nil
}
return errors.Wrap(err, "rootless needs no limits + no cgrouppath when no permission is granted for cgroups")
return fmt.Errorf("rootless needs no limits + no cgrouppath when no permission is granted for cgroups: %w", err)
}
}
return err
@@ -123,13 +118,20 @@ func (m *manager) GetStats() (*cgroups.Stats, error) {
if err := statHugeTlb(m.dirPath, st); err != nil && !os.IsNotExist(err) {
errs = append(errs, err)
}
if len(errs) > 0 && !m.rootless {
return st, errors.Errorf("error while statting cgroup v2: %+v", errs)
// rdma (since kernel 4.11)
if err := fscommon.RdmaGetStats(m.dirPath, st); err != nil && !os.IsNotExist(err) {
errs = append(errs, err)
}
if len(errs) > 0 && !m.config.Rootless {
return st, fmt.Errorf("error while statting cgroup v2: %+v", errs)
}
return st, nil
}
func (m *manager) Freeze(state configs.FreezerState) error {
if m.config.Resources == nil {
return errors.New("cannot toggle freezer: cgroups not configured for container")
}
if err := setFreezer(m.dirPath, state); err != nil {
return err
}
@@ -146,6 +148,9 @@ func (m *manager) Path(_ string) string {
}
func (m *manager) Set(r *configs.Resources) error {
if r == nil {
return nil
}
if err := m.getControllers(); err != nil {
return err
}
@@ -167,10 +172,10 @@ func (m *manager) Set(r *configs.Resources) error {
}
// devices (since kernel 4.15, pseudo-controller)
//
// When m.rootless is true, errors from the device subsystem are ignored because it is really not expected to work.
// When rootless is true, errors from the device subsystem are ignored because it is really not expected to work.
// However, errors from other subsystems are not ignored.
// see @test "runc create (rootless + limits + no cgrouppath + no permission) fails with informative error"
if err := setDevices(m.dirPath, r); err != nil && !m.rootless {
if err := setDevices(m.dirPath, r); err != nil && !m.config.Rootless {
return err
}
// cpuset (since kernel 5.0)
@@ -181,6 +186,10 @@ func (m *manager) Set(r *configs.Resources) error {
if err := setHugeTlb(m.dirPath, r); err != nil {
return err
}
// rdma (since kernel 4.11)
if err := fscommon.RdmaSet(m.dirPath, r); err != nil {
return err
}
// freezer (since kernel 5.2, pseudo-controller)
if err := setFreezer(m.dirPath, r.Freezer); err != nil {
return err
@@ -198,9 +207,8 @@ func (m *manager) setUnified(res map[string]string) error {
return fmt.Errorf("unified resource %q must be a file name (no slashes)", k)
}
if err := cgroups.WriteFile(m.dirPath, k, v); err != nil {
errC := errors.Cause(err)
// Check for both EPERM and ENOENT since O_CREAT is used by WriteFile.
if errors.Is(errC, os.ErrPermission) || errors.Is(errC, os.ErrNotExist) {
if errors.Is(err, os.ErrPermission) || errors.Is(err, os.ErrNotExist) {
// Check if a controller is available,
// to give more specific error if not.
sk := strings.SplitN(k, ".", 2)
@@ -212,7 +220,7 @@ func (m *manager) setUnified(res map[string]string) error {
return fmt.Errorf("unified resource %q can't be set: controller %q not available", k, c)
}
}
return errors.Wrapf(err, "can't set unified resource %q", k)
return fmt.Errorf("unable to set unified resource %q: %w", k, err)
}
}
@@ -243,7 +251,7 @@ func OOMKillCount(path string) (uint64, error) {
func (m *manager) OOMKillCount() (uint64, error) {
c, err := OOMKillCount(m.dirPath)
if err != nil && m.rootless && os.IsNotExist(err) {
if err != nil && m.config.Rootless && os.IsNotExist(err) {
err = nil
}

View File

@@ -1,12 +1,8 @@
// +build linux
package fs2
import (
"strconv"
"github.com/pkg/errors"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"github.com/opencontainers/runc/libcontainer/configs"
@@ -30,10 +26,8 @@ func setHugeTlb(dirPath string, r *configs.Resources) error {
}
func statHugeTlb(dirPath string, stats *cgroups.Stats) error {
hugePageSizes, _ := cgroups.GetHugePageSize()
hugetlbStats := cgroups.HugetlbStats{}
for _, pagesize := range hugePageSizes {
for _, pagesize := range cgroups.HugePageSizes() {
value, err := fscommon.GetCgroupParamUint(dirPath, "hugetlb."+pagesize+".current")
if err != nil {
return err
@@ -43,7 +37,7 @@ func statHugeTlb(dirPath string, stats *cgroups.Stats) error {
fileName := "hugetlb." + pagesize + ".events"
value, err = fscommon.GetValueByKey(dirPath, fileName, "max")
if err != nil {
return errors.Wrap(err, "failed to read stats")
return err
}
hugetlbStats.Failcnt = value

View File

@@ -1,5 +1,3 @@
// +build linux
package fs2
import (
@@ -117,13 +115,14 @@ func readCgroup2MapFile(dirPath string, name string) (map[string][]string, error
ret[parts[0]] = parts[1:]
}
if err := scanner.Err(); err != nil {
return nil, err
return nil, &parseError{Path: dirPath, File: name, Err: err}
}
return ret, nil
}
func statIo(dirPath string, stats *cgroups.Stats) error {
values, err := readCgroup2MapFile(dirPath, "io.stat")
const file = "io.stat"
values, err := readCgroup2MapFile(dirPath, file)
if err != nil {
return err
}
@@ -136,11 +135,11 @@ func statIo(dirPath string, stats *cgroups.Stats) error {
}
major, err := strconv.ParseUint(d[0], 10, 64)
if err != nil {
return err
return &parseError{Path: dirPath, File: file, Err: err}
}
minor, err := strconv.ParseUint(d[1], 10, 64)
if err != nil {
return err
return &parseError{Path: dirPath, File: file, Err: err}
}
for _, item := range v {
@@ -177,7 +176,7 @@ func statIo(dirPath string, stats *cgroups.Stats) error {
value, err := strconv.ParseUint(d[1], 10, 64)
if err != nil {
return err
return &parseError{Path: dirPath, File: file, Err: err}
}
entry := cgroups.BlkioStatEntry{

View File

@@ -1,19 +1,18 @@
// +build linux
package fs2
import (
"bufio"
"errors"
"math"
"os"
"strconv"
"strings"
"golang.org/x/sys/unix"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/pkg/errors"
"golang.org/x/sys/unix"
)
// numToStr converts an int64 value to a string for writing to a
@@ -75,8 +74,8 @@ func setMemory(dirPath string, r *configs.Resources) error {
}
func statMemory(dirPath string, stats *cgroups.Stats) error {
// Set stats from memory.stat.
statsFile, err := cgroups.OpenFile(dirPath, "memory.stat", os.O_RDONLY)
const file = "memory.stat"
statsFile, err := cgroups.OpenFile(dirPath, file, os.O_RDONLY)
if err != nil {
return err
}
@@ -86,10 +85,13 @@ func statMemory(dirPath string, stats *cgroups.Stats) error {
for sc.Scan() {
t, v, err := fscommon.ParseKeyValue(sc.Text())
if err != nil {
return errors.Wrapf(err, "failed to parse memory.stat (%q)", sc.Text())
return &parseError{Path: dirPath, File: file, Err: err}
}
stats.MemoryStats.Stats[t] = v
}
if err := sc.Err(); err != nil {
return &parseError{Path: dirPath, File: file, Err: err}
}
stats.MemoryStats.Cache = stats.MemoryStats.Stats["file"]
// Unlike cgroup v1 which has memory.use_hierarchy binary knob,
// cgroup v2 is always hierarchical.
@@ -139,13 +141,13 @@ func getMemoryDataV2(path, name string) (cgroups.MemoryData, error) {
// swapaccount=0 kernel boot parameter is given.
return cgroups.MemoryData{}, nil
}
return cgroups.MemoryData{}, errors.Wrapf(err, "failed to parse %s", usage)
return cgroups.MemoryData{}, err
}
memoryData.Usage = value
value, err = fscommon.GetCgroupParamUint(path, limit)
if err != nil {
return cgroups.MemoryData{}, errors.Wrapf(err, "failed to parse %s", limit)
return cgroups.MemoryData{}, err
}
memoryData.Limit = value
@@ -153,7 +155,8 @@ func getMemoryDataV2(path, name string) (cgroups.MemoryData, error) {
}
func statsFromMeminfo(stats *cgroups.Stats) error {
f, err := os.Open("/proc/meminfo")
const file = "/proc/meminfo"
f, err := os.Open(file)
if err != nil {
return err
}
@@ -190,7 +193,7 @@ func statsFromMeminfo(stats *cgroups.Stats) error {
vStr := strings.TrimSpace(strings.TrimSuffix(parts[1], " kB"))
*p, err = strconv.ParseUint(vStr, 10, 64)
if err != nil {
return errors.Wrap(err, "parsing /proc/meminfo "+k)
return &parseError{File: file, Err: errors.New("bad value for " + k)}
}
found++
@@ -199,8 +202,8 @@ func statsFromMeminfo(stats *cgroups.Stats) error {
break
}
}
if sc.Err() != nil {
return sc.Err()
if err := sc.Err(); err != nil {
return &parseError{Path: "", File: file, Err: err}
}
stats.MemoryStats.SwapUsage.Usage = (swap_total - swap_free) * 1024

View File

@@ -1,17 +1,16 @@
// +build linux
package fs2
import (
"errors"
"math"
"os"
"path/filepath"
"strings"
"golang.org/x/sys/unix"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/pkg/errors"
"golang.org/x/sys/unix"
)
func isPidsSet(r *configs.Resources) bool {
@@ -53,22 +52,18 @@ func statPids(dirPath string, stats *cgroups.Stats) error {
if os.IsNotExist(err) {
return statPidsFromCgroupProcs(dirPath, stats)
}
return errors.Wrap(err, "failed to parse pids.current")
return err
}
maxString, err := fscommon.GetCgroupParamString(dirPath, "pids.max")
max, err := fscommon.GetCgroupParamUint(dirPath, "pids.max")
if err != nil {
return errors.Wrap(err, "failed to parse pids.max")
return err
}
// Default if pids.max == "max" is 0 -- which represents "no limit".
var max uint64
if maxString != "max" {
max, err = fscommon.ParseUint(maxString, 10, 64)
if err != nil {
return errors.Wrapf(err, "failed to parse pids.max - unable to parse %q as a uint from Cgroup file %q",
maxString, filepath.Join(dirPath, "pids.max"))
}
// If no limit is set, read from pids.max returns "max", which is
// converted to MaxUint64 by GetCgroupParamUint. Historically, we
// represent "no limit" for pids as 0, thus this conversion.
if max == math.MaxUint64 {
max = 0
}
stats.PidsStats.Current = current

View File

@@ -0,0 +1,121 @@
package fscommon
import (
"bufio"
"errors"
"math"
"os"
"strconv"
"strings"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
"golang.org/x/sys/unix"
)
// parseRdmaKV parses raw string to RdmaEntry.
func parseRdmaKV(raw string, entry *cgroups.RdmaEntry) error {
var value uint32
parts := strings.SplitN(raw, "=", 3)
if len(parts) != 2 {
return errors.New("Unable to parse RDMA entry")
}
k, v := parts[0], parts[1]
if v == "max" {
value = math.MaxUint32
} else {
val64, err := strconv.ParseUint(v, 10, 32)
if err != nil {
return err
}
value = uint32(val64)
}
if k == "hca_handle" {
entry.HcaHandles = value
} else if k == "hca_object" {
entry.HcaObjects = value
}
return nil
}
// readRdmaEntries reads and converts array of rawstrings to RdmaEntries from file.
// example entry: mlx4_0 hca_handle=2 hca_object=2000
func readRdmaEntries(dir, file string) ([]cgroups.RdmaEntry, error) {
rdmaEntries := make([]cgroups.RdmaEntry, 0)
fd, err := cgroups.OpenFile(dir, file, unix.O_RDONLY)
if err != nil {
return nil, err
}
defer fd.Close() //nolint:errorlint
scanner := bufio.NewScanner(fd)
for scanner.Scan() {
parts := strings.SplitN(scanner.Text(), " ", 4)
if len(parts) == 3 {
entry := new(cgroups.RdmaEntry)
entry.Device = parts[0]
err = parseRdmaKV(parts[1], entry)
if err != nil {
continue
}
err = parseRdmaKV(parts[2], entry)
if err != nil {
continue
}
rdmaEntries = append(rdmaEntries, *entry)
}
}
return rdmaEntries, scanner.Err()
}
// RdmaGetStats returns rdma stats such as totalLimit and current entries.
func RdmaGetStats(path string, stats *cgroups.Stats) error {
currentEntries, err := readRdmaEntries(path, "rdma.current")
if err != nil {
if errors.Is(err, os.ErrNotExist) {
err = nil
}
return err
}
maxEntries, err := readRdmaEntries(path, "rdma.max")
if err != nil {
return err
}
// If device got removed between reading two files, ignore returning stats.
if len(currentEntries) != len(maxEntries) {
return nil
}
stats.RdmaStats = cgroups.RdmaStats{
RdmaLimit: maxEntries,
RdmaCurrent: currentEntries,
}
return nil
}
func createCmdString(device string, limits configs.LinuxRdma) string {
cmdString := device
if limits.HcaHandles != nil {
cmdString += " hca_handle=" + strconv.FormatUint(uint64(*limits.HcaHandles), 10)
}
if limits.HcaObjects != nil {
cmdString += " hca_object=" + strconv.FormatUint(uint64(*limits.HcaObjects), 10)
}
return cmdString
}
// RdmaSet sets RDMA resources.
func RdmaSet(path string, r *configs.Resources) error {
for device, limits := range r.Rdma {
if err := cgroups.WriteFile(path, "rdma.max", createCmdString(device, limits)); err != nil {
return err
}
}
return nil
}

View File

@@ -1,11 +1,10 @@
// +build linux
package fscommon
import (
"errors"
"fmt"
"math"
"path"
"strconv"
"strings"
@@ -13,8 +12,6 @@ import (
)
var (
ErrNotValidFormat = errors.New("line is not a valid key value format")
// Deprecated: use cgroups.OpenFile instead.
OpenFile = cgroups.OpenFile
// Deprecated: use cgroups.ReadFile instead.
@@ -23,6 +20,19 @@ var (
WriteFile = cgroups.WriteFile
)
// ParseError records a parse error details, including the file path.
type ParseError struct {
Path string
File string
Err error
}
func (e *ParseError) Error() string {
return "unable to parse " + path.Join(e.Path, e.File) + ": " + e.Err.Error()
}
func (e *ParseError) Unwrap() error { return e.Err }
// ParseUint converts a string to an uint64 integer.
// Negative values are returned at zero as, due to kernel bugs,
// some of the memory cgroup stats can be negative.
@@ -34,7 +44,7 @@ func ParseUint(s string, base, bitSize int) (uint64, error) {
// 2. Handle negative values lesser than MinInt64
if intErr == nil && intValue < 0 {
return 0, nil
} else if intErr != nil && intErr.(*strconv.NumError).Err == strconv.ErrRange && intValue < 0 {
} else if errors.Is(intErr, strconv.ErrRange) && intValue < 0 {
return 0, nil
}
@@ -56,7 +66,7 @@ func ParseKeyValue(t string) (string, uint64, error) {
value, err := ParseUint(parts[1], 10, 64)
if err != nil {
return "", 0, fmt.Errorf("unable to convert to uint64: %v", err)
return "", 0, err
}
return parts[0], value, nil
@@ -71,11 +81,15 @@ func GetValueByKey(path, file, key string) (uint64, error) {
return 0, err
}
lines := strings.Split(string(content), "\n")
lines := strings.Split(content, "\n")
for _, line := range lines {
arr := strings.Split(line, " ")
if len(arr) == 2 && arr[0] == key {
return ParseUint(arr[1], 10, 64)
val, err := ParseUint(arr[1], 10, 64)
if err != nil {
err = &ParseError{Path: path, File: file, Err: err}
}
return val, err
}
}
@@ -96,7 +110,7 @@ func GetCgroupParamUint(path, file string) (uint64, error) {
res, err := ParseUint(contents, 10, 64)
if err != nil {
return res, fmt.Errorf("unable to parse file %q", path+"/"+file)
return res, &ParseError{Path: path, File: file, Err: err}
}
return res, nil
}
@@ -115,7 +129,7 @@ func GetCgroupParamInt(path, file string) (int64, error) {
res, err := strconv.ParseInt(contents, 10, 64)
if err != nil {
return res, fmt.Errorf("unable to parse %q as a int from Cgroup file %q", contents, path+"/"+file)
return res, &ParseError{Path: path, File: file, Err: err}
}
return res, nil
}

View File

@@ -0,0 +1,27 @@
package cgroups
import (
"io/fs"
"path/filepath"
)
// GetAllPids returns all pids from the cgroup identified by path, and all its
// sub-cgroups.
func GetAllPids(path string) ([]int, error) {
var pids []int
err := filepath.WalkDir(path, func(p string, d fs.DirEntry, iErr error) error {
if iErr != nil {
return iErr
}
if !d.IsDir() {
return nil
}
cPids, err := readProcsFile(p)
if err != nil {
return err
}
pids = append(pids, cPids...)
return nil
})
return pids, err
}

View File

@@ -0,0 +1,78 @@
package manager
import (
"errors"
"fmt"
"path/filepath"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fs"
"github.com/opencontainers/runc/libcontainer/cgroups/fs2"
"github.com/opencontainers/runc/libcontainer/cgroups/systemd"
"github.com/opencontainers/runc/libcontainer/configs"
)
// New returns the instance of a cgroup manager, which is chosen
// based on the local environment (whether cgroup v1 or v2 is used)
// and the config (whether config.Systemd is set or not).
func New(config *configs.Cgroup) (cgroups.Manager, error) {
return NewWithPaths(config, nil)
}
// NewWithPaths is similar to New, and can be used in case cgroup paths
// are already well known, which can save some resources.
//
// For cgroup v1, the keys are controller/subsystem name, and the values
// are absolute filesystem paths to the appropriate cgroups.
//
// For cgroup v2, the only key allowed is "" (empty string), and the value
// is the unified cgroup path.
func NewWithPaths(config *configs.Cgroup, paths map[string]string) (cgroups.Manager, error) {
if config == nil {
return nil, errors.New("cgroups/manager.New: config must not be nil")
}
if config.Systemd && !systemd.IsRunningSystemd() {
return nil, errors.New("systemd not running on this host, cannot use systemd cgroups manager")
}
// Cgroup v2 aka unified hierarchy.
if cgroups.IsCgroup2UnifiedMode() {
path, err := getUnifiedPath(paths)
if err != nil {
return nil, fmt.Errorf("manager.NewWithPaths: inconsistent paths: %w", err)
}
if config.Systemd {
return systemd.NewUnifiedManager(config, path)
}
return fs2.NewManager(config, path)
}
// Cgroup v1.
if config.Systemd {
return systemd.NewLegacyManager(config, paths)
}
return fs.NewManager(config, paths)
}
// getUnifiedPath is an implementation detail of libcontainer factory.
// Historically, it saves cgroup paths as per-subsystem path map (as returned
// by cm.GetPaths(""), but with v2 we only have one single unified path
// (with "" as a key).
//
// This function converts from that map to string (using "" as a key),
// and also checks that the map itself is sane.
func getUnifiedPath(paths map[string]string) (string, error) {
if len(paths) > 1 {
return "", fmt.Errorf("expected a single path, got %+v", paths)
}
path := paths[""]
// can be empty
if path != "" {
if filepath.Clean(path) != path || !filepath.IsAbs(path) {
return "", fmt.Errorf("invalid path: %q", path)
}
}
return path, nil
}

View File

@@ -1,5 +1,3 @@
// +build linux
package cgroups
type ThrottlingData struct {
@@ -126,7 +124,7 @@ type BlkioStatEntry struct {
}
type BlkioStats struct {
// number of bytes tranferred to and from the block device
// number of bytes transferred to and from the block device
IoServiceBytesRecursive []BlkioStatEntry `json:"io_service_bytes_recursive,omitempty"`
IoServicedRecursive []BlkioStatEntry `json:"io_serviced_recursive,omitempty"`
IoQueuedRecursive []BlkioStatEntry `json:"io_queue_recursive,omitempty"`
@@ -146,6 +144,17 @@ type HugetlbStats struct {
Failcnt uint64 `json:"failcnt"`
}
type RdmaEntry struct {
Device string `json:"device,omitempty"`
HcaHandles uint32 `json:"hca_handles,omitempty"`
HcaObjects uint32 `json:"hca_objects,omitempty"`
}
type RdmaStats struct {
RdmaLimit []RdmaEntry `json:"rdma_limit,omitempty"`
RdmaCurrent []RdmaEntry `json:"rdma_current,omitempty"`
}
type Stats struct {
CpuStats CpuStats `json:"cpu_stats,omitempty"`
CPUSetStats CPUSetStats `json:"cpuset_stats,omitempty"`
@@ -154,6 +163,7 @@ type Stats struct {
BlkioStats BlkioStats `json:"blkio_stats,omitempty"`
// the map is in the format "size of hugepage: stats of the hugepage"
HugetlbStats map[string]HugetlbStats `json:"hugetlb_stats,omitempty"`
RdmaStats RdmaStats `json:"rdma_stats,omitempty"`
}
func NewStats() *Stats {

View File

@@ -3,6 +3,7 @@ package systemd
import (
"bufio"
"context"
"errors"
"fmt"
"math"
"os"
@@ -14,11 +15,11 @@ import (
systemdDbus "github.com/coreos/go-systemd/v22/dbus"
dbus "github.com/godbus/dbus/v5"
"github.com/sirupsen/logrus"
cgroupdevices "github.com/opencontainers/runc/libcontainer/cgroups/devices"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/devices"
"github.com/pkg/errors"
"github.com/sirupsen/logrus"
)
const (
@@ -92,7 +93,7 @@ func groupPrefix(ruleType devices.Type) (string, error) {
case devices.CharDevice:
return "char-", nil
default:
return "", errors.Errorf("device type %v has no group prefix", ruleType)
return "", fmt.Errorf("device type %v has no group prefix", ruleType)
}
}
@@ -142,9 +143,9 @@ func findDeviceGroup(ruleType devices.Type, ruleMajor int64) (string, error) {
)
if n, err := fmt.Sscanf(line, "%d %s", &currMajor, &currName); err != nil || n != 2 {
if err == nil {
err = errors.Errorf("wrong number of fields")
err = errors.New("wrong number of fields")
}
return "", errors.Wrapf(err, "scan /proc/devices line %q", line)
return "", fmt.Errorf("scan /proc/devices line %q: %w", line, err)
}
if currMajor == ruleMajor {
@@ -152,7 +153,7 @@ func findDeviceGroup(ruleType devices.Type, ruleMajor int64) (string, error) {
}
}
if err := scanner.Err(); err != nil {
return "", errors.Wrap(err, "reading /proc/devices")
return "", fmt.Errorf("reading /proc/devices: %w", err)
}
// Couldn't find the device group.
return "", nil
@@ -192,12 +193,12 @@ func generateDeviceProperties(r *configs.Resources) ([]systemdDbus.Property, err
configEmu := &cgroupdevices.Emulator{}
for _, rule := range r.Devices {
if err := configEmu.Apply(*rule); err != nil {
return nil, errors.Wrap(err, "apply rule for systemd")
return nil, fmt.Errorf("unable to apply rule for systemd: %w", err)
}
}
// systemd doesn't support blacklists. So we log a warning, and tell
// systemd to act as a deny-all whitelist. This ruleset will be replaced
// with our normal fallback code. This may result in spurrious errors, but
// with our normal fallback code. This may result in spurious errors, but
// the only other option is to error out here.
if configEmu.IsBlacklist() {
// However, if we're dealing with an allow-all rule then we can do it.
@@ -213,19 +214,19 @@ func generateDeviceProperties(r *configs.Resources) ([]systemdDbus.Property, err
// whitelist which is the default for devices.Emulator.
finalRules, err := configEmu.Rules()
if err != nil {
return nil, errors.Wrap(err, "get simplified rules for systemd")
return nil, fmt.Errorf("unable to get simplified rules for systemd: %w", err)
}
var deviceAllowList []deviceAllowEntry
for _, rule := range finalRules {
if !rule.Allow {
// Should never happen.
return nil, errors.Errorf("[internal error] cannot add deny rule to systemd DeviceAllow list: %v", *rule)
return nil, fmt.Errorf("[internal error] cannot add deny rule to systemd DeviceAllow list: %v", *rule)
}
switch rule.Type {
case devices.BlockDevice, devices.CharDevice:
default:
// Should never happen.
return nil, errors.Errorf("invalid device type for DeviceAllow: %v", rule.Type)
return nil, fmt.Errorf("invalid device type for DeviceAllow: %v", rule.Type)
}
entry := deviceAllowEntry{
@@ -271,7 +272,7 @@ func generateDeviceProperties(r *configs.Resources) ([]systemdDbus.Property, err
// "_ n:* _" rules require a device group from /proc/devices.
group, err := findDeviceGroup(rule.Type, rule.Major)
if err != nil {
return nil, errors.Wrapf(err, "find device '%v/%d'", rule.Type, rule.Major)
return nil, fmt.Errorf("unable to find device '%v/%d': %w", rule.Type, rule.Major, err)
}
if group == "" {
// Couldn't find a group.
@@ -350,7 +351,7 @@ func startUnit(cm *dbusConnManager, unitName string, properties []systemdDbus.Pr
// Please refer to https://pkg.go.dev/github.com/coreos/go-systemd/v22/dbus#Conn.StartUnit
if s != "done" {
resetFailedUnit(cm, unitName)
return errors.Errorf("error creating systemd unit `%s`: got `%s`", unitName, s)
return fmt.Errorf("error creating systemd unit `%s`: got `%s`", unitName, s)
}
case <-timeout.C:
resetFailedUnit(cm, unitName)
@@ -449,10 +450,13 @@ func systemdVersionAtoi(verStr string) (int, error) {
re := regexp.MustCompile(`v?([0-9]+)`)
matches := re.FindStringSubmatch(verStr)
if len(matches) < 2 {
return 0, errors.Errorf("can't parse version %s: incorrect number of matches %v", verStr, matches)
return 0, fmt.Errorf("can't parse version %s: incorrect number of matches %v", verStr, matches)
}
ver, err := strconv.Atoi(matches[1])
return ver, errors.Wrapf(err, "can't parse version %s", verStr)
if err != nil {
return -1, fmt.Errorf("can't parse version: %w", err)
}
return ver, nil
}
func addCpuQuota(cm *dbusConnManager, properties *[]systemdDbus.Property, quota int64, period uint64) {

View File

@@ -1,12 +1,10 @@
package systemd
import (
"encoding/binary"
"errors"
"math/big"
"strconv"
"strings"
"github.com/bits-and-blooms/bitset"
"github.com/pkg/errors"
)
// RangeToBits converts a text representation of a CPU mask (as written to
@@ -14,7 +12,7 @@ import (
// with the corresponding bits set (as consumed by systemd over dbus as
// AllowedCPUs/AllowedMemoryNodes unit property value).
func RangeToBits(str string) ([]byte, error) {
bits := &bitset.BitSet{}
bits := new(big.Int)
for _, r := range strings.Split(str, ",") {
// allow extra spaces around
@@ -36,32 +34,22 @@ func RangeToBits(str string) ([]byte, error) {
if start > end {
return nil, errors.New("invalid range: " + r)
}
for i := uint(start); i <= uint(end); i++ {
bits.Set(i)
for i := start; i <= end; i++ {
bits.SetBit(bits, int(i), 1)
}
} else {
val, err := strconv.ParseUint(ranges[0], 10, 32)
if err != nil {
return nil, err
}
bits.Set(uint(val))
bits.SetBit(bits, int(val), 1)
}
}
val := bits.Bytes()
if len(val) == 0 {
ret := bits.Bytes()
if len(ret) == 0 {
// do not allow empty values
return nil, errors.New("empty value")
}
ret := make([]byte, len(val)*8)
for i := range val {
// bitset uses BigEndian internally
binary.BigEndian.PutUint64(ret[i*8:], val[len(val)-1-i])
}
// remove upper all-zero bytes
for ret[0] == 0 {
ret = ret[1:]
}
return ret, nil
}

View File

@@ -1,5 +1,3 @@
// +build linux
package systemd
import (

View File

@@ -1,71 +0,0 @@
// +build !linux
package systemd
import (
"errors"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
)
type Manager struct {
Cgroups *configs.Cgroup
Paths map[string]string
}
func IsRunningSystemd() bool {
return false
}
func NewSystemdCgroupsManager() (func(config *configs.Cgroup, paths map[string]string) cgroups.Manager, error) {
return nil, errors.New("Systemd not supported")
}
func (m *Manager) Apply(pid int) error {
return errors.New("Systemd not supported")
}
func (m *Manager) GetPids() ([]int, error) {
return nil, errors.New("Systemd not supported")
}
func (m *Manager) GetAllPids() ([]int, error) {
return nil, errors.New("Systemd not supported")
}
func (m *Manager) Destroy() error {
return errors.New("Systemd not supported")
}
func (m *Manager) GetPaths() map[string]string {
return nil
}
func (m *Manager) Path(_ string) string {
return ""
}
func (m *Manager) GetStats() (*cgroups.Stats, error) {
return nil, errors.New("Systemd not supported")
}
func (m *Manager) Set(container *configs.Config) error {
return errors.New("Systemd not supported")
}
func (m *Manager) Freeze(state configs.FreezerState) error {
return errors.New("Systemd not supported")
}
func Freeze(c *configs.Cgroup, state configs.FreezerState) error {
return errors.New("Systemd not supported")
}
func (m *Manager) GetCgroups() (*configs.Cgroup, error) {
return nil, errors.New("Systemd not supported")
}
func (m *Manager) Exists() bool {
return false
}

View File

@@ -1,10 +1,10 @@
// +build linux
package systemd
import (
"bufio"
"bytes"
"errors"
"fmt"
"os"
"os/exec"
"path/filepath"
@@ -13,8 +13,8 @@ import (
systemdDbus "github.com/coreos/go-systemd/v22/dbus"
dbus "github.com/godbus/dbus/v5"
"github.com/opencontainers/runc/libcontainer/userns"
"github.com/pkg/errors"
)
// newUserSystemdDbus creates a connection for systemd user-instance.
@@ -31,17 +31,17 @@ func newUserSystemdDbus() (*systemdDbus.Conn, error) {
return systemdDbus.NewConnection(func() (*dbus.Conn, error) {
conn, err := dbus.Dial(addr)
if err != nil {
return nil, errors.Wrapf(err, "error while dialing %q", addr)
return nil, fmt.Errorf("error while dialing %q: %w", addr, err)
}
methods := []dbus.Auth{dbus.AuthExternal(strconv.Itoa(uid))}
err = conn.Auth(methods)
if err != nil {
conn.Close()
return nil, errors.Wrapf(err, "error while authenticating connection, address=%q, UID=%d", addr, uid)
return nil, fmt.Errorf("error while authenticating connection (address=%q, UID=%d): %w", addr, uid, err)
}
if err = conn.Hello(); err != nil {
conn.Close()
return nil, errors.Wrapf(err, "error while sending Hello message, address=%q, UID=%d", addr, uid)
return nil, fmt.Errorf("error while sending Hello message (address=%q, UID=%d): %w", addr, uid, err)
}
return conn, nil
})
@@ -57,7 +57,7 @@ func DetectUID() (int, error) {
}
b, err := exec.Command("busctl", "--user", "--no-pager", "status").CombinedOutput()
if err != nil {
return -1, errors.Wrapf(err, "could not execute `busctl --user --no-pager status`: %q", string(b))
return -1, fmt.Errorf("could not execute `busctl --user --no-pager status` (output: %q): %w", string(b), err)
}
scanner := bufio.NewScanner(bytes.NewReader(b))
for scanner.Scan() {
@@ -66,7 +66,7 @@ func DetectUID() (int, error) {
uidStr := strings.TrimPrefix(s, "OwnerUID=")
i, err := strconv.Atoi(uidStr)
if err != nil {
return -1, errors.Wrapf(err, "could not detect the OwnerUID: %s", s)
return -1, fmt.Errorf("could not detect the OwnerUID: %w", err)
}
return i, nil
}
@@ -93,7 +93,7 @@ func DetectUserDbusSessionBusAddress() (string, error) {
}
b, err := exec.Command("systemctl", "--user", "--no-pager", "show-environment").CombinedOutput()
if err != nil {
return "", errors.Wrapf(err, "could not execute `systemctl --user --no-pager show-environment`, output=%q", string(b))
return "", fmt.Errorf("could not execute `systemctl --user --no-pager show-environment` (output=%q): %w", string(b), err)
}
scanner := bufio.NewScanner(bytes.NewReader(b))
for scanner.Scan() {

View File

@@ -1,5 +1,3 @@
// +build linux
package systemd
import (
@@ -26,12 +24,25 @@ type legacyManager struct {
dbus *dbusConnManager
}
func NewLegacyManager(cg *configs.Cgroup, paths map[string]string) cgroups.Manager {
func NewLegacyManager(cg *configs.Cgroup, paths map[string]string) (cgroups.Manager, error) {
if cg.Rootless {
return nil, errors.New("cannot use rootless systemd cgroups manager on cgroup v1")
}
if cg.Resources != nil && cg.Resources.Unified != nil {
return nil, cgroups.ErrV1NoUnified
}
if paths == nil {
var err error
paths, err = initPaths(cg)
if err != nil {
return nil, err
}
}
return &legacyManager{
cgroups: cg,
paths: paths,
dbus: newDbusConnManager(false),
}
}, nil
}
type subsystem interface {
@@ -59,6 +70,7 @@ var legacySubsystems = []subsystem{
&fs.NetPrioGroup{},
&fs.NetClsGroup{},
&fs.NameGroup{GroupName: "name=systemd"},
&fs.RdmaGroup{},
}
func genV1ResourcesProperties(r *configs.Resources, cm *dbusConnManager) ([]systemdDbus.Property, error) {
@@ -100,6 +112,53 @@ func genV1ResourcesProperties(r *configs.Resources, cm *dbusConnManager) ([]syst
return properties, nil
}
// initPaths figures out and returns paths to cgroups.
func initPaths(c *configs.Cgroup) (map[string]string, error) {
slice := "system.slice"
if c.Parent != "" {
var err error
slice, err = ExpandSlice(c.Parent)
if err != nil {
return nil, err
}
}
unit := getUnitName(c)
paths := make(map[string]string)
for _, s := range legacySubsystems {
subsystemPath, err := getSubsystemPath(slice, unit, s.Name())
if err != nil {
// Even if it's `not found` error, we'll return err
// because devices cgroup is hard requirement for
// container security.
if s.Name() == "devices" {
return nil, err
}
// Don't fail if a cgroup hierarchy was not found, just skip this subsystem
if cgroups.IsNotFound(err) {
continue
}
return nil, err
}
paths[s.Name()] = subsystemPath
}
// If systemd is using cgroups-hybrid mode then add the slice path of
// this container to the paths so the following process executed with
// "runc exec" joins that cgroup as well.
if cgroups.IsCgroup2HybridMode() {
// "" means cgroup-hybrid path
cgroupsHybridPath, err := getSubsystemPath(slice, unit, "")
if err != nil && cgroups.IsNotFound(err) {
return nil, err
}
paths[""] = cgroupsHybridPath
}
return paths, nil
}
func (m *legacyManager) Apply(pid int) error {
var (
c = m.cgroups
@@ -108,27 +167,8 @@ func (m *legacyManager) Apply(pid int) error {
properties []systemdDbus.Property
)
if c.Resources.Unified != nil {
return cgroups.ErrV1NoUnified
}
m.mu.Lock()
defer m.mu.Unlock()
if c.Paths != nil {
paths := make(map[string]string)
cgMap, err := cgroups.ParseCgroupFile("/proc/self/cgroup")
if err != nil {
return err
}
// XXX(kolyshkin@): why this check is needed?
for name, path := range c.Paths {
if _, ok := cgMap[name]; ok {
paths[name] = path
}
}
m.paths = paths
return cgroups.EnterPid(m.paths, pid)
}
if c.Parent != "" {
slice = c.Parent
@@ -136,12 +176,14 @@ func (m *legacyManager) Apply(pid int) error {
properties = append(properties, systemdDbus.PropDescription("libcontainer container "+c.Name))
// if we create a slice, the parent is defined via a Wants=
if strings.HasSuffix(unitName, ".slice") {
// If we create a slice, the parent is defined via a Wants=.
properties = append(properties, systemdDbus.PropWants(slice))
} else {
// otherwise, we use Slice=
// Otherwise it's a scope, which we put into a Slice=.
properties = append(properties, systemdDbus.PropSlice(slice))
// Assume scopes always support delegation (supported since systemd v218).
properties = append(properties, newProp("Delegate", true))
}
// only add pid if its valid, -1 is used w/ general slice creation.
@@ -149,12 +191,6 @@ func (m *legacyManager) Apply(pid int) error {
properties = append(properties, newProp("PIDs", []uint32{uint32(pid)}))
}
// Check if we can delegate. This is only supported on systemd versions 218 and above.
if !strings.HasSuffix(unitName, ".slice") {
// Assume scopes always support delegation.
properties = append(properties, newProp("Delegate", true))
}
// Always enable accounting, this gets us the same behaviour as the fs implementation,
// plus the kernel has some problems with joining the memory cgroup at a later time.
properties = append(properties,
@@ -174,26 +210,6 @@ func (m *legacyManager) Apply(pid int) error {
return err
}
paths := make(map[string]string)
for _, s := range legacySubsystems {
subsystemPath, err := getSubsystemPath(m.cgroups, s.Name())
if err != nil {
// Even if it's `not found` error, we'll return err
// because devices cgroup is hard requirement for
// container security.
if s.Name() == "devices" {
return err
}
// Don't fail if a cgroup hierarchy was not found, just skip this subsystem
if cgroups.IsNotFound(err) {
continue
}
return err
}
paths[s.Name()] = subsystemPath
}
m.paths = paths
if err := m.joinCgroups(pid); err != nil {
return err
}
@@ -202,9 +218,6 @@ func (m *legacyManager) Apply(pid int) error {
}
func (m *legacyManager) Destroy() error {
if m.cgroups.Paths != nil {
return nil
}
m.mu.Lock()
defer m.mu.Unlock()
@@ -254,7 +267,7 @@ func (m *legacyManager) joinCgroups(pid int) error {
return nil
}
func getSubsystemPath(c *configs.Cgroup, subsystem string) (string, error) {
func getSubsystemPath(slice, unit, subsystem string) (string, error) {
mountpoint, err := cgroups.FindCgroupMountpoint("", subsystem)
if err != nil {
return "", err
@@ -267,17 +280,7 @@ func getSubsystemPath(c *configs.Cgroup, subsystem string) (string, error) {
// if pid 1 is systemd 226 or later, it will be in init.scope, not the root
initPath = strings.TrimSuffix(filepath.Clean(initPath), "init.scope")
slice := "system.slice"
if c.Parent != "" {
slice = c.Parent
}
slice, err = ExpandSlice(slice)
if err != nil {
return "", err
}
return filepath.Join(mountpoint, initPath, slice, getUnitName(c)), nil
return filepath.Join(mountpoint, initPath, slice, unit), nil
}
func (m *legacyManager) Freeze(state configs.FreezerState) error {
@@ -399,9 +402,7 @@ func (m *legacyManager) freezeBeforeSet(unitName string, r *configs.Resources) (
}
func (m *legacyManager) Set(r *configs.Resources) error {
// If Paths are set, then we are just joining cgroups paths
// and there is no need to set any values.
if m.cgroups.Paths != nil {
if r == nil {
return nil
}
if r.Unified != nil {

View File

@@ -1,10 +1,10 @@
// +build linux
package systemd
import (
"bufio"
"fmt"
"math"
"os"
"path/filepath"
"strconv"
"strings"
@@ -12,29 +12,39 @@ import (
systemdDbus "github.com/coreos/go-systemd/v22/dbus"
securejoin "github.com/cyphar/filepath-securejoin"
"github.com/sirupsen/logrus"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fs2"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/pkg/errors"
"github.com/sirupsen/logrus"
)
type unifiedManager struct {
mu sync.Mutex
cgroups *configs.Cgroup
// path is like "/sys/fs/cgroup/user.slice/user-1001.slice/session-1.scope"
path string
rootless bool
dbus *dbusConnManager
path string
dbus *dbusConnManager
fsMgr cgroups.Manager
}
func NewUnifiedManager(config *configs.Cgroup, path string, rootless bool) cgroups.Manager {
return &unifiedManager{
cgroups: config,
path: path,
rootless: rootless,
dbus: newDbusConnManager(rootless),
func NewUnifiedManager(config *configs.Cgroup, path string) (cgroups.Manager, error) {
m := &unifiedManager{
cgroups: config,
path: path,
dbus: newDbusConnManager(config.Rootless),
}
if err := m.initPath(); err != nil {
return nil, err
}
fsMgr, err := fs2.NewManager(config, m.path)
if err != nil {
return nil, err
}
m.fsMgr = fsMgr
return m, nil
}
// unifiedResToSystemdProps tries to convert from Cgroup.Resources.Unified
@@ -233,12 +243,8 @@ func (m *unifiedManager) Apply(pid int) error {
properties []systemdDbus.Property
)
if c.Paths != nil {
return cgroups.WriteCgroupProc(m.path, pid)
}
slice := "system.slice"
if m.rootless {
if m.cgroups.Rootless {
slice = "user.slice"
}
if c.Parent != "" {
@@ -247,12 +253,14 @@ func (m *unifiedManager) Apply(pid int) error {
properties = append(properties, systemdDbus.PropDescription("libcontainer container "+c.Name))
// if we create a slice, the parent is defined via a Wants=
if strings.HasSuffix(unitName, ".slice") {
// If we create a slice, the parent is defined via a Wants=.
properties = append(properties, systemdDbus.PropWants(slice))
} else {
// otherwise, we use Slice=
// Otherwise it's a scope, which we put into a Slice=.
properties = append(properties, systemdDbus.PropSlice(slice))
// Assume scopes always support delegation (supported since systemd v218).
properties = append(properties, newProp("Delegate", true))
}
// only add pid if its valid, -1 is used w/ general slice creation.
@@ -260,12 +268,6 @@ func (m *unifiedManager) Apply(pid int) error {
properties = append(properties, newProp("PIDs", []uint32{uint32(pid)}))
}
// Check if we can delegate. This is only supported on systemd versions 218 and above.
if !strings.HasSuffix(unitName, ".slice") {
// Assume scopes always support delegation.
properties = append(properties, newProp("Delegate", true))
}
// Always enable accounting, this gets us the same behaviour as the fs implementation,
// plus the kernel has some problems with joining the memory cgroup at a later time.
properties = append(properties,
@@ -282,22 +284,53 @@ func (m *unifiedManager) Apply(pid int) error {
properties = append(properties, c.SystemdProps...)
if err := startUnit(m.dbus, unitName, properties); err != nil {
return errors.Wrapf(err, "error while starting unit %q with properties %+v", unitName, properties)
return fmt.Errorf("unable to start unit %q (properties %+v): %w", unitName, properties, err)
}
if err := m.initPath(); err != nil {
return err
}
if err := fs2.CreateCgroupPath(m.path, m.cgroups); err != nil {
return err
}
if c.OwnerUID != nil {
filesToChown, err := cgroupFilesToChown()
if err != nil {
return err
}
for _, v := range filesToChown {
err := os.Chown(m.path+"/"+v, *c.OwnerUID, -1)
if err != nil {
return err
}
}
}
return nil
}
func (m *unifiedManager) Destroy() error {
if m.cgroups.Paths != nil {
return nil
// The kernel exposes a list of files that should be chowned to the delegate
// uid in /sys/kernel/cgroup/delegate. If the file is not present
// (Linux < 4.15), use the initial values mentioned in cgroups(7).
func cgroupFilesToChown() ([]string, error) {
filesToChown := []string{"."} // the directory itself must be chowned
const cgroupDelegateFile = "/sys/kernel/cgroup/delegate"
f, err := os.Open(cgroupDelegateFile)
if err == nil {
defer f.Close()
scanner := bufio.NewScanner(f)
for scanner.Scan() {
filesToChown = append(filesToChown, scanner.Text())
}
if err := scanner.Err(); err != nil {
return nil, fmt.Errorf("error reading %s: %w", cgroupDelegateFile, err)
}
} else {
filesToChown = append(filesToChown, "cgroup.procs", "cgroup.subtree_control", "cgroup.threads")
}
return filesToChown, nil
}
func (m *unifiedManager) Destroy() error {
m.mu.Lock()
defer m.mu.Unlock()
@@ -307,8 +340,8 @@ func (m *unifiedManager) Destroy() error {
}
// systemd 239 do not remove sub-cgroups.
err := cgroups.RemovePath(m.path)
// cgroups.RemovePath has handled ErrNotExist
err := m.fsMgr.Destroy()
// fsMgr.Destroy has handled ErrNotExist
if err != nil {
return err
}
@@ -317,7 +350,6 @@ func (m *unifiedManager) Destroy() error {
}
func (m *unifiedManager) Path(_ string) string {
_ = m.initPath()
return m.path
}
@@ -326,7 +358,7 @@ func (m *unifiedManager) Path(_ string) string {
func (m *unifiedManager) getSliceFull() (string, error) {
c := m.cgroups
slice := "system.slice"
if m.rootless {
if c.Rootless {
slice = "user.slice"
}
if c.Parent != "" {
@@ -337,7 +369,7 @@ func (m *unifiedManager) getSliceFull() (string, error) {
}
}
if m.rootless {
if c.Rootless {
// managerCG is typically "/user.slice/user-${uid}.slice/user@${uid}.service".
managerCG, err := getManagerProperty(m.dbus, "ControlGroup")
if err != nil {
@@ -375,58 +407,36 @@ func (m *unifiedManager) initPath() error {
return nil
}
func (m *unifiedManager) fsManager() (cgroups.Manager, error) {
if err := m.initPath(); err != nil {
return nil, err
}
return fs2.NewManager(m.cgroups, m.path, m.rootless)
}
func (m *unifiedManager) Freeze(state configs.FreezerState) error {
fsMgr, err := m.fsManager()
if err != nil {
return err
}
return fsMgr.Freeze(state)
return m.fsMgr.Freeze(state)
}
func (m *unifiedManager) GetPids() ([]int, error) {
if err := m.initPath(); err != nil {
return nil, err
}
return cgroups.GetPids(m.path)
}
func (m *unifiedManager) GetAllPids() ([]int, error) {
if err := m.initPath(); err != nil {
return nil, err
}
return cgroups.GetAllPids(m.path)
}
func (m *unifiedManager) GetStats() (*cgroups.Stats, error) {
fsMgr, err := m.fsManager()
if err != nil {
return nil, err
}
return fsMgr.GetStats()
return m.fsMgr.GetStats()
}
func (m *unifiedManager) Set(r *configs.Resources) error {
if r == nil {
return nil
}
properties, err := genV2ResourcesProperties(r, m.dbus)
if err != nil {
return err
}
if err := setUnitProperties(m.dbus, getUnitName(m.cgroups), properties...); err != nil {
return errors.Wrap(err, "error while setting unit properties")
return fmt.Errorf("unable to set unit properties: %w", err)
}
fsMgr, err := m.fsManager()
if err != nil {
return err
}
return fsMgr.Set(r)
return m.fsMgr.Set(r)
}
func (m *unifiedManager) GetPaths() map[string]string {
@@ -440,11 +450,7 @@ func (m *unifiedManager) GetCgroups() (*configs.Cgroup, error) {
}
func (m *unifiedManager) GetFreezerState() (configs.FreezerState, error) {
fsMgr, err := m.fsManager()
if err != nil {
return configs.Undefined, err
}
return fsMgr.GetFreezerState()
return m.fsMgr.GetFreezerState()
}
func (m *unifiedManager) Exists() bool {
@@ -452,9 +458,5 @@ func (m *unifiedManager) Exists() bool {
}
func (m *unifiedManager) OOMKillCount() (uint64, error) {
fsMgr, err := m.fsManager()
if err != nil {
return 0, err
}
return fsMgr.OOMKillCount()
return m.fsMgr.OOMKillCount()
}

View File

@@ -1,5 +1,3 @@
// +build linux
package cgroups
import (
@@ -7,7 +5,6 @@ import (
"errors"
"fmt"
"io"
"io/ioutil"
"os"
"path/filepath"
"strconv"
@@ -23,11 +20,14 @@ import (
const (
CgroupProcesses = "cgroup.procs"
unifiedMountpoint = "/sys/fs/cgroup"
hybridMountpoint = "/sys/fs/cgroup/unified"
)
var (
isUnifiedOnce sync.Once
isUnified bool
isHybridOnce sync.Once
isHybrid bool
)
// IsCgroup2UnifiedMode returns whether we are running in cgroup v2 unified mode.
@@ -49,6 +49,24 @@ func IsCgroup2UnifiedMode() bool {
return isUnified
}
// IsCgroup2HybridMode returns whether we are running in cgroup v2 hybrid mode.
func IsCgroup2HybridMode() bool {
isHybridOnce.Do(func() {
var st unix.Statfs_t
err := unix.Statfs(hybridMountpoint, &st)
if err != nil {
if os.IsNotExist(err) {
// ignore the "not found" error
isHybrid = false
return
}
panic(fmt.Sprintf("cannot statfs cgroup root: %s", err))
}
isHybrid = st.Type == unix.CGROUP2_SUPER_MAGIC
})
return isHybrid
}
type Mount struct {
Mountpoint string
Root string
@@ -118,8 +136,8 @@ func GetAllSubsystems() ([]string, error) {
return subsystems, nil
}
func readProcsFile(file string) ([]int, error) {
f, err := os.Open(file)
func readProcsFile(dir string) ([]int, error) {
f, err := OpenFile(dir, CgroupProcesses, os.O_RDONLY)
if err != nil {
return nil, err
}
@@ -210,7 +228,7 @@ func EnterPid(cgroupPaths map[string]string, pid int) error {
func rmdir(path string) error {
err := unix.Rmdir(path)
if err == nil || err == unix.ENOENT {
if err == nil || err == unix.ENOENT { //nolint:errorlint // unix errors are bare
return nil
}
return &os.PathError{Op: "rmdir", Path: path, Err: err}
@@ -224,7 +242,7 @@ func RemovePath(path string) error {
return nil
}
infos, err := ioutil.ReadDir(path)
infos, err := os.ReadDir(path)
if err != nil {
if os.IsNotExist(err) {
err = nil
@@ -284,40 +302,61 @@ func RemovePaths(paths map[string]string) (err error) {
return fmt.Errorf("Failed to remove paths: %v", paths)
}
func GetHugePageSize() ([]string, error) {
dir, err := os.OpenFile("/sys/kernel/mm/hugepages", unix.O_DIRECTORY|unix.O_RDONLY, 0)
if err != nil {
return nil, err
}
files, err := dir.Readdirnames(0)
dir.Close()
if err != nil {
return nil, err
}
var (
hugePageSizes []string
initHPSOnce sync.Once
)
return getHugePageSizeFromFilenames(files)
func HugePageSizes() []string {
initHPSOnce.Do(func() {
dir, err := os.OpenFile("/sys/kernel/mm/hugepages", unix.O_DIRECTORY|unix.O_RDONLY, 0)
if err != nil {
return
}
files, err := dir.Readdirnames(0)
dir.Close()
if err != nil {
return
}
hugePageSizes, err = getHugePageSizeFromFilenames(files)
if err != nil {
logrus.Warn("HugePageSizes: ", err)
}
})
return hugePageSizes
}
func getHugePageSizeFromFilenames(fileNames []string) ([]string, error) {
pageSizes := make([]string, 0, len(fileNames))
var warn error
for _, file := range fileNames {
// example: hugepages-1048576kB
val := strings.TrimPrefix(file, "hugepages-")
if len(val) == len(file) {
// unexpected file name: no prefix found
// Unexpected file name: no prefix found, ignore it.
continue
}
// The suffix is always "kB" (as of Linux 5.9)
// The suffix is always "kB" (as of Linux 5.13). If we find
// something else, produce an error but keep going.
eLen := len(val) - 2
val = strings.TrimSuffix(val, "kB")
if len(val) != eLen {
logrus.Warnf("GetHugePageSize: %s: invalid filename suffix (expected \"kB\")", file)
// Highly unlikely.
if warn == nil {
warn = errors.New(file + `: invalid suffix (expected "kB")`)
}
continue
}
size, err := strconv.Atoi(val)
if err != nil {
return nil, err
// Highly unlikely.
if warn == nil {
warn = fmt.Errorf("%s: %w", file, err)
}
continue
}
// Model after https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/mm/hugetlb_cgroup.c?id=eff48ddeab782e35e58ccc8853f7386bbae9dec4#n574
// but in our case the size is in KB already.
@@ -331,34 +370,12 @@ func getHugePageSizeFromFilenames(fileNames []string) ([]string, error) {
pageSizes = append(pageSizes, val)
}
return pageSizes, nil
return pageSizes, warn
}
// GetPids returns all pids, that were added to cgroup at path.
func GetPids(dir string) ([]int, error) {
return readProcsFile(filepath.Join(dir, CgroupProcesses))
}
// GetAllPids returns all pids, that were added to cgroup at path and to all its
// subcgroups.
func GetAllPids(path string) ([]int, error) {
var pids []int
// collect pids from all sub-cgroups
err := filepath.Walk(path, func(p string, info os.FileInfo, iErr error) error {
if iErr != nil {
return iErr
}
if info.IsDir() || info.Name() != CgroupProcesses {
return nil
}
cPids, err := readProcsFile(p)
if err != nil {
return err
}
pids = append(pids, cPids...)
return nil
})
return pids, err
return readProcsFile(dir)
}
// WriteCgroupProc writes the specified pid into the cgroup's cgroup.procs file
@@ -376,7 +393,7 @@ func WriteCgroupProc(dir string, pid int) error {
file, err := OpenFile(dir, CgroupProcesses, os.O_WRONLY)
if err != nil {
return fmt.Errorf("failed to write %v to %v: %v", pid, CgroupProcesses, err)
return fmt.Errorf("failed to write %v: %w", pid, err)
}
defer file.Close()
@@ -393,7 +410,7 @@ func WriteCgroupProc(dir string, pid int) error {
continue
}
return fmt.Errorf("failed to write %v to %v: %v", pid, CgroupProcesses, err)
return fmt.Errorf("failed to write %v: %w", pid, err)
}
return err
}
@@ -446,5 +463,5 @@ func ConvertBlkIOToIOWeightValue(blkIoWeight uint16) uint64 {
if blkIoWeight == 0 {
return 0
}
return uint64(1 + (uint64(blkIoWeight)-10)*9999/990)
return 1 + (uint64(blkIoWeight)-10)*9999/990
}

View File

@@ -46,11 +46,8 @@ func NewNotFoundError(sub string) error {
}
func IsNotFound(err error) bool {
if err == nil {
return false
}
_, ok := err.(*NotFoundError)
return ok
var nfErr *NotFoundError
return errors.As(err, &nfErr)
}
func tryDefaultPath(cgroupPath, subsystem string) string {
@@ -116,6 +113,11 @@ func FindCgroupMountpoint(cgroupPath, subsystem string) (string, error) {
return "", errUnified
}
// If subsystem is empty, we look for the cgroupv2 hybrid path.
if len(subsystem) == 0 {
return hybridMountpoint, nil
}
// Avoid parsing mountinfo by trying the default path first, if possible.
if path := tryDefaultPath(cgroupPath, subsystem); path != "" {
return path, nil
@@ -154,7 +156,7 @@ func findCgroupMountpointAndRootFromMI(mounts []*mountinfo.Info, cgroupPath, sub
func (m Mount) GetOwnCgroup(cgroups map[string]string) (string, error) {
if len(m.Subsystems) == 0 {
return "", fmt.Errorf("no subsystem for mount")
return "", errors.New("no subsystem for mount")
}
return getControllerPath(m.Subsystems[0], cgroups)
@@ -226,6 +228,11 @@ func GetOwnCgroupPath(subsystem string) (string, error) {
return "", err
}
// If subsystem is empty, we look for the cgroupv2 hybrid path.
if len(subsystem) == 0 {
return hybridMountpoint, nil
}
return getCgroupPathHelper(subsystem, cgroup)
}

View File

@@ -28,17 +28,26 @@ type Cgroup struct {
// ScopePrefix describes prefix for the scope name
ScopePrefix string `json:"scope_prefix"`
// Paths represent the absolute cgroups paths to join.
// This takes precedence over Path.
Paths map[string]string
// Resources contains various cgroups settings to apply
*Resources
// Systemd tells if systemd should be used to manage cgroups.
Systemd bool
// SystemdProps are any additional properties for systemd,
// derived from org.systemd.property.xxx annotations.
// Ignored unless systemd is used for managing cgroups.
SystemdProps []systemdDbus.Property `json:"-"`
// Rootless tells if rootless cgroups should be used.
Rootless bool
// The host UID that should own the cgroup, or nil to accept
// the default ownership. This should only be set when the
// cgroupfs is to be mounted read/write.
// Not all cgroup manager implementations support changing
// the ownership.
OwnerUID *int `json:"owner_uid,omitempty"`
}
type Resources struct {
@@ -117,6 +126,9 @@ type Resources struct {
// Set class identifier for container's network packets
NetClsClassid uint32 `json:"net_cls_classid_u"`
// Rdma resource restriction configuration
Rdma map[string]LinuxRdma `json:"rdma"`
// Used on cgroups v2:
// CpuWeight sets a proportional bandwidth limit.

View File

@@ -1,3 +1,4 @@
//go:build !linux
// +build !linux
package configs

View File

@@ -7,10 +7,10 @@ import (
"os/exec"
"time"
"github.com/sirupsen/logrus"
"github.com/opencontainers/runc/libcontainer/devices"
"github.com/opencontainers/runtime-spec/specs-go"
"github.com/pkg/errors"
"github.com/sirupsen/logrus"
)
type Rlimit struct {
@@ -31,10 +31,12 @@ type IDMap struct {
// for syscalls. Additional architectures can be added by specifying them in
// Architectures.
type Seccomp struct {
DefaultAction Action `json:"default_action"`
Architectures []string `json:"architectures"`
Syscalls []*Syscall `json:"syscalls"`
DefaultErrnoRet *uint `json:"default_errno_ret"`
DefaultAction Action `json:"default_action"`
Architectures []string `json:"architectures"`
Syscalls []*Syscall `json:"syscalls"`
DefaultErrnoRet *uint `json:"default_errno_ret"`
ListenerPath string `json:"listener_path,omitempty"`
ListenerMetadata string `json:"listener_metadata,omitempty"`
}
// Action is taken upon rule match in Seccomp
@@ -47,6 +49,9 @@ const (
Allow
Trace
Log
Notify
KillThread
KillProcess
)
// Operator is a comparison operator to be used when matching syscall arguments in Seccomp
@@ -246,6 +251,19 @@ const (
Poststop HookName = "poststop"
)
// KnownHookNames returns the known hook names.
// Used by `runc features`.
func KnownHookNames() []string {
return []string{
string(Prestart), // deprecated
string(CreateRuntime),
string(CreateContainer),
string(StartContainer),
string(Poststart),
string(Poststop),
}
}
type Capabilities struct {
// Bounding is the set of capabilities checked by the kernel.
Bounding []string
@@ -262,7 +280,7 @@ type Capabilities struct {
func (hooks HookList) RunHooks(state *specs.State) error {
for i, h := range hooks {
if err := h.Run(state); err != nil {
return errors.Wrapf(err, "Running hook #%d:", i)
return fmt.Errorf("error running hook #%d: %w", i, err)
}
}
@@ -375,7 +393,7 @@ func (c Command) Run(s *specs.State) error {
go func() {
err := cmd.Wait()
if err != nil {
err = fmt.Errorf("error running hook: %v, stdout: %s, stderr: %s", err, stdout.String(), stderr.String())
err = fmt.Errorf("error running hook: %w, stdout: %s, stderr: %s", err, stdout.String(), stderr.String())
}
errC <- err
}()

View File

@@ -1,17 +1,24 @@
package configs
import "fmt"
import "errors"
var (
errNoUIDMap = errors.New("User namespaces enabled, but no uid mappings found.")
errNoUserMap = errors.New("User namespaces enabled, but no user mapping found.")
errNoGIDMap = errors.New("User namespaces enabled, but no gid mappings found.")
errNoGroupMap = errors.New("User namespaces enabled, but no group mapping found.")
)
// HostUID gets the translated uid for the process on host which could be
// different when user namespaces are enabled.
func (c Config) HostUID(containerId int) (int, error) {
if c.Namespaces.Contains(NEWUSER) {
if c.UidMappings == nil {
return -1, fmt.Errorf("User namespaces enabled, but no uid mappings found.")
return -1, errNoUIDMap
}
id, found := c.hostIDFromMapping(containerId, c.UidMappings)
if !found {
return -1, fmt.Errorf("User namespaces enabled, but no user mapping found.")
return -1, errNoUserMap
}
return id, nil
}
@@ -30,11 +37,11 @@ func (c Config) HostRootUID() (int, error) {
func (c Config) HostGID(containerId int) (int, error) {
if c.Namespaces.Contains(NEWUSER) {
if c.GidMappings == nil {
return -1, fmt.Errorf("User namespaces enabled, but no gid mappings found.")
return -1, errNoGIDMap
}
id, found := c.hostIDFromMapping(containerId, c.GidMappings)
if !found {
return -1, fmt.Errorf("User namespaces enabled, but no group mapping found.")
return -1, errNoGroupMap
}
return id, nil
}

View File

@@ -1,3 +1,4 @@
//go:build gofuzz
// +build gofuzz
package configs

View File

@@ -1,6 +1,9 @@
package configs
type IntelRdt struct {
// The identity for RDT Class of Service
ClosID string `json:"closID,omitempty"`
// The schema for L3 cache id and capacity bitmask (CBM)
// Format: "L3:<cache_id0>=<cbm0>;<cache_id1>=<cbm1>;..."
L3CacheSchema string `json:"l3_cache_schema,omitempty"`

View File

@@ -1,5 +1,7 @@
package configs
import "golang.org/x/sys/unix"
const (
// EXT_COPYUP is a directive to copy up the contents of a directory when
// a tmpfs is mounted over it.
@@ -28,6 +30,9 @@ type Mount struct {
// Relabel source if set, "z" indicates shared, "Z" indicates unshared.
Relabel string `json:"relabel"`
// RecAttr represents mount properties to be applied recursively (AT_RECURSIVE), see mount_setattr(2).
RecAttr *unix.MountAttr `json:"rec_attr"`
// Extensions are additional flags that are specific to runc.
Extensions int `json:"extensions"`
@@ -37,3 +42,7 @@ type Mount struct {
// Optional Command to be run after Source is mounted.
PostmountCmds []Command `json:"postmount_cmds"`
}
func (m *Mount) IsBind() bool {
return m.Flags&unix.MS_BIND != 0
}

View File

@@ -1,3 +1,4 @@
//go:build linux
// +build linux
package configs

View File

@@ -1,3 +1,4 @@
//go:build !linux && !windows
// +build !linux,!windows
package configs

View File

@@ -1,3 +1,4 @@
//go:build !linux
// +build !linux
package configs

View File

@@ -0,0 +1,9 @@
package configs
// LinuxRdma for Linux cgroup 'rdma' resource management (Linux 4.11)
type LinuxRdma struct {
// Maximum number of HCA handles that can be opened. Default is "no limit".
HcaHandles *uint32 `json:"hca_handles,omitempty"`
// Maximum number of HCA objects that can be created. Default is "no limit".
HcaObjects *uint32 `json:"hca_objects,omitempty"`
}

View File

@@ -52,7 +52,7 @@ func (v *ConfigValidator) Validate(config *configs.Config) error {
}
for _, c := range warns {
if err := c(config); err != nil {
logrus.WithError(err).Warnf("invalid configuration")
logrus.WithError(err).Warn("invalid configuration")
}
}
return nil
@@ -62,20 +62,17 @@ func (v *ConfigValidator) Validate(config *configs.Config) error {
// to the container's root filesystem.
func (v *ConfigValidator) rootfs(config *configs.Config) error {
if _, err := os.Stat(config.Rootfs); err != nil {
if os.IsNotExist(err) {
return fmt.Errorf("rootfs (%s) does not exist", config.Rootfs)
}
return err
return fmt.Errorf("invalid rootfs: %w", err)
}
cleaned, err := filepath.Abs(config.Rootfs)
if err != nil {
return err
return fmt.Errorf("invalid rootfs: %w", err)
}
if cleaned, err = filepath.EvalSymlinks(cleaned); err != nil {
return err
return fmt.Errorf("invalid rootfs: %w", err)
}
if filepath.Clean(config.Rootfs) != cleaned {
return fmt.Errorf("%s is not an absolute path or is a symlink", config.Rootfs)
return errors.New("invalid rootfs: not an absolute path, or a symlink")
}
return nil
}
@@ -131,6 +128,36 @@ func (v *ConfigValidator) cgroupnamespace(config *configs.Config) error {
return nil
}
// convertSysctlVariableToDotsSeparator can return sysctl variables in dots separator format.
// The '/' separator is also accepted in place of a '.'.
// Convert the sysctl variables to dots separator format for validation.
// More info:
// https://man7.org/linux/man-pages/man8/sysctl.8.html
// https://man7.org/linux/man-pages/man5/sysctl.d.5.html
// For example:
// Input sysctl variable "net/ipv4/conf/eno2.100.rp_filter"
// will return the converted value "net.ipv4.conf.eno2/100.rp_filter"
func convertSysctlVariableToDotsSeparator(val string) string {
if val == "" {
return val
}
firstSepIndex := strings.IndexAny(val, "./")
if firstSepIndex == -1 || val[firstSepIndex] == '.' {
return val
}
f := func(r rune) rune {
switch r {
case '.':
return '/'
case '/':
return '.'
}
return r
}
return strings.Map(f, val)
}
// sysctl validates that the specified sysctl keys are valid or not.
// /proc/sys isn't completely namespaced and depending on which namespaces
// are specified, a subset of sysctls are permitted.
@@ -153,6 +180,7 @@ func (v *ConfigValidator) sysctl(config *configs.Config) error {
)
for s := range config.Sysctl {
s := convertSysctlVariableToDotsSeparator(s)
if validSysctlMap[s] || strings.HasPrefix(s, "fs.mqueue.") {
if config.Namespaces.Contains(configs.NEWIPC) {
continue
@@ -176,7 +204,7 @@ func (v *ConfigValidator) sysctl(config *configs.Config) error {
hostnet, hostnetErr = isHostNetNS(path)
})
if hostnetErr != nil {
return hostnetErr
return fmt.Errorf("invalid netns path: %w", hostnetErr)
}
if hostnet {
return fmt.Errorf("sysctl %q not allowed in host network namespace", s)
@@ -205,19 +233,16 @@ func (v *ConfigValidator) intelrdt(config *configs.Config) error {
return errors.New("intelRdt is specified in config, but Intel RDT is not supported or enabled")
}
if config.IntelRdt.ClosID == "." || config.IntelRdt.ClosID == ".." || strings.Contains(config.IntelRdt.ClosID, "/") {
return fmt.Errorf("invalid intelRdt.ClosID %q", config.IntelRdt.ClosID)
}
if !intelrdt.IsCATEnabled() && config.IntelRdt.L3CacheSchema != "" {
return errors.New("intelRdt.l3CacheSchema is specified in config, but Intel RDT/CAT is not enabled")
}
if !intelrdt.IsMBAEnabled() && config.IntelRdt.MemBwSchema != "" {
return errors.New("intelRdt.memBwSchema is specified in config, but Intel RDT/MBA is not enabled")
}
if intelrdt.IsCATEnabled() && config.IntelRdt.L3CacheSchema == "" {
return errors.New("Intel RDT/CAT is enabled and intelRdt is specified in config, but intelRdt.l3CacheSchema is empty")
}
if intelrdt.IsMBAEnabled() && config.IntelRdt.MemBwSchema == "" {
return errors.New("Intel RDT/MBA is enabled and intelRdt is specified in config, but intelRdt.memBwSchema is empty")
}
}
return nil
@@ -268,10 +293,10 @@ func isHostNetNS(path string) (bool, error) {
var st1, st2 unix.Stat_t
if err := unix.Stat(currentProcessNetns, &st1); err != nil {
return false, fmt.Errorf("unable to stat %q: %s", currentProcessNetns, err)
return false, &os.PathError{Op: "stat", Path: currentProcessNetns, Err: err}
}
if err := unix.Stat(path, &st2); err != nil {
return false, fmt.Errorf("unable to stat %q: %s", path, err)
return false, &os.PathError{Op: "stat", Path: path, Err: err}
}
return (st1.Dev == st2.Dev) && (st1.Ino == st2.Ino), nil

View File

@@ -18,7 +18,7 @@ func mountConsole(slavePath string) error {
if f != nil {
f.Close()
}
return unix.Mount(slavePath, "/dev/console", "bind", unix.MS_BIND, "")
return mount(slavePath, "/dev/console", "", "bind", unix.MS_BIND, "")
}
// dupStdio opens the slavePath for the console and dups the fds to the current

View File

@@ -74,22 +74,12 @@ type BaseContainer interface {
ID() string
// Returns the current status of the container.
//
// errors:
// ContainerNotExists - Container no longer exists,
// Systemerror - System error.
Status() (Status, error)
// State returns the current container's state information.
//
// errors:
// SystemError - System error.
State() (*State, error)
// OCIState returns the current container's state information.
//
// errors:
// SystemError - System error.
OCIState() (*specs.State, error)
// Returns the current config of the container.
@@ -97,48 +87,26 @@ type BaseContainer interface {
// Returns the PIDs inside this container. The PIDs are in the namespace of the calling process.
//
// errors:
// ContainerNotExists - Container no longer exists,
// Systemerror - System error.
//
// Some of the returned PIDs may no longer refer to processes in the Container, unless
// the Container state is PAUSED in which case every PID in the slice is valid.
Processes() ([]int, error)
// Returns statistics for the container.
//
// errors:
// ContainerNotExists - Container no longer exists,
// Systemerror - System error.
Stats() (*Stats, error)
// Set resources of container as configured
//
// We can use this to change resources when containers are running.
//
// errors:
// SystemError - System error.
Set(config configs.Config) error
// Start a process inside the container. Returns error if process fails to
// start. You can track process lifecycle with passed Process structure.
//
// errors:
// ContainerNotExists - Container no longer exists,
// ConfigInvalid - config is invalid,
// ContainerPaused - Container is paused,
// SystemError - System error.
Start(process *Process) (err error)
// Run immediately starts the process inside the container. Returns error if process
// fails to start. It does not block waiting for the exec fifo after start returns but
// opens the fifo after start returns.
//
// errors:
// ContainerNotExists - Container no longer exists,
// ConfigInvalid - config is invalid,
// ContainerPaused - Container is paused,
// SystemError - System error.
Run(process *Process) (err error)
// Destroys the container, if its in a valid state, after killing any
@@ -149,25 +117,14 @@ type BaseContainer interface {
//
// Running containers must first be stopped using Signal(..).
// Paused containers must first be resumed using Resume(..).
//
// errors:
// ContainerNotStopped - Container is still running,
// ContainerPaused - Container is paused,
// SystemError - System error.
Destroy() error
// Signal sends the provided signal code to the container's initial process.
//
// If all is specified the signal is sent to all processes in the container
// including the initial process.
//
// errors:
// SystemError - System error.
Signal(s os.Signal, all bool) error
// Exec signals the container to exec the users process at the end of the init.
//
// errors:
// SystemError - System error.
Exec() error
}

View File

@@ -1,5 +1,3 @@
// +build linux
package libcontainer
import (
@@ -8,10 +6,10 @@ import (
"errors"
"fmt"
"io"
"io/ioutil"
"net"
"os"
"os/exec"
"path"
"path/filepath"
"reflect"
"strconv"
@@ -19,21 +17,20 @@ import (
"sync"
"time"
"github.com/checkpoint-restore/go-criu/v5"
criurpc "github.com/checkpoint-restore/go-criu/v5/rpc"
securejoin "github.com/cyphar/filepath-securejoin"
"github.com/opencontainers/runtime-spec/specs-go"
"github.com/sirupsen/logrus"
"github.com/vishvananda/netlink/nl"
"golang.org/x/sys/unix"
"google.golang.org/protobuf/proto"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/intelrdt"
"github.com/opencontainers/runc/libcontainer/system"
"github.com/opencontainers/runc/libcontainer/utils"
"github.com/opencontainers/runtime-spec/specs-go"
"github.com/checkpoint-restore/go-criu/v5"
criurpc "github.com/checkpoint-restore/go-criu/v5/rpc"
errorsf "github.com/pkg/errors"
"github.com/sirupsen/logrus"
"github.com/vishvananda/netlink/nl"
"golang.org/x/sys/unix"
"google.golang.org/protobuf/proto"
)
const stdioFdCount = 3
@@ -98,48 +95,26 @@ type Container interface {
// Methods below here are platform specific
// Checkpoint checkpoints the running container's state to disk using the criu(8) utility.
//
// errors:
// Systemerror - System error.
Checkpoint(criuOpts *CriuOpts) error
// Restore restores the checkpointed container to a running state using the criu(8) utility.
//
// errors:
// Systemerror - System error.
Restore(process *Process, criuOpts *CriuOpts) error
// If the Container state is RUNNING or CREATED, sets the Container state to PAUSING and pauses
// the execution of any user processes. Asynchronously, when the container finished being paused the
// state is changed to PAUSED.
// If the Container state is PAUSED, do nothing.
//
// errors:
// ContainerNotExists - Container no longer exists,
// ContainerNotRunning - Container not running or created,
// Systemerror - System error.
Pause() error
// If the Container state is PAUSED, resumes the execution of any user processes in the
// Container before setting the Container state to RUNNING.
// If the Container state is RUNNING, do nothing.
//
// errors:
// ContainerNotExists - Container no longer exists,
// ContainerNotPaused - Container is not paused,
// Systemerror - System error.
Resume() error
// NotifyOOM returns a read-only channel signaling when the container receives an OOM notification.
//
// errors:
// Systemerror - System error.
NotifyOOM() (<-chan struct{}, error)
// NotifyMemoryPressure returns a read-only channel signaling when the container reaches a given pressure level
//
// errors:
// Systemerror - System error.
NotifyMemoryPressure(level PressureLevel) (<-chan struct{}, error)
}
@@ -184,7 +159,7 @@ func (c *linuxContainer) Processes() ([]int, error) {
pids, err = c.cgroupManager.GetAllPids()
if err != nil {
return nil, newSystemErrorWithCause(err, "getting all container pids from cgroups")
return nil, fmt.Errorf("unable to get all container pids: %w", err)
}
return pids, nil
}
@@ -195,11 +170,11 @@ func (c *linuxContainer) Stats() (*Stats, error) {
stats = &Stats{}
)
if stats.CgroupStats, err = c.cgroupManager.GetStats(); err != nil {
return stats, newSystemErrorWithCause(err, "getting container stats from cgroups")
return stats, fmt.Errorf("unable to get container cgroup stats: %w", err)
}
if c.intelRdtManager != nil {
if stats.IntelRdtStats, err = c.intelRdtManager.GetStats(); err != nil {
return stats, newSystemErrorWithCause(err, "getting container's Intel RDT stats")
return stats, fmt.Errorf("unable to get container Intel RDT stats: %w", err)
}
}
for _, iface := range c.config.Networks {
@@ -207,7 +182,7 @@ func (c *linuxContainer) Stats() (*Stats, error) {
case "veth":
istats, err := getNetworkInterfaceStats(iface.HostInterfaceName)
if err != nil {
return stats, newSystemErrorWithCausef(err, "getting network stats for interface %q", iface.HostInterfaceName)
return stats, fmt.Errorf("unable to get network stats for interface %q: %w", iface.HostInterfaceName, err)
}
stats.Interfaces = append(stats.Interfaces, istats)
}
@@ -223,7 +198,7 @@ func (c *linuxContainer) Set(config configs.Config) error {
return err
}
if status == Stopped {
return newGenericError(errors.New("container not running"), ContainerNotRunning)
return ErrNotRunning
}
if err := c.cgroupManager.Set(config.Cgroups.Resources); err != nil {
// Set configs back
@@ -254,7 +229,7 @@ func (c *linuxContainer) Start(process *Process) error {
c.m.Lock()
defer c.m.Unlock()
if c.config.Cgroups.Resources.SkipDevices {
return newGenericError(errors.New("can't start container with SkipDevices set"), ConfigInvalid)
return errors.New("can't start container with SkipDevices set")
}
if process.Init {
if err := c.createExecFifo(); err != nil {
@@ -310,7 +285,7 @@ func (c *linuxContainer) exec() error {
}
func readFromExecFifo(execFifo io.Reader) error {
data, err := ioutil.ReadAll(execFifo)
data, err := io.ReadAll(execFifo)
if err != nil {
return err
}
@@ -336,7 +311,7 @@ func fifoOpen(path string, block bool) openResult {
}
f, err := os.OpenFile(path, flags, 0)
if err != nil {
return openResult{err: newSystemErrorWithCause(err, "open exec fifo for reading")}
return openResult{err: fmt.Errorf("exec fifo: %w", err)}
}
return openResult{file: f}
}
@@ -361,7 +336,7 @@ type openResult struct {
func (c *linuxContainer) start(process *Process) (retErr error) {
parent, err := c.newParentProcess(process)
if err != nil {
return newSystemErrorWithCause(err, "creating new parent process")
return fmt.Errorf("unable to create new parent process: %w", err)
}
logsDone := parent.forwardChildLogs()
@@ -371,13 +346,13 @@ func (c *linuxContainer) start(process *Process) (retErr error) {
// runc init closing the _LIBCONTAINER_LOGPIPE log fd.
err := <-logsDone
if err != nil && retErr == nil {
retErr = newSystemErrorWithCause(err, "forwarding init logs")
retErr = fmt.Errorf("unable to forward init logs: %w", err)
}
}()
}
if err := parent.start(); err != nil {
return newSystemErrorWithCause(err, "starting container process")
return fmt.Errorf("unable to start container process: %w", err)
}
if process.Init {
@@ -390,7 +365,7 @@ func (c *linuxContainer) start(process *Process) (retErr error) {
if err := c.config.Hooks[configs.Poststart].RunHooks(s); err != nil {
if err := ignoreTerminateErrors(parent.terminate()); err != nil {
logrus.Warn(errorsf.Wrapf(err, "Running Poststart hook"))
logrus.Warn(fmt.Errorf("error running poststart hook: %w", err))
}
return err
}
@@ -416,11 +391,19 @@ func (c *linuxContainer) Signal(s os.Signal, all bool) error {
// to avoid a PID reuse attack
if status == Running || status == Created || status == Paused {
if err := c.initProcess.signal(s); err != nil {
return newSystemErrorWithCause(err, "signaling init process")
return fmt.Errorf("unable to signal init: %w", err)
}
if status == Paused {
// For cgroup v1, killing a process in a frozen cgroup
// does nothing until it's thawed. Only thaw the cgroup
// for SIGKILL.
if s, ok := s.(unix.Signal); ok && s == unix.SIGKILL {
_ = c.cgroupManager.Freeze(configs.Thawed)
}
}
return nil
}
return newGenericError(errors.New("container not running"), ContainerNotRunning)
return ErrNotRunning
}
func (c *linuxContainer) createExecFifo() error {
@@ -472,13 +455,13 @@ func (c *linuxContainer) includeExecFifo(cmd *exec.Cmd) error {
func (c *linuxContainer) newParentProcess(p *Process) (parentProcess, error) {
parentInitPipe, childInitPipe, err := utils.NewSockPair("init")
if err != nil {
return nil, newSystemErrorWithCause(err, "creating new init pipe")
return nil, fmt.Errorf("unable to create init pipe: %w", err)
}
messageSockPair := filePair{parentInitPipe, childInitPipe}
parentLogPipe, childLogPipe, err := os.Pipe()
if err != nil {
return nil, fmt.Errorf("Unable to create the log pipe: %s", err)
return nil, fmt.Errorf("unable to create log pipe: %w", err)
}
logFilePair := filePair{parentLogPipe, childLogPipe}
@@ -493,7 +476,7 @@ func (c *linuxContainer) newParentProcess(p *Process) (parentProcess, error) {
// that problem), but we no longer do that. However, there's no need to do
// this for `runc exec` so we just keep it this way to be safe.
if err := c.includeExecFifo(cmd); err != nil {
return nil, newSystemErrorWithCause(err, "including execfifo in cmd.Exec setup")
return nil, fmt.Errorf("unable to setup exec fifo: %w", err)
}
return c.newInitProcess(p, cmd, messageSockPair, logFilePair)
}
@@ -537,6 +520,33 @@ func (c *linuxContainer) commandTemplate(p *Process, childInitPipe *os.File, chi
return cmd
}
// shouldSendMountSources says whether the child process must setup bind mounts with
// the source pre-opened (O_PATH) in the host user namespace.
// See https://github.com/opencontainers/runc/issues/2484
func (c *linuxContainer) shouldSendMountSources() bool {
// Passing the mount sources via SCM_RIGHTS is only necessary when
// both userns and mntns are active.
if !c.config.Namespaces.Contains(configs.NEWUSER) ||
!c.config.Namespaces.Contains(configs.NEWNS) {
return false
}
// nsexec.c send_mountsources() requires setns(mntns) capabilities
// CAP_SYS_CHROOT and CAP_SYS_ADMIN.
if c.config.RootlessEUID {
return false
}
// We need to send sources if there are bind-mounts.
for _, m := range c.config.Mounts {
if m.IsBind() {
return true
}
}
return false
}
func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, messageSockPair, logFilePair filePair) (*initProcess, error) {
cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initStandard))
nsMaps := make(map[configs.NamespaceType]string)
@@ -546,10 +556,40 @@ func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, messageSockPa
}
}
_, sharePidns := nsMaps[configs.NEWPID]
data, err := c.bootstrapData(c.config.Namespaces.CloneFlags(), nsMaps)
data, err := c.bootstrapData(c.config.Namespaces.CloneFlags(), nsMaps, initStandard)
if err != nil {
return nil, err
}
if c.shouldSendMountSources() {
// Elements on this slice will be paired with mounts (see StartInitialization() and
// prepareRootfs()). This slice MUST have the same size as c.config.Mounts.
mountFds := make([]int, len(c.config.Mounts))
for i, m := range c.config.Mounts {
if !m.IsBind() {
// Non bind-mounts do not use an fd.
mountFds[i] = -1
continue
}
// The fd passed here will not be used: nsexec.c will overwrite it with dup3(). We just need
// to allocate a fd so that we know the number to pass in the environment variable. The fd
// must not be closed before cmd.Start(), so we reuse messageSockPair.child because the
// lifecycle of that fd is already taken care of.
cmd.ExtraFiles = append(cmd.ExtraFiles, messageSockPair.child)
mountFds[i] = stdioFdCount + len(cmd.ExtraFiles) - 1
}
mountFdsJson, err := json.Marshal(mountFds)
if err != nil {
return nil, fmt.Errorf("Error creating _LIBCONTAINER_MOUNT_FDS: %w", err)
}
cmd.Env = append(cmd.Env,
"_LIBCONTAINER_MOUNT_FDS="+string(mountFdsJson),
)
}
init := &initProcess{
cmd: cmd,
messageSockPair: messageSockPair,
@@ -570,15 +610,15 @@ func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, messageSockP
cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initSetns))
state, err := c.currentState()
if err != nil {
return nil, newSystemErrorWithCause(err, "getting container's current state")
return nil, fmt.Errorf("unable to get container state: %w", err)
}
// for setns process, we don't have to set cloneflags as the process namespaces
// will only be set via setns syscall
data, err := c.bootstrapData(0, state.NamespacePaths)
data, err := c.bootstrapData(0, state.NamespacePaths, initSetns)
if err != nil {
return nil, err
}
return &setnsProcess{
proc := &setnsProcess{
cmd: cmd,
cgroupPaths: state.CgroupPaths,
rootlessCgroups: c.config.RootlessCgroups,
@@ -590,7 +630,29 @@ func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, messageSockP
process: p,
bootstrapData: data,
initProcessPid: state.InitProcessPid,
}, nil
}
if len(p.SubCgroupPaths) > 0 {
if add, ok := p.SubCgroupPaths[""]; ok {
// cgroup v1: using the same path for all controllers.
// cgroup v2: the only possible way.
for k := range proc.cgroupPaths {
proc.cgroupPaths[k] = path.Join(proc.cgroupPaths[k], add)
}
// cgroup v2: do not try to join init process's cgroup
// as a fallback (see (*setnsProcess).start).
proc.initProcessPid = 0
} else {
// Per-controller paths.
for ctrl, add := range p.SubCgroupPaths {
if val, ok := proc.cgroupPaths[ctrl]; ok {
proc.cgroupPaths[ctrl] = path.Join(val, add)
} else {
return nil, fmt.Errorf("unknown controller %s in SubCgroupPaths", ctrl)
}
}
}
}
return proc, nil
}
func (c *linuxContainer) newInitConfig(process *Process) *initConfig {
@@ -655,7 +717,7 @@ func (c *linuxContainer) Pause() error {
c: c,
})
}
return newGenericError(fmt.Errorf("container not running or created: %s", status), ContainerNotRunning)
return ErrNotRunning
}
func (c *linuxContainer) Resume() error {
@@ -666,7 +728,7 @@ func (c *linuxContainer) Resume() error {
return err
}
if status != Paused {
return newGenericError(fmt.Errorf("container not paused"), ContainerNotPaused)
return ErrNotPaused
}
if err := c.cgroupManager.Freeze(configs.Thawed); err != nil {
return err
@@ -771,7 +833,7 @@ func (c *linuxContainer) checkCriuVersion(minVersion int) error {
var err error
c.criuVersion, err = criu.GetCriuVersion()
if err != nil {
return fmt.Errorf("CRIU version check failed: %s", err)
return fmt.Errorf("CRIU version check failed: %w", err)
}
return compareCriuVersion(c.criuVersion, minVersion)
@@ -781,6 +843,9 @@ const descriptorsFilename = "descriptors.json"
func (c *linuxContainer) addCriuDumpMount(req *criurpc.CriuReq, m *configs.Mount) {
mountDest := strings.TrimPrefix(m.Destination, c.config.Rootfs)
if dest, err := securejoin.SecureJoin(c.config.Rootfs, mountDest); err == nil {
mountDest = dest[len(c.config.Rootfs):]
}
extMnt := &criurpc.ExtMountMap{
Key: proto.String(mountDest),
Val: proto.String(mountDest),
@@ -972,20 +1037,6 @@ func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error {
return err
}
if criuOpts.WorkDirectory == "" {
criuOpts.WorkDirectory = filepath.Join(c.root, "criu.work")
}
if err := os.Mkdir(criuOpts.WorkDirectory, 0o700); err != nil && !os.IsExist(err) {
return err
}
workDir, err := os.Open(criuOpts.WorkDirectory)
if err != nil {
return err
}
defer workDir.Close()
imageDir, err := os.Open(criuOpts.ImagesDirectory)
if err != nil {
return err
@@ -994,7 +1045,6 @@ func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error {
rpcOpts := criurpc.CriuOpts{
ImagesDirFd: proto.Int32(int32(imageDir.Fd())),
WorkDirFd: proto.Int32(int32(workDir.Fd())),
LogLevel: proto.Int32(4),
LogFile: proto.String("dump.log"),
Root: proto.String(c.config.Rootfs),
@@ -1012,6 +1062,19 @@ func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error {
LazyPages: proto.Bool(criuOpts.LazyPages),
}
// if criuOpts.WorkDirectory is not set, criu default is used.
if criuOpts.WorkDirectory != "" {
if err := os.Mkdir(criuOpts.WorkDirectory, 0o700); err != nil && !os.IsExist(err) {
return err
}
workDir, err := os.Open(criuOpts.WorkDirectory)
if err != nil {
return err
}
defer workDir.Close()
rpcOpts.WorkDirFd = proto.Int32(int32(workDir.Fd()))
}
c.handleCriuConfigurationFile(&rpcOpts)
// If the container is running in a network namespace and has
@@ -1054,7 +1117,7 @@ func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error {
// append optional manage cgroups mode
if criuOpts.ManageCgroupsMode != 0 {
mode := criurpc.CriuCgMode(criuOpts.ManageCgroupsMode)
mode := criuOpts.ManageCgroupsMode
rpcOpts.ManageCgroupsMode = &mode
}
@@ -1144,7 +1207,7 @@ func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error {
return err
}
err = ioutil.WriteFile(filepath.Join(criuOpts.ImagesDirectory, descriptorsFilename), fdsJSON, 0o600)
err = os.WriteFile(filepath.Join(criuOpts.ImagesDirectory, descriptorsFilename), fdsJSON, 0o600)
if err != nil {
return err
}
@@ -1159,6 +1222,9 @@ func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error {
func (c *linuxContainer) addCriuRestoreMount(req *criurpc.CriuReq, m *configs.Mount) {
mountDest := strings.TrimPrefix(m.Destination, c.config.Rootfs)
if dest, err := securejoin.SecureJoin(c.config.Rootfs, mountDest); err == nil {
mountDest = dest[len(c.config.Rootfs):]
}
extMnt := &criurpc.ExtMountMap{
Key: proto.String(mountDest),
Val: proto.String(m.Source),
@@ -1203,7 +1269,9 @@ func (c *linuxContainer) makeCriuRestoreMountpoints(m *configs.Mount) error {
case "bind":
// The prepareBindMount() function checks if source
// exists. So it cannot be used for other filesystem types.
if err := prepareBindMount(m, c.config.Rootfs); err != nil {
// TODO: pass something else than nil? Not sure if criu is
// impacted by issue #2484
if err := prepareBindMount(m, c.config.Rootfs, nil); err != nil {
return err
}
default:
@@ -1256,7 +1324,7 @@ func (c *linuxContainer) prepareCriuRestoreMounts(mounts []*configs.Mount) error
for _, u := range umounts {
_ = utils.WithProcfd(c.config.Rootfs, u, func(procfd string) error {
if e := unix.Unmount(procfd, unix.MNT_DETACH); e != nil {
if e != unix.EINVAL {
if e != unix.EINVAL { //nolint:errorlint // unix errors are bare
// Ignore EINVAL as it means 'target is not a mount point.'
// It probably has already been unmounted.
logrus.Warnf("Error during cleanup unmounting of %s (%s): %v", procfd, u, e)
@@ -1282,8 +1350,8 @@ func (c *linuxContainer) prepareCriuRestoreMounts(mounts []*configs.Mount) error
// set up in the order they are configured.
if m.Device == "bind" {
if err := utils.WithProcfd(c.config.Rootfs, m.Destination, func(procfd string) error {
if err := unix.Mount(m.Source, procfd, "", unix.MS_BIND|unix.MS_REC, ""); err != nil {
return errorsf.Wrapf(err, "unable to bind mount %q to %q (through %q)", m.Source, m.Destination, procfd)
if err := mount(m.Source, m.Destination, procfd, "", unix.MS_BIND|unix.MS_REC, ""); err != nil {
return err
}
return nil
}); err != nil {
@@ -1311,19 +1379,6 @@ func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
if err := c.checkCriuVersion(30000); err != nil {
return err
}
if criuOpts.WorkDirectory == "" {
criuOpts.WorkDirectory = filepath.Join(c.root, "criu.work")
}
// Since a container can be C/R'ed multiple times,
// the work directory may already exist.
if err := os.Mkdir(criuOpts.WorkDirectory, 0o700); err != nil && !os.IsExist(err) {
return err
}
workDir, err := os.Open(criuOpts.WorkDirectory)
if err != nil {
return err
}
defer workDir.Close()
if criuOpts.ImagesDirectory == "" {
return errors.New("invalid directory to restore checkpoint")
}
@@ -1346,7 +1401,7 @@ func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
if err != nil {
return err
}
err = unix.Mount(c.config.Rootfs, root, "", unix.MS_BIND|unix.MS_REC, "")
err = mount(c.config.Rootfs, root, "", "", unix.MS_BIND|unix.MS_REC, "")
if err != nil {
return err
}
@@ -1356,7 +1411,6 @@ func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
Type: &t,
Opts: &criurpc.CriuOpts{
ImagesDirFd: proto.Int32(int32(imageDir.Fd())),
WorkDirFd: proto.Int32(int32(workDir.Fd())),
EvasiveDevices: proto.Bool(true),
LogLevel: proto.Int32(4),
LogFile: proto.String("restore.log"),
@@ -1383,7 +1437,26 @@ func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
}
req.Opts.LsmProfile = proto.String(criuOpts.LsmProfile)
}
if criuOpts.LsmMountContext != "" {
if err := c.checkCriuVersion(31600); err != nil {
return errors.New("--lsm-mount-context requires at least CRIU 3.16")
}
req.Opts.LsmMountContext = proto.String(criuOpts.LsmMountContext)
}
if criuOpts.WorkDirectory != "" {
// Since a container can be C/R'ed multiple times,
// the work directory may already exist.
if err := os.Mkdir(criuOpts.WorkDirectory, 0o700); err != nil && !os.IsExist(err) {
return err
}
workDir, err := os.Open(criuOpts.WorkDirectory)
if err != nil {
return err
}
defer workDir.Close()
req.Opts.WorkDirFd = proto.Int32(int32(workDir.Fd()))
}
c.handleCriuConfigurationFile(req.Opts)
if err := c.handleRestoringNamespaces(req.Opts, &extraFiles); err != nil {
@@ -1432,7 +1505,7 @@ func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
// append optional manage cgroups mode
if criuOpts.ManageCgroupsMode != 0 {
mode := criurpc.CriuCgMode(criuOpts.ManageCgroupsMode)
mode := criuOpts.ManageCgroupsMode
req.Opts.ManageCgroupsMode = &mode
}
@@ -1440,7 +1513,7 @@ func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
fds []string
fdJSON []byte
)
if fdJSON, err = ioutil.ReadFile(filepath.Join(criuOpts.ImagesDirectory, descriptorsFilename)); err != nil {
if fdJSON, err = os.ReadFile(filepath.Join(criuOpts.ImagesDirectory, descriptorsFilename)); err != nil {
return err
}
@@ -1477,7 +1550,7 @@ func (c *linuxContainer) criuApplyCgroups(pid int, req *criurpc.CriuReq) error {
}
if err := c.cgroupManager.Set(c.config.Cgroups.Resources); err != nil {
return newSystemError(err)
return err
}
if cgroups.IsCgroup2UnifiedMode() {
@@ -1835,7 +1908,7 @@ func (c *linuxContainer) updateState(process parentProcess) (*State, error) {
}
func (c *linuxContainer) saveState(s *State) (retErr error) {
tmpFile, err := ioutil.TempFile(c.root, "state-")
tmpFile, err := os.CreateTemp(c.root, "state-")
if err != nil {
return err
}
@@ -1928,9 +2001,10 @@ func (c *linuxContainer) currentState() (*State, error) {
startTime, _ = c.initProcess.startTime()
externalDescriptors = c.initProcess.externalDescriptors()
}
intelRdtPath, err := intelrdt.GetIntelRdtPath(c.ID())
if err != nil {
intelRdtPath = ""
intelRdtPath := ""
if c.intelRdtManager != nil {
intelRdtPath = c.intelRdtManager.GetPath()
}
state := &State{
BaseState: BaseState{
@@ -1998,16 +2072,16 @@ func (c *linuxContainer) orderNamespacePaths(namespaces map[configs.NamespaceTyp
if p, ok := namespaces[ns]; ok && p != "" {
// check if the requested namespace is supported
if !configs.IsNamespaceSupported(ns) {
return nil, newSystemError(fmt.Errorf("namespace %s is not supported", ns))
return nil, fmt.Errorf("namespace %s is not supported", ns)
}
// only set to join this namespace if it exists
if _, err := os.Lstat(p); err != nil {
return nil, newSystemErrorWithCausef(err, "running lstat on namespace path %q", p)
return nil, fmt.Errorf("namespace path: %w", err)
}
// do not allow namespace path with comma as we use it to separate
// the namespace paths
if strings.ContainsRune(p, ',') {
return nil, newSystemError(fmt.Errorf("invalid path %s", p))
return nil, fmt.Errorf("invalid namespace path %s", p)
}
paths = append(paths, fmt.Sprintf("%s:%s", configs.NsName(ns), p))
}
@@ -2039,7 +2113,7 @@ type netlinkError struct{ error }
// such as one that uses nsenter package to bootstrap the container's
// init process correctly, i.e. with correct namespaces, uid/gid
// mapping etc.
func (c *linuxContainer) bootstrapData(cloneFlags uintptr, nsMaps map[configs.NamespaceType]string) (_ io.Reader, Err error) {
func (c *linuxContainer) bootstrapData(cloneFlags uintptr, nsMaps map[configs.NamespaceType]string, it initType) (_ io.Reader, Err error) {
// create the netlink message
r := nl.NewNetlinkRequest(int(InitMsg), 0)
@@ -2134,6 +2208,25 @@ func (c *linuxContainer) bootstrapData(cloneFlags uintptr, nsMaps map[configs.Na
Value: c.config.RootlessEUID,
})
// Bind mount source to open.
if it == initStandard && c.shouldSendMountSources() {
var mounts []byte
for _, m := range c.config.Mounts {
if m.IsBind() {
if strings.IndexByte(m.Source, 0) >= 0 {
return nil, fmt.Errorf("mount source string contains null byte: %q", m.Source)
}
mounts = append(mounts, []byte(m.Source)...)
}
mounts = append(mounts, byte(0))
}
r.AddData(&Bytemsg{
Type: MountSourcesAttr,
Value: mounts,
})
}
return bytes.NewReader(r.Serialize()), nil
}
@@ -2144,7 +2237,7 @@ func ignoreTerminateErrors(err error) error {
if err == nil {
return nil
}
// terminate() might return an error from ether Kill or Wait.
// terminate() might return an error from either Kill or Wait.
// The (*Cmd).Wait documentation says: "If the command fails to run
// or doesn't complete successfully, the error is of type *ExitError".
// Filter out such errors (like "exit status 1" or "signal: killed").
@@ -2152,13 +2245,11 @@ func ignoreTerminateErrors(err error) error {
if errors.As(err, &exitErr) {
return nil
}
// TODO: use errors.Is(err, os.ErrProcessDone) here and
// remove "process already finished" string comparison below
// once go 1.16 is minimally supported version.
if errors.Is(err, os.ErrProcessDone) {
return nil
}
s := err.Error()
if strings.Contains(s, "process already finished") ||
strings.Contains(s, "Wait was already called") {
if strings.Contains(s, "Wait was already called") {
return nil
}
return err

View File

@@ -30,4 +30,5 @@ type CriuOpts struct {
LazyPages bool // restore memory pages lazily using userfaultfd
StatusFd int // fd for feedback when lazy server is ready
LsmProfile string // LSM profile used to restore the container
LsmMountContext string // LSM mount context value to use during restore
}

View File

@@ -1,10 +1,10 @@
//go:build !windows
// +build !windows
package devices
import (
"errors"
"io/ioutil"
"os"
"path/filepath"
@@ -16,8 +16,8 @@ var ErrNotADevice = errors.New("not a device node")
// Testing dependencies
var (
unixLstat = unix.Lstat
ioutilReadDir = ioutil.ReadDir
unixLstat = unix.Lstat
osReadDir = os.ReadDir
)
func mkDev(d *Rule) (uint64, error) {
@@ -40,7 +40,7 @@ func DeviceFromPath(path, permissions string) (*Device, error) {
var (
devType Type
mode = stat.Mode
devNumber = uint64(stat.Rdev)
devNumber = uint64(stat.Rdev) //nolint:unconvert // Rdev is uint32 on e.g. MIPS.
major = unix.Major(devNumber)
minor = unix.Minor(devNumber)
)
@@ -76,7 +76,7 @@ func HostDevices() ([]*Device, error) {
// GetDevices recursively traverses a directory specified by path
// and returns all devices found there.
func GetDevices(path string) ([]*Device, error) {
files, err := ioutilReadDir(path)
files, err := osReadDir(path)
if err != nil {
return nil, err
}
@@ -103,7 +103,7 @@ func GetDevices(path string) ([]*Device, error) {
}
device, err := DeviceFromPath(filepath.Join(path, f.Name()), "rwm")
if err != nil {
if err == ErrNotADevice {
if errors.Is(err, ErrNotADevice) {
continue
}
if os.IsNotExist(err) {

View File

@@ -1,70 +1,13 @@
package libcontainer
import "io"
import "errors"
// ErrorCode is the API error code type.
type ErrorCode int
// API error codes.
const (
// Factory errors
IdInUse ErrorCode = iota
InvalidIdFormat
// Container errors
ContainerNotExists
ContainerPaused
ContainerNotStopped
ContainerNotRunning
ContainerNotPaused
// Process errors
NoProcessOps
// Common errors
ConfigInvalid
ConsoleExists
SystemError
var (
ErrExist = errors.New("container with given ID already exists")
ErrInvalidID = errors.New("invalid container ID format")
ErrNotExist = errors.New("container does not exist")
ErrPaused = errors.New("container paused")
ErrRunning = errors.New("container still running")
ErrNotRunning = errors.New("container not running")
ErrNotPaused = errors.New("container not paused")
)
func (c ErrorCode) String() string {
switch c {
case IdInUse:
return "Id already in use"
case InvalidIdFormat:
return "Invalid format"
case ContainerPaused:
return "Container paused"
case ConfigInvalid:
return "Invalid configuration"
case SystemError:
return "System error"
case ContainerNotExists:
return "Container does not exist"
case ContainerNotStopped:
return "Container is not stopped"
case ContainerNotRunning:
return "Container is not running"
case ConsoleExists:
return "Console exists for process"
case ContainerNotPaused:
return "Container is not paused"
case NoProcessOps:
return "No process operations"
default:
return "Unknown error"
}
}
// Error is the API error type.
type Error interface {
error
// Returns an error if it failed to write the detail of the Error to w.
// The detail of the Error may include the error message and a
// representation of the stack trace.
Detail(w io.Writer) error
// Returns the error code for this error.
Code() ErrorCode
}

View File

@@ -14,29 +14,15 @@ type Factory interface {
//
// Returns the new container with a running process.
//
// errors:
// IdInUse - id is already in use by a container
// InvalidIdFormat - id has incorrect format
// ConfigInvalid - config is invalid
// Systemerror - System error
//
// On error, any partially created container parts are cleaned up (the operation is atomic).
Create(id string, config *configs.Config) (Container, error)
// Load takes an ID for an existing container and returns the container information
// from the state. This presents a read only view of the container.
//
// errors:
// Path does not exist
// System error
Load(id string) (Container, error)
// StartInitialization is an internal API to libcontainer used during the reexec of the
// container.
//
// Errors:
// Pipe connection error
// System error
StartInitialization() error
// Type returns info string about factory type (e.g. lxc, libcontainer...)

View File

@@ -1,9 +1,8 @@
// +build linux
package libcontainer
import (
"encoding/json"
"errors"
"fmt"
"os"
"path/filepath"
@@ -13,17 +12,14 @@ import (
securejoin "github.com/cyphar/filepath-securejoin"
"github.com/moby/sys/mountinfo"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fs"
"github.com/opencontainers/runc/libcontainer/cgroups/fs2"
"github.com/opencontainers/runc/libcontainer/cgroups/systemd"
"golang.org/x/sys/unix"
"github.com/opencontainers/runc/libcontainer/cgroups/manager"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/configs/validate"
"github.com/opencontainers/runc/libcontainer/intelrdt"
"github.com/opencontainers/runc/libcontainer/utils"
"github.com/pkg/errors"
"golang.org/x/sys/unix"
"github.com/sirupsen/logrus"
)
const (
@@ -41,7 +37,9 @@ func InitArgs(args ...string) func(*LinuxFactory) error {
// Resolve relative paths to ensure that its available
// after directory changes.
if args[0], err = filepath.Abs(args[0]); err != nil {
return newGenericError(err, ConfigInvalid)
// The only error returned from filepath.Abs is
// the one from os.Getwd, i.e. a system error.
return err
}
}
@@ -50,100 +48,6 @@ func InitArgs(args ...string) func(*LinuxFactory) error {
}
}
func getUnifiedPath(paths map[string]string) string {
path := ""
for k, v := range paths {
if path == "" {
path = v
} else if v != path {
panic(errors.Errorf("expected %q path to be unified path %q, got %q", k, path, v))
}
}
// can be empty
if path != "" {
if filepath.Clean(path) != path || !filepath.IsAbs(path) {
panic(errors.Errorf("invalid dir path %q", path))
}
}
return path
}
func systemdCgroupV2(l *LinuxFactory, rootless bool) error {
l.NewCgroupsManager = func(config *configs.Cgroup, paths map[string]string) cgroups.Manager {
return systemd.NewUnifiedManager(config, getUnifiedPath(paths), rootless)
}
return nil
}
// SystemdCgroups is an options func to configure a LinuxFactory to return
// containers that use systemd to create and manage cgroups.
func SystemdCgroups(l *LinuxFactory) error {
if !systemd.IsRunningSystemd() {
return fmt.Errorf("systemd not running on this host, can't use systemd as cgroups manager")
}
if cgroups.IsCgroup2UnifiedMode() {
return systemdCgroupV2(l, false)
}
l.NewCgroupsManager = func(config *configs.Cgroup, paths map[string]string) cgroups.Manager {
return systemd.NewLegacyManager(config, paths)
}
return nil
}
// RootlessSystemdCgroups is rootless version of SystemdCgroups.
func RootlessSystemdCgroups(l *LinuxFactory) error {
if !systemd.IsRunningSystemd() {
return fmt.Errorf("systemd not running on this host, can't use systemd as cgroups manager")
}
if !cgroups.IsCgroup2UnifiedMode() {
return fmt.Errorf("cgroup v2 not enabled on this host, can't use systemd (rootless) as cgroups manager")
}
return systemdCgroupV2(l, true)
}
func cgroupfs2(l *LinuxFactory, rootless bool) error {
l.NewCgroupsManager = func(config *configs.Cgroup, paths map[string]string) cgroups.Manager {
m, err := fs2.NewManager(config, getUnifiedPath(paths), rootless)
if err != nil {
panic(err)
}
return m
}
return nil
}
func cgroupfs(l *LinuxFactory, rootless bool) error {
if cgroups.IsCgroup2UnifiedMode() {
return cgroupfs2(l, rootless)
}
l.NewCgroupsManager = func(config *configs.Cgroup, paths map[string]string) cgroups.Manager {
return fs.NewManager(config, paths, rootless)
}
return nil
}
// Cgroupfs is an options func to configure a LinuxFactory to return containers
// that use the native cgroups filesystem implementation to create and manage
// cgroups.
func Cgroupfs(l *LinuxFactory) error {
return cgroupfs(l, false)
}
// RootlessCgroupfs is an options func to configure a LinuxFactory to return
// containers that use the native cgroups filesystem implementation to create
// and manage cgroups. The difference between RootlessCgroupfs and Cgroupfs is
// that RootlessCgroupfs can transparently handle permission errors that occur
// during rootless container (including euid=0 in userns) setup (while still allowing cgroup usage if
// they've been set up properly).
func RootlessCgroupfs(l *LinuxFactory) error {
return cgroupfs(l, true)
}
// IntelRdtfs is an options func to configure a LinuxFactory to return
// containers that use the Intel RDT "resource control" filesystem to
// create and manage Intel RDT resources (e.g., L3 cache, memory bandwidth).
@@ -165,7 +69,7 @@ func TmpfsRoot(l *LinuxFactory) error {
return err
}
if !mounted {
if err := unix.Mount("tmpfs", l.Root, "tmpfs", 0, ""); err != nil {
if err := mount("tmpfs", l.Root, "", "tmpfs", 0, ""); err != nil {
return err
}
}
@@ -186,7 +90,7 @@ func CriuPath(criupath string) func(*LinuxFactory) error {
func New(root string, options ...func(*LinuxFactory) error) (Factory, error) {
if root != "" {
if err := os.MkdirAll(root, 0o700); err != nil {
return nil, newGenericError(err, SystemError)
return nil, err
}
}
l := &LinuxFactory{
@@ -197,10 +101,6 @@ func New(root string, options ...func(*LinuxFactory) error) (Factory, error) {
CriuPath: "criu",
}
if err := Cgroupfs(l); err != nil {
return nil, err
}
for _, opt := range options {
if opt == nil {
continue
@@ -237,37 +137,69 @@ type LinuxFactory struct {
// Validator provides validation to container configurations.
Validator validate.Validator
// NewCgroupsManager returns an initialized cgroups manager for a single container.
NewCgroupsManager func(config *configs.Cgroup, paths map[string]string) cgroups.Manager
// NewIntelRdtManager returns an initialized Intel RDT manager for a single container.
NewIntelRdtManager func(config *configs.Config, id string, path string) intelrdt.Manager
}
func (l *LinuxFactory) Create(id string, config *configs.Config) (Container, error) {
if l.Root == "" {
return nil, newGenericError(fmt.Errorf("invalid root"), ConfigInvalid)
return nil, errors.New("root not set")
}
if err := l.validateID(id); err != nil {
return nil, err
}
if err := l.Validator.Validate(config); err != nil {
return nil, newGenericError(err, ConfigInvalid)
return nil, err
}
containerRoot, err := securejoin.SecureJoin(l.Root, id)
if err != nil {
return nil, err
}
if _, err := os.Stat(containerRoot); err == nil {
return nil, newGenericError(fmt.Errorf("container with id exists: %v", id), IdInUse)
return nil, ErrExist
} else if !os.IsNotExist(err) {
return nil, newGenericError(err, SystemError)
return nil, err
}
cm, err := manager.New(config.Cgroups)
if err != nil {
return nil, err
}
// Check that cgroup does not exist or empty (no processes).
// Note for cgroup v1 this check is not thorough, as there are multiple
// separate hierarchies, while both Exists() and GetAllPids() only use
// one for "devices" controller (assuming others are the same, which is
// probably true in almost all scenarios). Checking all the hierarchies
// would be too expensive.
if cm.Exists() {
pids, err := cm.GetAllPids()
// Reading PIDs can race with cgroups removal, so ignore ENOENT and ENODEV.
if err != nil && !errors.Is(err, os.ErrNotExist) && !errors.Is(err, unix.ENODEV) {
return nil, fmt.Errorf("unable to get cgroup PIDs: %w", err)
}
if len(pids) != 0 {
// TODO: return an error.
logrus.Warnf("container's cgroup is not empty: %d process(es) found", len(pids))
logrus.Warn("DEPRECATED: running container in a non-empty cgroup won't be supported in runc 1.2; https://github.com/opencontainers/runc/issues/3132")
}
}
// Check that cgroup is not frozen. Do not use Exists() here
// since in cgroup v1 it only checks "devices" controller.
st, err := cm.GetFreezerState()
if err != nil {
return nil, fmt.Errorf("unable to get cgroup freezer state: %w", err)
}
if st == configs.Frozen {
return nil, errors.New("container's cgroup unexpectedly frozen")
}
if err := os.MkdirAll(containerRoot, 0o711); err != nil {
return nil, newGenericError(err, SystemError)
return nil, err
}
if err := os.Chown(containerRoot, unix.Geteuid(), unix.Getegid()); err != nil {
return nil, newGenericError(err, SystemError)
return nil, err
}
c := &linuxContainer{
id: id,
@@ -278,7 +210,7 @@ func (l *LinuxFactory) Create(id string, config *configs.Config) (Container, err
criuPath: l.CriuPath,
newuidmapPath: l.NewuidmapPath,
newgidmapPath: l.NewgidmapPath,
cgroupManager: l.NewCgroupsManager(config.Cgroups, nil),
cgroupManager: cm,
}
if l.NewIntelRdtManager != nil {
c.intelRdtManager = l.NewIntelRdtManager(config, id, "")
@@ -289,7 +221,7 @@ func (l *LinuxFactory) Create(id string, config *configs.Config) (Container, err
func (l *LinuxFactory) Load(id string) (Container, error) {
if l.Root == "" {
return nil, newGenericError(fmt.Errorf("invalid root"), ConfigInvalid)
return nil, errors.New("root not set")
}
// when load, we need to check id is valid or not.
if err := l.validateID(id); err != nil {
@@ -299,7 +231,7 @@ func (l *LinuxFactory) Load(id string) (Container, error) {
if err != nil {
return nil, err
}
state, err := l.loadState(containerRoot, id)
state, err := l.loadState(containerRoot)
if err != nil {
return nil, err
}
@@ -308,6 +240,10 @@ func (l *LinuxFactory) Load(id string) (Container, error) {
processStartTime: state.InitProcessStartTime,
fds: state.ExternalDescriptors,
}
cm, err := manager.NewWithPaths(state.Config.Cgroups, state.CgroupPaths)
if err != nil {
return nil, err
}
c := &linuxContainer{
initProcess: r,
initProcessStartTime: state.InitProcessStartTime,
@@ -318,7 +254,7 @@ func (l *LinuxFactory) Load(id string) (Container, error) {
criuPath: l.CriuPath,
newuidmapPath: l.NewuidmapPath,
newgidmapPath: l.NewgidmapPath,
cgroupManager: l.NewCgroupsManager(state.Config.Cgroups, state.CgroupPaths),
cgroupManager: cm,
root: containerRoot,
created: state.Created,
}
@@ -343,11 +279,26 @@ func (l *LinuxFactory) StartInitialization() (err error) {
envInitPipe := os.Getenv("_LIBCONTAINER_INITPIPE")
pipefd, err := strconv.Atoi(envInitPipe)
if err != nil {
return fmt.Errorf("unable to convert _LIBCONTAINER_INITPIPE=%s to int: %s", envInitPipe, err)
err = fmt.Errorf("unable to convert _LIBCONTAINER_INITPIPE: %w", err)
logrus.Error(err)
return err
}
pipe := os.NewFile(uintptr(pipefd), "pipe")
defer pipe.Close()
defer func() {
// We have an error during the initialization of the container's init,
// send it back to the parent process in the form of an initError.
if werr := writeSync(pipe, procError); werr != nil {
fmt.Fprintln(os.Stderr, err)
return
}
if werr := utils.WriteJSON(pipe, &initError{Message: err.Error()}); werr != nil {
fmt.Fprintln(os.Stderr, err)
return
}
}()
// Only init processes have FIFOFD.
fifofd := -1
envInitType := os.Getenv("_LIBCONTAINER_INITTYPE")
@@ -355,7 +306,7 @@ func (l *LinuxFactory) StartInitialization() (err error) {
if it == initStandard {
envFifoFd := os.Getenv("_LIBCONTAINER_FIFOFD")
if fifofd, err = strconv.Atoi(envFifoFd); err != nil {
return fmt.Errorf("unable to convert _LIBCONTAINER_FIFOFD=%s to int: %s", envFifoFd, err)
return fmt.Errorf("unable to convert _LIBCONTAINER_FIFOFD: %w", err)
}
}
@@ -363,7 +314,7 @@ func (l *LinuxFactory) StartInitialization() (err error) {
if envConsole := os.Getenv("_LIBCONTAINER_CONSOLE"); envConsole != "" {
console, err := strconv.Atoi(envConsole)
if err != nil {
return fmt.Errorf("unable to convert _LIBCONTAINER_CONSOLE=%s to int: %s", envConsole, err)
return fmt.Errorf("unable to convert _LIBCONTAINER_CONSOLE: %w", err)
}
consoleSocket = os.NewFile(uintptr(console), "console-socket")
defer consoleSocket.Close()
@@ -372,32 +323,26 @@ func (l *LinuxFactory) StartInitialization() (err error) {
logPipeFdStr := os.Getenv("_LIBCONTAINER_LOGPIPE")
logPipeFd, err := strconv.Atoi(logPipeFdStr)
if err != nil {
return fmt.Errorf("unable to convert _LIBCONTAINER_LOGPIPE=%s to int: %s", logPipeFdStr, err)
return fmt.Errorf("unable to convert _LIBCONTAINER_LOGPIPE: %w", err)
}
// Get mount files (O_PATH).
mountFds, err := parseMountFds()
if err != nil {
return err
}
// clear the current process's environment to clean any libcontainer
// specific env vars.
os.Clearenv()
defer func() {
// We have an error during the initialization of the container's init,
// send it back to the parent process in the form of an initError.
if werr := utils.WriteJSON(pipe, syncT{procError}); werr != nil {
fmt.Fprintln(os.Stderr, err)
return
}
if werr := utils.WriteJSON(pipe, newSystemError(err)); werr != nil {
fmt.Fprintln(os.Stderr, err)
return
}
}()
defer func() {
if e := recover(); e != nil {
err = fmt.Errorf("panic from initialization: %v, %v", e, string(debug.Stack()))
err = fmt.Errorf("panic from initialization: %w, %v", e, string(debug.Stack()))
}
}()
i, err := newContainerInit(it, pipe, consoleSocket, fifofd, logPipeFd)
i, err := newContainerInit(it, pipe, consoleSocket, fifofd, logPipeFd, mountFds)
if err != nil {
return err
}
@@ -406,7 +351,7 @@ func (l *LinuxFactory) StartInitialization() (err error) {
return i.Init()
}
func (l *LinuxFactory) loadState(root, id string) (*State, error) {
func (l *LinuxFactory) loadState(root string) (*State, error) {
stateFilePath, err := securejoin.SecureJoin(root, stateFilename)
if err != nil {
return nil, err
@@ -414,21 +359,21 @@ func (l *LinuxFactory) loadState(root, id string) (*State, error) {
f, err := os.Open(stateFilePath)
if err != nil {
if os.IsNotExist(err) {
return nil, newGenericError(fmt.Errorf("container %q does not exist", id), ContainerNotExists)
return nil, ErrNotExist
}
return nil, newGenericError(err, SystemError)
return nil, err
}
defer f.Close()
var state *State
if err := json.NewDecoder(f).Decode(&state); err != nil {
return nil, newGenericError(err, SystemError)
return nil, err
}
return state, nil
}
func (l *LinuxFactory) validateID(id string) error {
if !idRegex.MatchString(id) || string(os.PathSeparator)+id != utils.CleanPath(string(os.PathSeparator)+id) {
return newGenericError(fmt.Errorf("invalid id format: %v", id), InvalidIdFormat)
return ErrInvalidID
}
return nil
@@ -451,3 +396,18 @@ func NewgidmapPath(newgidmapPath string) func(*LinuxFactory) error {
return nil
}
}
func parseMountFds() ([]int, error) {
fdsJson := os.Getenv("_LIBCONTAINER_MOUNT_FDS")
if fdsJson == "" {
// Always return the nil slice if no fd is present.
return nil, nil
}
var mountFds []int
if err := json.Unmarshal([]byte(fdsJson), &mountFds); err != nil {
return nil, fmt.Errorf("Error unmarshalling _LIBCONTAINER_MOUNT_FDS: %w", err)
}
return mountFds, nil
}

View File

@@ -1,92 +0,0 @@
package libcontainer
import (
"fmt"
"io"
"text/template"
"time"
"github.com/opencontainers/runc/libcontainer/stacktrace"
)
var errorTemplate = template.Must(template.New("error").Parse(`Timestamp: {{.Timestamp}}
Code: {{.ECode}}
{{if .Message }}
Message: {{.Message}}
{{end}}
Frames:{{range $i, $frame := .Stack.Frames}}
---
{{$i}}: {{$frame.Function}}
Package: {{$frame.Package}}
File: {{$frame.File}}@{{$frame.Line}}{{end}}
`))
func newGenericError(err error, c ErrorCode) Error {
if le, ok := err.(Error); ok {
return le
}
gerr := &genericError{
Timestamp: time.Now(),
Err: err,
ECode: c,
Stack: stacktrace.Capture(1),
}
if err != nil {
gerr.Message = err.Error()
}
return gerr
}
func newSystemError(err error) Error {
return createSystemError(err, "")
}
func newSystemErrorWithCausef(err error, cause string, v ...interface{}) Error {
return createSystemError(err, fmt.Sprintf(cause, v...))
}
func newSystemErrorWithCause(err error, cause string) Error {
return createSystemError(err, cause)
}
// createSystemError creates the specified error with the correct number of
// stack frames skipped. This is only to be called by the other functions for
// formatting the error.
func createSystemError(err error, cause string) Error {
gerr := &genericError{
Timestamp: time.Now(),
Err: err,
ECode: SystemError,
Cause: cause,
Stack: stacktrace.Capture(2),
}
if err != nil {
gerr.Message = err.Error()
}
return gerr
}
type genericError struct {
Timestamp time.Time
ECode ErrorCode
Err error `json:"-"`
Cause string
Message string
Stack stacktrace.Stacktrace
}
func (e *genericError) Error() string {
if e.Cause == "" {
return e.Message
}
frame := e.Stack.Frames[0]
return fmt.Sprintf("%s:%d: %s caused: %s", frame.File, frame.Line, e.Cause, e.Message)
}
func (e *genericError) Code() ErrorCode {
return e.ECode
}
func (e *genericError) Detail(w io.Writer) error {
return errorTemplate.Execute(w, e)
}

View File

@@ -1,30 +1,29 @@
// +build linux
package libcontainer
import (
"bytes"
"encoding/json"
"errors"
"fmt"
"io"
"io/ioutil"
"net"
"os"
"strconv"
"strings"
"unsafe"
"github.com/containerd/console"
"github.com/opencontainers/runtime-spec/specs-go"
"github.com/sirupsen/logrus"
"github.com/vishvananda/netlink"
"golang.org/x/sys/unix"
"github.com/opencontainers/runc/libcontainer/capabilities"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/system"
"github.com/opencontainers/runc/libcontainer/user"
"github.com/opencontainers/runc/libcontainer/utils"
"github.com/opencontainers/runtime-spec/specs-go"
"github.com/pkg/errors"
"github.com/sirupsen/logrus"
"github.com/vishvananda/netlink"
"golang.org/x/sys/unix"
)
type initType string
@@ -77,7 +76,7 @@ type initer interface {
Init() error
}
func newContainerInit(t initType, pipe *os.File, consoleSocket *os.File, fifoFd, logFd int) (initer, error) {
func newContainerInit(t initType, pipe *os.File, consoleSocket *os.File, fifoFd, logFd int, mountFds []int) (initer, error) {
var config *initConfig
if err := json.NewDecoder(pipe).Decode(&config); err != nil {
return nil, err
@@ -87,6 +86,11 @@ func newContainerInit(t initType, pipe *os.File, consoleSocket *os.File, fifoFd,
}
switch t {
case initSetns:
// mountFds must be nil in this case. We don't mount while doing runc exec.
if mountFds != nil {
return nil, errors.New("mountFds must be nil. Can't mount while doing runc exec.")
}
return &linuxSetnsInit{
pipe: pipe,
consoleSocket: consoleSocket,
@@ -101,6 +105,7 @@ func newContainerInit(t initType, pipe *os.File, consoleSocket *os.File, fifoFd,
config: config,
fifoFd: fifoFd,
logFd: logFd,
mountFds: mountFds,
}, nil
}
return nil, fmt.Errorf("unknown init type %q", t)
@@ -139,7 +144,7 @@ func finalizeNamespace(config *initConfig) error {
// inherited are marked close-on-exec so they stay out of the
// container
if err := utils.CloseExecFrom(config.PassedFilesCount + 3); err != nil {
return errors.Wrap(err, "close exec fds")
return fmt.Errorf("error closing exec fds: %w", err)
}
// we only do chdir if it's specified
@@ -158,7 +163,7 @@ func finalizeNamespace(config *initConfig) error {
// to the directory, but the user running runc does not.
// This is useful in cases where the cwd is also a volume that's been chowned to the container user.
default:
return fmt.Errorf("chdir to cwd (%q) set in config.json failed: %v", config.Cwd, err)
return fmt.Errorf("chdir to cwd (%q) set in config.json failed: %w", config.Cwd, err)
}
}
@@ -174,26 +179,26 @@ func finalizeNamespace(config *initConfig) error {
}
// drop capabilities in bounding set before changing user
if err := w.ApplyBoundingSet(); err != nil {
return errors.Wrap(err, "apply bounding set")
return fmt.Errorf("unable to apply bounding set: %w", err)
}
// preserve existing capabilities while we change users
if err := system.SetKeepCaps(); err != nil {
return errors.Wrap(err, "set keep caps")
return fmt.Errorf("unable to set keep caps: %w", err)
}
if err := setupUser(config); err != nil {
return errors.Wrap(err, "setup user")
return fmt.Errorf("unable to setup user: %w", err)
}
// Change working directory AFTER the user has been set up, if we haven't done it yet.
if doChdir {
if err := unix.Chdir(config.Cwd); err != nil {
return fmt.Errorf("chdir to cwd (%q) set in config.json failed: %v", config.Cwd, err)
return fmt.Errorf("chdir to cwd (%q) set in config.json failed: %w", config.Cwd, err)
}
}
if err := system.ClearKeepCaps(); err != nil {
return errors.Wrap(err, "clear keep caps")
return fmt.Errorf("unable to clear keep caps: %w", err)
}
if err := w.ApplyCaps(); err != nil {
return errors.Wrap(err, "apply caps")
return fmt.Errorf("unable to apply caps: %w", err)
}
return nil
}
@@ -272,6 +277,36 @@ func syncParentHooks(pipe io.ReadWriter) error {
return readSync(pipe, procResume)
}
// syncParentSeccomp sends to the given pipe a JSON payload which
// indicates that the parent should pick up the seccomp fd with pidfd_getfd()
// and send it to the seccomp agent over a unix socket. It then waits for
// the parent to indicate that it is cleared to resume and closes the seccompFd.
// If the seccompFd is -1, there isn't anything to sync with the parent, so it
// returns no error.
func syncParentSeccomp(pipe io.ReadWriter, seccompFd int) error {
if seccompFd == -1 {
return nil
}
// Tell parent.
if err := writeSyncWithFd(pipe, procSeccomp, seccompFd); err != nil {
unix.Close(seccompFd)
return err
}
// Wait for parent to give the all-clear.
if err := readSync(pipe, procSeccompDone); err != nil {
unix.Close(seccompFd)
return fmt.Errorf("sync parent seccomp: %w", err)
}
if err := unix.Close(seccompFd); err != nil {
return fmt.Errorf("close seccomp fd: %w", err)
}
return nil
}
// setupUser changes the groups, gid, and uid for the user inside the container
func setupUser(config *initConfig) error {
// Set up defaults.
@@ -325,11 +360,11 @@ func setupUser(config *initConfig) error {
// Before we change to the container's user make sure that the processes
// STDIO is correctly owned by the user that we are switching to.
if err := fixStdioPermissions(config, execUser); err != nil {
if err := fixStdioPermissions(execUser); err != nil {
return err
}
setgroups, err := ioutil.ReadFile("/proc/self/setgroups")
setgroups, err := os.ReadFile("/proc/self/setgroups")
if err != nil && !os.IsNotExist(err) {
return err
}
@@ -343,7 +378,7 @@ func setupUser(config *initConfig) error {
if allowSupGroups {
suppGroups := append(execUser.Sgids, addGroups...)
if err := unix.Setgroups(suppGroups); err != nil {
return err
return &os.SyscallError{Syscall: "setgroups", Err: err}
}
}
@@ -366,10 +401,10 @@ func setupUser(config *initConfig) error {
// fixStdioPermissions fixes the permissions of PID 1's STDIO within the container to the specified user.
// The ownership needs to match because it is created outside of the container and needs to be
// localized.
func fixStdioPermissions(config *initConfig, u *user.ExecUser) error {
func fixStdioPermissions(u *user.ExecUser) error {
var null unix.Stat_t
if err := unix.Stat("/dev/null", &null); err != nil {
return err
return &os.PathError{Op: "stat", Path: "/dev/null", Err: err}
}
for _, fd := range []uintptr{
os.Stdin.Fd(),
@@ -378,7 +413,7 @@ func fixStdioPermissions(config *initConfig, u *user.ExecUser) error {
} {
var s unix.Stat_t
if err := unix.Fstat(int(fd), &s); err != nil {
return err
return &os.PathError{Op: "fstat", Path: "fd " + strconv.Itoa(int(fd)), Err: err}
}
// Skip chown of /dev/null if it was used as one of the STDIO fds.
@@ -399,10 +434,12 @@ func fixStdioPermissions(config *initConfig, u *user.ExecUser) error {
// privileged_wrt_inode_uidgid() has failed). In either case, we
// are in a configuration where it's better for us to just not
// touch the stdio rather than bail at this point.
// nolint:errorlint // unix errors are bare
if err == unix.EINVAL || err == unix.EPERM {
continue
}
return err
return &os.PathError{Op: "fchown", Path: "fd " + strconv.Itoa(int(fd)), Err: err}
}
}
return nil
@@ -456,8 +493,8 @@ func setupRoute(config *configs.Config) error {
func setupRlimits(limits []configs.Rlimit, pid int) error {
for _, rlimit := range limits {
if err := system.Prlimit(pid, rlimit.Type, unix.Rlimit{Max: rlimit.Hard, Cur: rlimit.Soft}); err != nil {
return fmt.Errorf("error setting rlimit type %v: %v", rlimit.Type, err)
if err := unix.Prlimit(pid, rlimit.Type, &unix.Rlimit{Max: rlimit.Hard, Cur: rlimit.Soft}, nil); err != nil {
return fmt.Errorf("error setting rlimit type %v: %w", rlimit.Type, err)
}
}
return nil
@@ -482,27 +519,12 @@ func isWaitable(pid int) (bool, error) {
si := &siginfo{}
_, _, e := unix.Syscall6(unix.SYS_WAITID, _P_PID, uintptr(pid), uintptr(unsafe.Pointer(si)), unix.WEXITED|unix.WNOWAIT|unix.WNOHANG, 0, 0)
if e != 0 {
return false, os.NewSyscallError("waitid", e)
return false, &os.SyscallError{Syscall: "waitid", Err: e}
}
return si.si_pid != 0, nil
}
// isNoChildren returns true if err represents a unix.ECHILD (formerly syscall.ECHILD) false otherwise
func isNoChildren(err error) bool {
switch err := err.(type) {
case unix.Errno:
if err == unix.ECHILD {
return true
}
case *os.SyscallError:
if err.Err == unix.ECHILD {
return true
}
}
return false
}
// signalAllProcesses freezes then iterates over all the processes inside the
// manager's cgroups sending the signal s to them.
// If s is SIGKILL then it will wait for each process to exit.
@@ -548,7 +570,7 @@ func signalAllProcesses(m cgroups.Manager, s os.Signal) error {
for _, p := range procs {
if s != unix.SIGKILL {
if ok, err := isWaitable(p.Pid); err != nil {
if !isNoChildren(err) {
if !errors.Is(err, unix.ECHILD) {
logrus.Warn("signalAllProcesses: ", p.Pid, err)
}
continue
@@ -565,7 +587,7 @@ func signalAllProcesses(m cgroups.Manager, s os.Signal) error {
// to retrieve its exit code.
if subreaper == 0 {
if _, err := p.Wait(); err != nil {
if !isNoChildren(err) {
if !errors.Is(err, unix.ECHILD) {
logrus.Warn("wait: ", err)
}
}

View File

@@ -1,13 +1,11 @@
// +build linux
package intelrdt
import (
"bufio"
"bytes"
"errors"
"fmt"
"io"
"io/ioutil"
"os"
"path/filepath"
"strconv"
@@ -15,6 +13,7 @@ import (
"sync"
"github.com/moby/sys/mountinfo"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"github.com/opencontainers/runc/libcontainer/configs"
)
@@ -70,7 +69,7 @@ import (
* |-- ...
* |-- schemata
* |-- tasks
* |-- <container_id>
* |-- <clos>
* |-- ...
* |-- schemata
* |-- tasks
@@ -153,7 +152,7 @@ type Manager interface {
// Returns statistics for Intel RDT
GetStats() (*Stats, error)
// Destroys the Intel RDT 'container_id' group
// Destroys the Intel RDT container-specific 'container_id' group
Destroy() error
// Returns Intel RDT path to save in a state file and to be able to
@@ -181,14 +180,10 @@ func NewManager(config *configs.Config, id string, path string) Manager {
}
const (
IntelRdtTasks = "tasks"
intelRdtTasks = "tasks"
)
var (
// The absolute root path of the Intel RDT "resource control" filesystem
intelRdtRoot string
intelRdtRootLock sync.Mutex
// The flag to indicate if Intel RDT/CAT is enabled
catEnabled bool
// The flag to indicate if Intel RDT/MBA is enabled
@@ -198,13 +193,9 @@ var (
// For Intel RDT initialization
initOnce sync.Once
)
type intelRdtData struct {
root string
config *configs.Config
pid int
}
errNotFound = errors.New("Intel RDT resctrl mount point not found")
)
// Check if Intel RDT sub-features are enabled in featuresInit()
func featuresInit() {
@@ -215,9 +206,10 @@ func featuresInit() {
return
}
// 2. Check if Intel RDT "resource control" filesystem is mounted
// The user guarantees to mount the filesystem
if !isIntelRdtMounted() {
// 2. Check if Intel RDT "resource control" filesystem is available.
// The user guarantees to mount the filesystem.
root, err := Root()
if err != nil {
return
}
@@ -226,7 +218,7 @@ func featuresInit() {
// selectively disabled or enabled by kernel command line
// (e.g., rdt=!l3cat,mba) in 4.14 and newer kernel
if flagsSet.CAT {
if _, err := os.Stat(filepath.Join(intelRdtRoot, "info", "L3")); err == nil {
if _, err := os.Stat(filepath.Join(root, "info", "L3")); err == nil {
catEnabled = true
}
}
@@ -236,15 +228,15 @@ func featuresInit() {
// depends on MBA
mbaEnabled = true
} else if flagsSet.MBA {
if _, err := os.Stat(filepath.Join(intelRdtRoot, "info", "MB")); err == nil {
if _, err := os.Stat(filepath.Join(root, "info", "MB")); err == nil {
mbaEnabled = true
}
}
if flagsSet.MBMTotal || flagsSet.MBMLocal || flagsSet.CMT {
if _, err := os.Stat(filepath.Join(intelRdtRoot, "info", "L3_MON")); err != nil {
if _, err := os.Stat(filepath.Join(root, "info", "L3_MON")); err != nil {
return
}
enabledMonFeatures, err = getMonFeatures(intelRdtRoot)
enabledMonFeatures, err = getMonFeatures(root)
if err != nil {
return
}
@@ -271,7 +263,7 @@ func findIntelRdtMountpointDir(f io.Reader) (string, error) {
return "", err
}
if len(mi) < 1 {
return "", NewNotFoundError("Intel RDT")
return "", errNotFound
}
// Check if MBA Software Controller is enabled through mount option "-o mba_MBps"
@@ -282,10 +274,16 @@ func findIntelRdtMountpointDir(f io.Reader) (string, error) {
return mi[0].Mountpoint, nil
}
// Gets the root path of Intel RDT "resource control" filesystem
func getIntelRdtRoot() (string, error) {
intelRdtRootLock.Lock()
defer intelRdtRootLock.Unlock()
// For Root() use only.
var (
intelRdtRoot string
rootMu sync.Mutex
)
// Root returns the Intel RDT "resource control" filesystem mount point.
func Root() (string, error) {
rootMu.Lock()
defer rootMu.Unlock()
if intelRdtRoot != "" {
return intelRdtRoot, nil
@@ -309,11 +307,6 @@ func getIntelRdtRoot() (string, error) {
return intelRdtRoot, nil
}
func isIntelRdtMounted() bool {
_, err := getIntelRdtRoot()
return err == nil
}
type cpuInfoFlags struct {
CAT bool // Cache Allocation Technology
MBA bool // Memory Bandwidth Allocation
@@ -366,33 +359,15 @@ func parseCpuInfoFile(path string) (cpuInfoFlags, error) {
return infoFlags, nil
}
func parseUint(s string, base, bitSize int) (uint64, error) {
value, err := strconv.ParseUint(s, base, bitSize)
if err != nil {
intValue, intErr := strconv.ParseInt(s, base, bitSize)
// 1. Handle negative values greater than MinInt64 (and)
// 2. Handle negative values lesser than MinInt64
if intErr == nil && intValue < 0 {
return 0, nil
} else if intErr != nil && intErr.(*strconv.NumError).Err == strconv.ErrRange && intValue < 0 {
return 0, nil
}
return value, err
}
return value, nil
}
// Gets a single uint64 value from the specified file.
func getIntelRdtParamUint(path, file string) (uint64, error) {
fileName := filepath.Join(path, file)
contents, err := ioutil.ReadFile(fileName)
contents, err := os.ReadFile(fileName)
if err != nil {
return 0, err
}
res, err := parseUint(string(bytes.TrimSpace(contents)), 10, 64)
res, err := fscommon.ParseUint(string(bytes.TrimSpace(contents)), 10, 64)
if err != nil {
return res, fmt.Errorf("unable to parse %q as a uint from file %q", string(contents), fileName)
}
@@ -401,7 +376,7 @@ func getIntelRdtParamUint(path, file string) (uint64, error) {
// Gets a string value from the specified file
func getIntelRdtParamString(path, file string) (string, error) {
contents, err := ioutil.ReadFile(filepath.Join(path, file))
contents, err := os.ReadFile(filepath.Join(path, file))
if err != nil {
return "", err
}
@@ -413,29 +388,17 @@ func writeFile(dir, file, data string) error {
if dir == "" {
return fmt.Errorf("no such directory for %s", file)
}
if err := ioutil.WriteFile(filepath.Join(dir, file), []byte(data+"\n"), 0o600); err != nil {
return fmt.Errorf("failed to write %v to %v: %v", data, file, err)
if err := os.WriteFile(filepath.Join(dir, file), []byte(data+"\n"), 0o600); err != nil {
return newLastCmdError(fmt.Errorf("intelrdt: unable to write %v: %w", data, err))
}
return nil
}
func getIntelRdtData(c *configs.Config, pid int) (*intelRdtData, error) {
rootPath, err := getIntelRdtRoot()
if err != nil {
return nil, err
}
return &intelRdtData{
root: rootPath,
config: c,
pid: pid,
}, nil
}
// Get the read-only L3 cache information
func getL3CacheInfo() (*L3CacheInfo, error) {
l3CacheInfo := &L3CacheInfo{}
rootPath, err := getIntelRdtRoot()
rootPath, err := Root()
if err != nil {
return l3CacheInfo, err
}
@@ -465,7 +428,7 @@ func getL3CacheInfo() (*L3CacheInfo, error) {
func getMemBwInfo() (*MemBwInfo, error) {
memBwInfo := &MemBwInfo{}
rootPath, err := getIntelRdtRoot()
rootPath, err := Root()
if err != nil {
return memBwInfo, err
}
@@ -498,7 +461,7 @@ func getMemBwInfo() (*MemBwInfo, error) {
// Get diagnostics for last filesystem operation error from file info/last_cmd_status
func getLastCmdStatus() (string, error) {
rootPath, err := getIntelRdtRoot()
rootPath, err := Root()
if err != nil {
return "", err
}
@@ -515,13 +478,13 @@ func getLastCmdStatus() (string, error) {
// WriteIntelRdtTasks writes the specified pid into the "tasks" file
func WriteIntelRdtTasks(dir string, pid int) error {
if dir == "" {
return fmt.Errorf("no such directory for %s", IntelRdtTasks)
return fmt.Errorf("no such directory for %s", intelRdtTasks)
}
// Don't attach any pid if -1 is specified as a pid
if pid != -1 {
if err := ioutil.WriteFile(filepath.Join(dir, IntelRdtTasks), []byte(strconv.Itoa(pid)), 0o600); err != nil {
return fmt.Errorf("failed to write %v to %v: %v", pid, IntelRdtTasks, err)
if err := os.WriteFile(filepath.Join(dir, intelRdtTasks), []byte(strconv.Itoa(pid)), 0o600); err != nil {
return newLastCmdError(fmt.Errorf("intelrdt: unable to add pid %d: %w", pid, err))
}
}
return nil
@@ -545,15 +508,19 @@ func IsMBAScEnabled() bool {
return mbaScEnabled
}
// Get the 'container_id' path in Intel RDT "resource control" filesystem
func GetIntelRdtPath(id string) (string, error) {
rootPath, err := getIntelRdtRoot()
// Get the path of the clos group in "resource control" filesystem that the container belongs to
func (m *intelRdtManager) getIntelRdtPath() (string, error) {
rootPath, err := Root()
if err != nil {
return "", err
}
path := filepath.Join(rootPath, id)
return path, nil
clos := m.id
if m.config.IntelRdt != nil && m.config.IntelRdt.ClosID != "" {
clos = m.config.IntelRdt.ClosID
}
return filepath.Join(rootPath, clos), nil
}
// Applies Intel RDT configuration to the process with the specified pid
@@ -562,30 +529,48 @@ func (m *intelRdtManager) Apply(pid int) (err error) {
if m.config.IntelRdt == nil {
return nil
}
d, err := getIntelRdtData(m.config, pid)
if err != nil && !IsNotFound(err) {
path, err := m.getIntelRdtPath()
if err != nil {
return err
}
m.mu.Lock()
defer m.mu.Unlock()
path, err := d.join(m.id)
if err != nil {
return err
if m.config.IntelRdt.ClosID != "" && m.config.IntelRdt.L3CacheSchema == "" && m.config.IntelRdt.MemBwSchema == "" {
// Check that the CLOS exists, i.e. it has been pre-configured to
// conform with the runtime spec
if _, err := os.Stat(path); err != nil {
return fmt.Errorf("clos dir not accessible (must be pre-created when l3CacheSchema and memBwSchema are empty): %w", err)
}
}
if err := os.MkdirAll(path, 0o755); err != nil {
return newLastCmdError(err)
}
if err := WriteIntelRdtTasks(path, pid); err != nil {
return newLastCmdError(err)
}
m.path = path
return nil
}
// Destroys the Intel RDT 'container_id' group
// Destroys the Intel RDT container-specific 'container_id' group
func (m *intelRdtManager) Destroy() error {
m.mu.Lock()
defer m.mu.Unlock()
if err := os.RemoveAll(m.GetPath()); err != nil {
return err
// Don't remove resctrl group if closid has been explicitly specified. The
// group is likely externally managed, i.e. by some other entity than us.
// There are probably other containers/tasks sharing the same group.
if m.config.IntelRdt == nil || m.config.IntelRdt.ClosID == "" {
m.mu.Lock()
defer m.mu.Unlock()
if err := os.RemoveAll(m.GetPath()); err != nil {
return err
}
m.path = ""
}
m.path = ""
return nil
}
@@ -593,7 +578,7 @@ func (m *intelRdtManager) Destroy() error {
// restore the object later
func (m *intelRdtManager) GetPath() string {
if m.path == "" {
m.path, _ = GetIntelRdtPath(m.id)
m.path, _ = m.getIntelRdtPath()
}
return m.path
}
@@ -607,9 +592,9 @@ func (m *intelRdtManager) GetStats() (*Stats, error) {
m.mu.Lock()
defer m.mu.Unlock()
stats := NewStats()
stats := newStats()
rootPath, err := getIntelRdtRoot()
rootPath, err := Root()
if err != nil {
return nil, err
}
@@ -620,7 +605,7 @@ func (m *intelRdtManager) GetStats() (*Stats, error) {
}
schemaRootStrings := strings.Split(tmpRootStrings, "\n")
// The L3 cache and memory bandwidth schemata in 'container_id' group
// The L3 cache and memory bandwidth schemata in container's clos group
containerPath := m.GetPath()
tmpStrings, err := getIntelRdtParamString(containerPath, "schemata")
if err != nil {
@@ -643,7 +628,7 @@ func (m *intelRdtManager) GetStats() (*Stats, error) {
}
}
// The L3 cache schema in 'container_id' group
// The L3 cache schema in container's clos group
for _, schema := range schemaStrings {
if strings.Contains(schema, "L3") {
stats.L3CacheSchema = strings.TrimSpace(schema)
@@ -666,7 +651,7 @@ func (m *intelRdtManager) GetStats() (*Stats, error) {
}
}
// The memory bandwidth schema in 'container_id' group
// The memory bandwidth schema in container's clos group
for _, schema := range schemaStrings {
if strings.Contains(schema, "MB") {
stats.MemBwSchema = strings.TrimSpace(schema)
@@ -736,24 +721,30 @@ func (m *intelRdtManager) Set(container *configs.Config) error {
l3CacheSchema := container.IntelRdt.L3CacheSchema
memBwSchema := container.IntelRdt.MemBwSchema
// TODO: verify that l3CacheSchema and/or memBwSchema match the
// existing schemata if ClosID has been specified. This is a more
// involved than reading the file and doing plain string comparison as
// the value written in does not necessarily match what gets read out
// (leading zeros, cache id ordering etc).
// Write a single joint schema string to schemata file
if l3CacheSchema != "" && memBwSchema != "" {
if err := writeFile(path, "schemata", l3CacheSchema+"\n"+memBwSchema); err != nil {
return NewLastCmdError(err)
return err
}
}
// Write only L3 cache schema string to schemata file
if l3CacheSchema != "" && memBwSchema == "" {
if err := writeFile(path, "schemata", l3CacheSchema); err != nil {
return NewLastCmdError(err)
return err
}
}
// Write only memory bandwidth schema string to schemata file
if l3CacheSchema == "" && memBwSchema != "" {
if err := writeFile(path, "schemata", memBwSchema); err != nil {
return NewLastCmdError(err)
return err
}
}
}
@@ -761,56 +752,10 @@ func (m *intelRdtManager) Set(container *configs.Config) error {
return nil
}
func (raw *intelRdtData) join(id string) (string, error) {
path := filepath.Join(raw.root, id)
if err := os.MkdirAll(path, 0o755); err != nil {
return "", NewLastCmdError(err)
}
if err := WriteIntelRdtTasks(path, raw.pid); err != nil {
return "", NewLastCmdError(err)
}
return path, nil
}
type NotFoundError struct {
ResourceControl string
}
func (e *NotFoundError) Error() string {
return fmt.Sprintf("mountpoint for %s not found", e.ResourceControl)
}
func NewNotFoundError(res string) error {
return &NotFoundError{
ResourceControl: res,
}
}
func IsNotFound(err error) bool {
if err == nil {
return false
}
_, ok := err.(*NotFoundError)
return ok
}
type LastCmdError struct {
LastCmdStatus string
Err error
}
func (e *LastCmdError) Error() string {
return e.Err.Error() + ", last_cmd_status: " + e.LastCmdStatus
}
func NewLastCmdError(err error) error {
lastCmdStatus, err1 := getLastCmdStatus()
func newLastCmdError(err error) error {
status, err1 := getLastCmdStatus()
if err1 == nil {
return &LastCmdError{
LastCmdStatus: lastCmdStatus,
Err: err,
}
return fmt.Errorf("%w, last_cmd_status: %s", err, status)
}
return err
}

View File

@@ -1,5 +1,3 @@
// +build linux
package intelrdt
// The flag to indicate if Intel RDT/MBM is enabled

View File

@@ -3,7 +3,6 @@ package intelrdt
import (
"bufio"
"io"
"io/ioutil"
"os"
"path/filepath"
@@ -49,7 +48,7 @@ func parseMonFeatures(reader io.Reader) (monFeatures, error) {
}
func getMonitoringStats(containerPath string, stats *Stats) error {
numaFiles, err := ioutil.ReadDir(filepath.Join(containerPath, "mon_data"))
numaFiles, err := os.ReadDir(filepath.Join(containerPath, "mon_data"))
if err != nil {
return err
}

View File

@@ -1,5 +1,3 @@
// +build linux
package intelrdt
type L3CacheInfo struct {
@@ -54,6 +52,6 @@ type Stats struct {
CMTStats *[]CMTNumaNodeStats `json:"cmt_stats,omitempty"`
}
func NewStats() *Stats {
func newStats() *Stats {
return &Stats{}
}

View File

@@ -1,13 +1,11 @@
// +build linux
package keys
import (
"errors"
"fmt"
"strconv"
"strings"
"github.com/pkg/errors"
"golang.org/x/sys/unix"
)
@@ -16,7 +14,7 @@ type KeySerial uint32
func JoinSessionKeyring(name string) (KeySerial, error) {
sessKeyID, err := unix.KeyctlJoinSessionKeyring(name)
if err != nil {
return 0, errors.Wrap(err, "create session key")
return 0, fmt.Errorf("unable to create session key: %w", err)
}
return KeySerial(sessKeyID), nil
}

View File

@@ -3,37 +3,28 @@ package logs
import (
"bufio"
"encoding/json"
"fmt"
"io"
"os"
"sync"
"github.com/pkg/errors"
"github.com/sirupsen/logrus"
)
var (
configureMutex sync.Mutex
// loggingConfigured will be set once logging has been configured via invoking `ConfigureLogging`.
// Subsequent invocations of `ConfigureLogging` would be no-op
loggingConfigured = false
)
type Config struct {
LogLevel logrus.Level
LogFormat string
LogFilePath string
LogPipeFd int
LogCaller bool
}
func ForwardLogs(logPipe io.ReadCloser) chan error {
done := make(chan error, 1)
s := bufio.NewScanner(logPipe)
logger := logrus.StandardLogger()
if logger.ReportCaller {
// Need a copy of the standard logger, but with ReportCaller
// turned off, as the logs are merely forwarded and their
// true source is not this file/line/function.
logNoCaller := *logrus.StandardLogger()
logNoCaller.ReportCaller = false
logger = &logNoCaller
}
go func() {
for s.Scan() {
processEntry(s.Bytes())
processEntry(s.Bytes(), logger)
}
if err := logPipe.Close(); err != nil {
logrus.Errorf("error closing log source: %v", err)
@@ -47,60 +38,19 @@ func ForwardLogs(logPipe io.ReadCloser) chan error {
return done
}
func processEntry(text []byte) {
func processEntry(text []byte, logger *logrus.Logger) {
if len(text) == 0 {
return
}
var jl struct {
Level string `json:"level"`
Msg string `json:"msg"`
Level logrus.Level `json:"level"`
Msg string `json:"msg"`
}
if err := json.Unmarshal(text, &jl); err != nil {
logrus.Errorf("failed to decode %q to json: %v", text, err)
return
}
lvl, err := logrus.ParseLevel(jl.Level)
if err != nil {
logrus.Errorf("failed to parse log level %q: %v", jl.Level, err)
return
}
logrus.StandardLogger().Logf(lvl, jl.Msg)
}
func ConfigureLogging(config Config) error {
configureMutex.Lock()
defer configureMutex.Unlock()
if loggingConfigured {
return errors.New("logging has already been configured")
}
logrus.SetLevel(config.LogLevel)
logrus.SetReportCaller(config.LogCaller)
// XXX: while 0 is a valid fd (usually stdin), here we assume
// that we never deliberately set LogPipeFd to 0.
if config.LogPipeFd > 0 {
logrus.SetOutput(os.NewFile(uintptr(config.LogPipeFd), "logpipe"))
} else if config.LogFilePath != "" {
f, err := os.OpenFile(config.LogFilePath, os.O_CREATE|os.O_WRONLY|os.O_APPEND|os.O_SYNC, 0o644)
if err != nil {
return err
}
logrus.SetOutput(f)
}
switch config.LogFormat {
case "text":
// retain logrus's default.
case "json":
logrus.SetFormatter(new(logrus.JSONFormatter))
default:
return fmt.Errorf("unknown log-format %q", config.LogFormat)
}
loggingConfigured = true
return nil
logger.Log(jl.Level, jl.Msg)
}

View File

@@ -1,5 +1,3 @@
// +build linux
package libcontainer
import (
@@ -23,6 +21,7 @@ const (
RootlessEUIDAttr uint16 = 27287
UidmapPathAttr uint16 = 27288
GidmapPathAttr uint16 = 27289
MountSourcesAttr uint16 = 27290
)
type Int32msg struct {

View File

@@ -0,0 +1,83 @@
package libcontainer
import (
"strconv"
"golang.org/x/sys/unix"
)
// mountError holds an error from a failed mount or unmount operation.
type mountError struct {
op string
source string
target string
procfd string
flags uintptr
data string
err error
}
// Error provides a string error representation.
func (e *mountError) Error() string {
out := e.op + " "
if e.source != "" {
out += e.source + ":" + e.target
} else {
out += e.target
}
if e.procfd != "" {
out += " (via " + e.procfd + ")"
}
if e.flags != uintptr(0) {
out += ", flags: 0x" + strconv.FormatUint(uint64(e.flags), 16)
}
if e.data != "" {
out += ", data: " + e.data
}
out += ": " + e.err.Error()
return out
}
// Unwrap returns the underlying error.
// This is a convention used by Go 1.13+ standard library.
func (e *mountError) Unwrap() error {
return e.err
}
// mount is a simple unix.Mount wrapper. If procfd is not empty, it is used
// instead of target (and the target is only used to add context to an error).
func mount(source, target, procfd, fstype string, flags uintptr, data string) error {
dst := target
if procfd != "" {
dst = procfd
}
if err := unix.Mount(source, dst, fstype, flags, data); err != nil {
return &mountError{
op: "mount",
source: source,
target: target,
procfd: procfd,
flags: flags,
data: data,
err: err,
}
}
return nil
}
// unmount is a simple unix.Unmount wrapper.
func unmount(target string, flags int) error {
err := unix.Unmount(target, flags)
if err != nil {
return &mountError{
op: "unmount",
target: target,
flags: uintptr(flags),
err: err,
}
}
return nil
}

View File

@@ -1,11 +1,9 @@
// +build linux
package libcontainer
import (
"bytes"
"fmt"
"io/ioutil"
"os"
"path/filepath"
"strconv"
@@ -75,7 +73,7 @@ func getNetworkInterfaceStats(interfaceName string) (*types.NetworkInterface, er
// Reads the specified statistics available under /sys/class/net/<EthInterface>/statistics
func readSysfsNetworkStats(ethInterface, statsFile string) (uint64, error) {
data, err := ioutil.ReadFile(filepath.Join("/sys/class/net", ethInterface, "statistics", statsFile))
data, err := os.ReadFile(filepath.Join("/sys/class/net", ethInterface, "statistics", statsFile))
if err != nil {
return 0, err
}

View File

@@ -1,11 +1,8 @@
// +build linux
package libcontainer
import (
"errors"
"fmt"
"io/ioutil"
"os"
"path/filepath"
@@ -35,7 +32,7 @@ func registerMemoryEvent(cgDir string, evName string, arg string) (<-chan struct
eventControlPath := filepath.Join(cgDir, "cgroup.event_control")
data := fmt.Sprintf("%d %d %s", eventfd.Fd(), evFile.Fd(), arg)
if err := ioutil.WriteFile(eventControlPath, []byte(data), 0o700); err != nil {
if err := os.WriteFile(eventControlPath, []byte(data), 0o700); err != nil {
eventfd.Close()
evFile.Close()
return nil, err

View File

@@ -1,13 +1,11 @@
// +build linux
package libcontainer
import (
"fmt"
"path/filepath"
"unsafe"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"github.com/pkg/errors"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
)
@@ -15,19 +13,19 @@ import (
func registerMemoryEventV2(cgDir, evName, cgEvName string) (<-chan struct{}, error) {
fd, err := unix.InotifyInit()
if err != nil {
return nil, errors.Wrap(err, "unable to init inotify")
return nil, fmt.Errorf("unable to init inotify: %w", err)
}
// watching oom kill
evFd, err := unix.InotifyAddWatch(fd, filepath.Join(cgDir, evName), unix.IN_MODIFY)
if err != nil {
unix.Close(fd)
return nil, errors.Wrap(err, "unable to add inotify watch")
return nil, fmt.Errorf("unable to add inotify watch: %w", err)
}
// Because no `unix.IN_DELETE|unix.IN_DELETE_SELF` event for cgroup file system, so watching all process exited
cgFd, err := unix.InotifyAddWatch(fd, filepath.Join(cgDir, cgEvName), unix.IN_MODIFY)
if err != nil {
unix.Close(fd)
return nil, errors.Wrap(err, "unable to add inotify watch")
return nil, fmt.Errorf("unable to add inotify watch: %w", err)
}
ch := make(chan struct{})
go func() {
@@ -53,7 +51,7 @@ func registerMemoryEventV2(cgDir, evName, cgEvName string) (<-chan struct{}, err
offset = 0
for offset <= uint32(n-unix.SizeofInotifyEvent) {
rawEvent := (*unix.InotifyEvent)(unsafe.Pointer(&buffer[offset]))
offset += unix.SizeofInotifyEvent + uint32(rawEvent.Len)
offset += unix.SizeofInotifyEvent + rawEvent.Len
if rawEvent.Mask&unix.IN_MODIFY != unix.IN_MODIFY {
continue
}

View File

@@ -1,7 +1,7 @@
package libcontainer
import (
"fmt"
"errors"
"io"
"math"
"os"
@@ -9,6 +9,8 @@ import (
"github.com/opencontainers/runc/libcontainer/configs"
)
var errInvalidProcess = errors.New("invalid process")
type processOperations interface {
wait() (*os.ProcessState, error)
signal(sig os.Signal) error
@@ -78,13 +80,22 @@ type Process struct {
ops processOperations
LogLevel string
// SubCgroupPaths specifies sub-cgroups to run the process in.
// Map keys are controller names, map values are paths (relative to
// container's top-level cgroup).
//
// If empty, the default top-level container's cgroup is used.
//
// For cgroup v2, the only key allowed is "".
SubCgroupPaths map[string]string
}
// Wait waits for the process to exit.
// Wait releases any resources associated with the Process
func (p Process) Wait() (*os.ProcessState, error) {
if p.ops == nil {
return nil, newGenericError(fmt.Errorf("invalid process"), NoProcessOps)
return nil, errInvalidProcess
}
return p.ops.wait()
}
@@ -94,7 +105,7 @@ func (p Process) Pid() (int, error) {
// math.MinInt32 is returned here, because it's invalid value
// for the kill() system call.
if p.ops == nil {
return math.MinInt32, newGenericError(fmt.Errorf("invalid process"), NoProcessOps)
return math.MinInt32, errInvalidProcess
}
return p.ops.pid(), nil
}
@@ -102,7 +113,7 @@ func (p Process) Pid() (int, error) {
// Signal sends a signal to the Process.
func (p Process) Signal(sig os.Signal) error {
if p.ops == nil {
return newGenericError(fmt.Errorf("invalid process"), NoProcessOps)
return errInvalidProcess
}
return p.ops.signal(sig)
}

View File

@@ -1,5 +1,3 @@
// +build linux
package libcontainer
import (
@@ -7,6 +5,7 @@ import (
"errors"
"fmt"
"io"
"net"
"os"
"os/exec"
"path/filepath"
@@ -25,10 +24,6 @@ import (
"golang.org/x/sys/unix"
)
// Synchronisation value for cgroup namespace setup.
// The same constant is defined in nsexec.c as "CREATECGROUPNS".
const createCgroupns = 0x80
type parentProcess interface {
// pid returns the pid for the running process.
pid() int
@@ -96,7 +91,7 @@ func (p *setnsProcess) start() (retErr error) {
p.messageSockPair.child.Close()
p.logFilePair.child.Close()
if err != nil {
return newSystemErrorWithCause(err, "starting setns process")
return fmt.Errorf("error starting setns process: %w", err)
}
waitInit := initWaiter(p.messageSockPair.parent)
@@ -104,7 +99,7 @@ func (p *setnsProcess) start() (retErr error) {
if retErr != nil {
if newOom, err := p.manager.OOMKillCount(); err == nil && newOom != oom {
// Someone in this cgroup was killed, this _might_ be us.
retErr = newSystemErrorWithCause(retErr, "possibly OOM-killed")
retErr = fmt.Errorf("%w (possibly OOM-killed)", retErr)
}
werr := <-waitInit
if werr != nil {
@@ -119,7 +114,7 @@ func (p *setnsProcess) start() (retErr error) {
if p.bootstrapData != nil {
if _, err := io.Copy(p.messageSockPair.parent, p.bootstrapData); err != nil {
return newSystemErrorWithCause(err, "copying bootstrap data to pipe")
return fmt.Errorf("error copying bootstrap data to pipe: %w", err)
}
}
err = <-waitInit
@@ -127,14 +122,14 @@ func (p *setnsProcess) start() (retErr error) {
return err
}
if err := p.execSetns(); err != nil {
return newSystemErrorWithCause(err, "executing setns process")
return fmt.Errorf("error executing setns process: %w", err)
}
if len(p.cgroupPaths) > 0 {
if err := cgroups.EnterPid(p.cgroupPaths, p.pid()); err != nil && !p.rootlessCgroups {
// On cgroup v2 + nesting + domain controllers, EnterPid may fail with EBUSY.
for _, path := range p.cgroupPaths {
if err := cgroups.WriteCgroupProc(path, p.pid()); err != nil && !p.rootlessCgroups {
// On cgroup v2 + nesting + domain controllers, WriteCgroupProc may fail with EBUSY.
// https://github.com/opencontainers/runc/issues/2356#issuecomment-621277643
// Try to join the cgroup of InitProcessPid.
if cgroups.IsCgroup2UnifiedMode() {
if cgroups.IsCgroup2UnifiedMode() && p.initProcessPid != 0 {
initProcCgroupFile := fmt.Sprintf("/proc/%d/cgroup", p.initProcessPid)
initCg, initCgErr := cgroups.ParseCgroupFile(initProcCgroupFile)
if initCgErr == nil {
@@ -148,7 +143,7 @@ func (p *setnsProcess) start() (retErr error) {
}
}
if err != nil {
return newSystemErrorWithCausef(err, "adding pid %d to cgroups", p.pid())
return fmt.Errorf("error adding pid %d to cgroups: %w", p.pid(), err)
}
}
}
@@ -157,17 +152,17 @@ func (p *setnsProcess) start() (retErr error) {
_, err := os.Stat(p.intelRdtPath)
if err == nil {
if err := intelrdt.WriteIntelRdtTasks(p.intelRdtPath, p.pid()); err != nil {
return newSystemErrorWithCausef(err, "adding pid %d to Intel RDT resource control filesystem", p.pid())
return fmt.Errorf("error adding pid %d to Intel RDT: %w", p.pid(), err)
}
}
}
// set rlimits, this has to be done here because we lose permissions
// to raise the limits once we enter a user-namespace
if err := setupRlimits(p.config.Rlimits, p.pid()); err != nil {
return newSystemErrorWithCause(err, "setting rlimits for process")
return fmt.Errorf("error setting rlimits for process: %w", err)
}
if err := utils.WriteJSON(p.messageSockPair.parent, p.config); err != nil {
return newSystemErrorWithCause(err, "writing config to pipe")
return fmt.Errorf("error writing config to pipe: %w", err)
}
ierr := parseSync(p.messageSockPair.parent, func(sync *syncT) error {
@@ -178,13 +173,49 @@ func (p *setnsProcess) start() (retErr error) {
case procHooks:
// This shouldn't happen.
panic("unexpected procHooks in setns")
case procSeccomp:
if p.config.Config.Seccomp.ListenerPath == "" {
return errors.New("listenerPath is not set")
}
seccompFd, err := recvSeccompFd(uintptr(p.pid()), uintptr(sync.Fd))
if err != nil {
return err
}
defer unix.Close(seccompFd)
bundle, annotations := utils.Annotations(p.config.Config.Labels)
containerProcessState := &specs.ContainerProcessState{
Version: specs.Version,
Fds: []string{specs.SeccompFdName},
Pid: p.cmd.Process.Pid,
Metadata: p.config.Config.Seccomp.ListenerMetadata,
State: specs.State{
Version: specs.Version,
ID: p.config.ContainerId,
Status: specs.StateRunning,
Pid: p.initProcessPid,
Bundle: bundle,
Annotations: annotations,
},
}
if err := sendContainerProcessState(p.config.Config.Seccomp.ListenerPath,
containerProcessState, seccompFd); err != nil {
return err
}
// Sync with child.
if err := writeSync(p.messageSockPair.parent, procSeccompDone); err != nil {
return err
}
return nil
default:
return newSystemError(errors.New("invalid JSON payload from child"))
return errors.New("invalid JSON payload from child")
}
})
if err := unix.Shutdown(int(p.messageSockPair.parent.Fd()), unix.SHUT_WR); err != nil {
return newSystemErrorWithCause(err, "calling shutdown on init pipe")
return &os.PathError{Op: "shutdown", Path: "(init pipe)", Err: err}
}
// Must be done after Shutdown so the child will exit and we can wait for it.
if ierr != nil {
@@ -202,16 +233,16 @@ func (p *setnsProcess) execSetns() error {
status, err := p.cmd.Process.Wait()
if err != nil {
_ = p.cmd.Wait()
return newSystemErrorWithCause(err, "waiting on setns process to finish")
return fmt.Errorf("error waiting on setns process to finish: %w", err)
}
if !status.Success() {
_ = p.cmd.Wait()
return newSystemError(&exec.ExitError{ProcessState: status})
return &exec.ExitError{ProcessState: status}
}
var pid *pid
if err := json.NewDecoder(p.messageSockPair.parent).Decode(&pid); err != nil {
_ = p.cmd.Wait()
return newSystemErrorWithCause(err, "reading pid from init pipe")
return fmt.Errorf("error reading pid from init pipe: %w", err)
}
// Clean up the zombie parent process
@@ -335,7 +366,7 @@ func (p *initProcess) start() (retErr error) {
_ = p.logFilePair.child.Close()
if err != nil {
p.process.ops = nil
return newSystemErrorWithCause(err, "starting init process command")
return fmt.Errorf("unable to start init: %w", err)
}
waitInit := initWaiter(p.messageSockPair.parent)
@@ -355,9 +386,9 @@ func (p *initProcess) start() (retErr error) {
if logrus.GetLevel() >= logrus.DebugLevel {
// Only show the original error if debug is set,
// as it is not generally very useful.
retErr = newSystemErrorWithCause(retErr, oomError)
retErr = fmt.Errorf(oomError+": %w", retErr)
} else {
retErr = newSystemError(errors.New(oomError))
retErr = errors.New(oomError)
}
}
@@ -382,15 +413,15 @@ func (p *initProcess) start() (retErr error) {
// cgroup. We don't need to worry about not doing this and not being root
// because we'd be using the rootless cgroup manager in that case.
if err := p.manager.Apply(p.pid()); err != nil {
return newSystemErrorWithCause(err, "applying cgroup configuration for process")
return fmt.Errorf("unable to apply cgroup configuration: %w", err)
}
if p.intelRdtManager != nil {
if err := p.intelRdtManager.Apply(p.pid()); err != nil {
return newSystemErrorWithCause(err, "applying Intel RDT configuration for process")
return fmt.Errorf("unable to apply Intel RDT configuration: %w", err)
}
}
if _, err := io.Copy(p.messageSockPair.parent, p.bootstrapData); err != nil {
return newSystemErrorWithCause(err, "copying bootstrap data to pipe")
return fmt.Errorf("can't copy bootstrap data to pipe: %w", err)
}
err = <-waitInit
if err != nil {
@@ -399,7 +430,7 @@ func (p *initProcess) start() (retErr error) {
childPid, err := p.getChildPid()
if err != nil {
return newSystemErrorWithCause(err, "getting the final child's pid from pipe")
return fmt.Errorf("can't get final child's PID from pipe: %w", err)
}
// Save the standard descriptor names before the container process
@@ -407,30 +438,23 @@ func (p *initProcess) start() (retErr error) {
// we won't know at checkpoint time which file descriptor to look up.
fds, err := getPipeFds(childPid)
if err != nil {
return newSystemErrorWithCausef(err, "getting pipe fds for pid %d", childPid)
return fmt.Errorf("error getting pipe fds for pid %d: %w", childPid, err)
}
p.setExternalDescriptors(fds)
// Now it's time to setup cgroup namesapce
if p.config.Config.Namespaces.Contains(configs.NEWCGROUP) && p.config.Config.Namespaces.PathOf(configs.NEWCGROUP) == "" {
if _, err := p.messageSockPair.parent.Write([]byte{createCgroupns}); err != nil {
return newSystemErrorWithCause(err, "sending synchronization value to init process")
}
}
// Wait for our first child to exit
if err := p.waitForChildExit(childPid); err != nil {
return newSystemErrorWithCause(err, "waiting for our first child to exit")
return fmt.Errorf("error waiting for our first child to exit: %w", err)
}
if err := p.createNetworkInterfaces(); err != nil {
return newSystemErrorWithCause(err, "creating network interfaces")
return fmt.Errorf("error creating network interfaces: %w", err)
}
if err := p.updateSpecState(); err != nil {
return newSystemErrorWithCause(err, "updating the spec state")
return fmt.Errorf("error updating spec state: %w", err)
}
if err := p.sendConfig(); err != nil {
return newSystemErrorWithCause(err, "sending config to init process")
return fmt.Errorf("error sending config to init process: %w", err)
}
var (
sentRun bool
@@ -439,25 +463,60 @@ func (p *initProcess) start() (retErr error) {
ierr := parseSync(p.messageSockPair.parent, func(sync *syncT) error {
switch sync.Type {
case procSeccomp:
if p.config.Config.Seccomp.ListenerPath == "" {
return errors.New("listenerPath is not set")
}
seccompFd, err := recvSeccompFd(uintptr(childPid), uintptr(sync.Fd))
if err != nil {
return err
}
defer unix.Close(seccompFd)
s, err := p.container.currentOCIState()
if err != nil {
return err
}
// initProcessStartTime hasn't been set yet.
s.Pid = p.cmd.Process.Pid
s.Status = specs.StateCreating
containerProcessState := &specs.ContainerProcessState{
Version: specs.Version,
Fds: []string{specs.SeccompFdName},
Pid: s.Pid,
Metadata: p.config.Config.Seccomp.ListenerMetadata,
State: *s,
}
if err := sendContainerProcessState(p.config.Config.Seccomp.ListenerPath,
containerProcessState, seccompFd); err != nil {
return err
}
// Sync with child.
if err := writeSync(p.messageSockPair.parent, procSeccompDone); err != nil {
return err
}
case procReady:
// set rlimits, this has to be done here because we lose permissions
// to raise the limits once we enter a user-namespace
if err := setupRlimits(p.config.Rlimits, p.pid()); err != nil {
return newSystemErrorWithCause(err, "setting rlimits for ready process")
return fmt.Errorf("error setting rlimits for ready process: %w", err)
}
// call prestart and CreateRuntime hooks
if !p.config.Config.Namespaces.Contains(configs.NEWNS) {
// Setup cgroup before the hook, so that the prestart and CreateRuntime hook could apply cgroup permissions.
if err := p.manager.Set(p.config.Config.Cgroups.Resources); err != nil {
return newSystemErrorWithCause(err, "setting cgroup config for ready process")
return fmt.Errorf("error setting cgroup config for ready process: %w", err)
}
if p.intelRdtManager != nil {
if err := p.intelRdtManager.Set(p.config.Config); err != nil {
return newSystemErrorWithCause(err, "setting Intel RDT config for ready process")
return fmt.Errorf("error setting Intel RDT config for ready process: %w", err)
}
}
if p.config.Config.Hooks != nil {
if len(p.config.Config.Hooks) != 0 {
s, err := p.container.currentOCIState()
if err != nil {
return err
@@ -493,26 +552,26 @@ func (p *initProcess) start() (retErr error) {
// procRun sync.
state, uerr := p.container.updateState(p)
if uerr != nil {
return newSystemErrorWithCause(err, "store init state")
return fmt.Errorf("unable to store init state: %w", err)
}
p.container.initProcessStartTime = state.InitProcessStartTime
// Sync with child.
if err := writeSync(p.messageSockPair.parent, procRun); err != nil {
return newSystemErrorWithCause(err, "writing syncT 'run'")
return err
}
sentRun = true
case procHooks:
// Setup cgroup before prestart hook, so that the prestart hook could apply cgroup permissions.
if err := p.manager.Set(p.config.Config.Cgroups.Resources); err != nil {
return newSystemErrorWithCause(err, "setting cgroup config for procHooks process")
return fmt.Errorf("error setting cgroup config for procHooks process: %w", err)
}
if p.intelRdtManager != nil {
if err := p.intelRdtManager.Set(p.config.Config); err != nil {
return newSystemErrorWithCause(err, "setting Intel RDT config for procHooks process")
return fmt.Errorf("error setting Intel RDT config for procHooks process: %w", err)
}
}
if p.config.Config.Hooks != nil {
if len(p.config.Config.Hooks) != 0 {
s, err := p.container.currentOCIState()
if err != nil {
return err
@@ -531,24 +590,24 @@ func (p *initProcess) start() (retErr error) {
}
// Sync with child.
if err := writeSync(p.messageSockPair.parent, procResume); err != nil {
return newSystemErrorWithCause(err, "writing syncT 'resume'")
return err
}
sentResume = true
default:
return newSystemError(errors.New("invalid JSON payload from child"))
return errors.New("invalid JSON payload from child")
}
return nil
})
if !sentRun {
return newSystemErrorWithCause(ierr, "container init")
return fmt.Errorf("error during container init: %w", ierr)
}
if p.config.Config.Namespaces.Contains(configs.NEWNS) && !sentResume {
return newSystemError(errors.New("could not synchronise after executing prestart and CreateRuntime hooks with container process"))
return errors.New("could not synchronise after executing prestart and CreateRuntime hooks with container process")
}
if err := unix.Shutdown(int(p.messageSockPair.parent.Fd()), unix.SHUT_WR); err != nil {
return newSystemErrorWithCause(err, "shutting down init pipe")
return &os.PathError{Op: "shutdown", Path: "(init pipe)", Err: err}
}
// Must be done after Shutdown so the child will exit and we can wait for it.
@@ -634,6 +693,46 @@ func (p *initProcess) forwardChildLogs() chan error {
return logs.ForwardLogs(p.logFilePair.parent)
}
func recvSeccompFd(childPid, childFd uintptr) (int, error) {
pidfd, _, errno := unix.Syscall(unix.SYS_PIDFD_OPEN, childPid, 0, 0)
if errno != 0 {
return -1, fmt.Errorf("performing SYS_PIDFD_OPEN syscall: %w", errno)
}
defer unix.Close(int(pidfd))
seccompFd, _, errno := unix.Syscall(unix.SYS_PIDFD_GETFD, pidfd, childFd, 0)
if errno != 0 {
return -1, fmt.Errorf("performing SYS_PIDFD_GETFD syscall: %w", errno)
}
return int(seccompFd), nil
}
func sendContainerProcessState(listenerPath string, state *specs.ContainerProcessState, fd int) error {
conn, err := net.Dial("unix", listenerPath)
if err != nil {
return fmt.Errorf("failed to connect with seccomp agent specified in the seccomp profile: %w", err)
}
socket, err := conn.(*net.UnixConn).File()
if err != nil {
return fmt.Errorf("cannot get seccomp socket: %w", err)
}
defer socket.Close()
b, err := json.Marshal(state)
if err != nil {
return fmt.Errorf("cannot marshall seccomp state: %w", err)
}
err = utils.SendFds(socket, b, fd)
if err != nil {
return fmt.Errorf("cannot send seccomp fd to %s: %w", listenerPath, err)
}
return nil
}
func getPipeFds(pid int) ([]string, error) {
fds := make([]string, 3)
@@ -694,7 +793,7 @@ func (p *Process) InitializeIO(rootuid, rootgid int) (i *IO, err error) {
// change ownership of the pipes in case we are in a user namespace
for _, fd := range fds {
if err := unix.Fchown(int(fd), rootuid, rootgid); err != nil {
return nil, err
return nil, &os.PathError{Op: "fchown", Path: "fd " + strconv.Itoa(int(fd)), Err: err}
}
}
return i, nil
@@ -719,7 +818,7 @@ func initWaiter(r io.Reader) chan error {
return
}
}
ch <- newSystemErrorWithCause(err, "waiting for init preliminary setup")
ch <- fmt.Errorf("waiting for init preliminary setup: %w", err)
}()
return ch

View File

@@ -1,9 +1,7 @@
// +build linux
package libcontainer
import (
"fmt"
"errors"
"os"
"os/exec"
@@ -31,7 +29,7 @@ type restoredProcess struct {
}
func (p *restoredProcess) start() error {
return newGenericError(fmt.Errorf("restored process cannot be started"), SystemError)
return errors.New("restored process cannot be started")
}
func (p *restoredProcess) pid() int {
@@ -51,7 +49,8 @@ func (p *restoredProcess) wait() (*os.ProcessState, error) {
// maybe use --exec-cmd in criu
err := p.cmd.Wait()
if err != nil {
if _, ok := err.(*exec.ExitError); !ok {
var exitErr *exec.ExitError
if !errors.As(err, &exitErr) {
return nil, err
}
}
@@ -89,7 +88,7 @@ type nonChildProcess struct {
}
func (p *nonChildProcess) start() error {
return newGenericError(fmt.Errorf("restored process cannot be started"), SystemError)
return errors.New("restored process cannot be started")
}
func (p *nonChildProcess) pid() int {
@@ -97,11 +96,11 @@ func (p *nonChildProcess) pid() int {
}
func (p *nonChildProcess) terminate() error {
return newGenericError(fmt.Errorf("restored process cannot be terminated"), SystemError)
return errors.New("restored process cannot be terminated")
}
func (p *nonChildProcess) wait() (*os.ProcessState, error) {
return nil, newGenericError(fmt.Errorf("restored process cannot be waited on"), SystemError)
return nil, errors.New("restored process cannot be waited on")
}
func (p *nonChildProcess) startTime() (uint64, error) {

View File

@@ -1,15 +1,14 @@
// +build linux
package libcontainer
import (
"errors"
"fmt"
"io"
"io/ioutil"
"os"
"os/exec"
"path"
"path/filepath"
"strconv"
"strings"
"time"
@@ -36,6 +35,7 @@ type mountConfig struct {
cgroup2Path string
rootlessCgroups bool
cgroupns bool
fd *int
}
// needsSetupDev returns true if /dev needs to be set up.
@@ -51,10 +51,14 @@ func needsSetupDev(config *configs.Config) bool {
// prepareRootfs sets up the devices, mount points, and filesystems for use
// inside a new mount namespace. It doesn't set anything as ro. You must call
// finalizeRootfs after this function to finish setting up the rootfs.
func prepareRootfs(pipe io.ReadWriter, iConfig *initConfig) (err error) {
func prepareRootfs(pipe io.ReadWriter, iConfig *initConfig, mountFds []int) (err error) {
config := iConfig.Config
if err := prepareRoot(config); err != nil {
return newSystemErrorWithCause(err, "preparing rootfs")
return fmt.Errorf("error preparing rootfs: %w", err)
}
if mountFds != nil && len(mountFds) != len(config.Mounts) {
return fmt.Errorf("malformed mountFds slice. Expected size: %v, got: %v. Slice: %v", len(config.Mounts), len(mountFds), mountFds)
}
mountConfig := &mountConfig{
@@ -65,32 +69,39 @@ func prepareRootfs(pipe io.ReadWriter, iConfig *initConfig) (err error) {
cgroupns: config.Namespaces.Contains(configs.NEWCGROUP),
}
setupDev := needsSetupDev(config)
for _, m := range config.Mounts {
for i, m := range config.Mounts {
for _, precmd := range m.PremountCmds {
if err := mountCmd(precmd); err != nil {
return newSystemErrorWithCause(err, "running premount command")
return fmt.Errorf("error running premount command: %w", err)
}
}
// Just before the loop we checked that if not empty, len(mountFds) == len(config.Mounts).
// Therefore, we can access mountFds[i] without any concerns.
if mountFds != nil && mountFds[i] != -1 {
mountConfig.fd = &mountFds[i]
}
if err := mountToRootfs(m, mountConfig); err != nil {
return newSystemErrorWithCausef(err, "mounting %q to rootfs at %q", m.Source, m.Destination)
return fmt.Errorf("error mounting %q to rootfs at %q: %w", m.Source, m.Destination, err)
}
for _, postcmd := range m.PostmountCmds {
if err := mountCmd(postcmd); err != nil {
return newSystemErrorWithCause(err, "running postmount command")
return fmt.Errorf("error running postmount command: %w", err)
}
}
}
if setupDev {
if err := createDevices(config); err != nil {
return newSystemErrorWithCause(err, "creating device nodes")
return fmt.Errorf("error creating device nodes: %w", err)
}
if err := setupPtmx(config); err != nil {
return newSystemErrorWithCause(err, "setting up ptmx")
return fmt.Errorf("error setting up ptmx: %w", err)
}
if err := setupDevSymlinks(config.Rootfs); err != nil {
return newSystemErrorWithCause(err, "setting up /dev symlinks")
return fmt.Errorf("error setting up /dev symlinks: %w", err)
}
}
@@ -112,7 +123,7 @@ func prepareRootfs(pipe io.ReadWriter, iConfig *initConfig) (err error) {
// operation not being perfectly split).
if err := unix.Chdir(config.Rootfs); err != nil {
return newSystemErrorWithCausef(err, "changing dir to %q", config.Rootfs)
return &os.PathError{Op: "chdir", Path: config.Rootfs, Err: err}
}
s := iConfig.SpecState
@@ -130,12 +141,12 @@ func prepareRootfs(pipe io.ReadWriter, iConfig *initConfig) (err error) {
err = chroot()
}
if err != nil {
return newSystemErrorWithCause(err, "jailing process inside rootfs")
return fmt.Errorf("error jailing process inside rootfs: %w", err)
}
if setupDev {
if err := reOpenDevNull(); err != nil {
return newSystemErrorWithCause(err, "reopening /dev/null inside container")
return fmt.Errorf("error reopening /dev/null inside container: %w", err)
}
}
@@ -161,7 +172,7 @@ func finalizeRootfs(config *configs.Config) (err error) {
}
if m.Device == "tmpfs" || utils.CleanPath(m.Destination) == "/dev" {
if err := remountReadonly(m); err != nil {
return newSystemErrorWithCausef(err, "remounting %q as readonly", m.Destination)
return err
}
}
}
@@ -169,7 +180,7 @@ func finalizeRootfs(config *configs.Config) (err error) {
// set rootfs ( / ) as readonly
if config.Readonlyfs {
if err := setReadonly(); err != nil {
return newSystemErrorWithCause(err, "setting rootfs as readonly")
return fmt.Errorf("error setting rootfs as readonly: %w", err)
}
}
@@ -183,14 +194,14 @@ func finalizeRootfs(config *configs.Config) (err error) {
// /tmp has to be mounted as private to allow MS_MOVE to work in all situations
func prepareTmp(topTmpDir string) (string, error) {
tmpdir, err := ioutil.TempDir(topTmpDir, "runctop")
tmpdir, err := os.MkdirTemp(topTmpDir, "runctop")
if err != nil {
return "", err
}
if err := unix.Mount(tmpdir, tmpdir, "bind", unix.MS_BIND, ""); err != nil {
if err := mount(tmpdir, tmpdir, "", "bind", unix.MS_BIND, ""); err != nil {
return "", err
}
if err := unix.Mount("", tmpdir, "", uintptr(unix.MS_PRIVATE), ""); err != nil {
if err := mount("", tmpdir, "", "", uintptr(unix.MS_PRIVATE), ""); err != nil {
return "", err
}
return tmpdir, nil
@@ -206,13 +217,18 @@ func mountCmd(cmd configs.Command) error {
command.Env = cmd.Env
command.Dir = cmd.Dir
if out, err := command.CombinedOutput(); err != nil {
return fmt.Errorf("%#v failed: %s: %v", cmd, string(out), err)
return fmt.Errorf("%#v failed: %s: %w", cmd, string(out), err)
}
return nil
}
func prepareBindMount(m *configs.Mount, rootfs string) error {
stat, err := os.Stat(m.Source)
func prepareBindMount(m *configs.Mount, rootfs string, mountFd *int) error {
source := m.Source
if mountFd != nil {
source = "/proc/self/fd/" + strconv.Itoa(*mountFd)
}
stat, err := os.Stat(source)
if err != nil {
// error out if the source of a bind mount does not exist as we will be
// unable to bind anything to it.
@@ -226,7 +242,7 @@ func prepareBindMount(m *configs.Mount, rootfs string) error {
if dest, err = securejoin.SecureJoin(rootfs, m.Destination); err != nil {
return err
}
if err := checkProcMount(rootfs, dest, m.Source); err != nil {
if err := checkProcMount(rootfs, dest, source); err != nil {
return err
}
if err := createIfNotExists(dest, stat.IsDir()); err != nil {
@@ -256,9 +272,11 @@ func mountCgroupV1(m *configs.Mount, c *mountConfig) error {
Data: "mode=755",
PropagationFlags: m.PropagationFlags,
}
if err := mountToRootfs(tmpfs, c); err != nil {
return err
}
for _, b := range binds {
if c.cgroupns {
subsystemPath := filepath.Join(c.root, b.Destination)
@@ -278,7 +296,7 @@ func mountCgroupV1(m *configs.Mount, c *mountConfig) error {
data = cgroups.CgroupNamePrefix + data
source = "systemd"
}
return unix.Mount(source, procfd, "cgroup", uintptr(flags), data)
return mount(source, b.Destination, procfd, "cgroup", uintptr(flags), data)
}); err != nil {
return err
}
@@ -310,9 +328,9 @@ func mountCgroupV2(m *configs.Mount, c *mountConfig) error {
return err
}
return utils.WithProcfd(c.root, m.Destination, func(procfd string) error {
if err := unix.Mount(m.Source, procfd, "cgroup2", uintptr(m.Flags), m.Data); err != nil {
if err := mount(m.Source, m.Destination, procfd, "cgroup2", uintptr(m.Flags), m.Data); err != nil {
// when we are in UserNS but CgroupNS is not unshared, we cannot mount cgroup2 (#2158)
if err == unix.EPERM || err == unix.EBUSY {
if errors.Is(err, unix.EPERM) || errors.Is(err, unix.EBUSY) {
src := fs2.UnifiedMountpoint
if c.cgroupns && c.cgroup2Path != "" {
// Emulate cgroupns by bind-mounting
@@ -320,8 +338,8 @@ func mountCgroupV2(m *configs.Mount, c *mountConfig) error {
// the whole /sys/fs/cgroup.
src = c.cgroup2Path
}
err = unix.Mount(src, procfd, "", uintptr(m.Flags)|unix.MS_BIND, "")
if err == unix.ENOENT && c.rootlessCgroups {
err = mount(src, m.Destination, procfd, "", uintptr(m.Flags)|unix.MS_BIND, "")
if c.rootlessCgroups && errors.Is(err, unix.ENOENT) {
err = nil
}
}
@@ -335,12 +353,12 @@ func doTmpfsCopyUp(m *configs.Mount, rootfs, mountLabel string) (Err error) {
// Set up a scratch dir for the tmpfs on the host.
tmpdir, err := prepareTmp("/tmp")
if err != nil {
return newSystemErrorWithCause(err, "tmpcopyup: failed to setup tmpdir")
return fmt.Errorf("tmpcopyup: failed to setup tmpdir: %w", err)
}
defer cleanupTmp(tmpdir)
tmpDir, err := ioutil.TempDir(tmpdir, "runctmpdir")
tmpDir, err := os.MkdirTemp(tmpdir, "runctmpdir")
if err != nil {
return newSystemErrorWithCause(err, "tmpcopyup: failed to create tmpdir")
return fmt.Errorf("tmpcopyup: failed to create tmpdir: %w", err)
}
defer os.RemoveAll(tmpDir)
@@ -348,15 +366,15 @@ func doTmpfsCopyUp(m *configs.Mount, rootfs, mountLabel string) (Err error) {
// m.Destination since we are going to mount *on the host*.
oldDest := m.Destination
m.Destination = tmpDir
err = mountPropagate(m, "/", mountLabel)
err = mountPropagate(m, "/", mountLabel, nil)
m.Destination = oldDest
if err != nil {
return err
}
defer func() {
if Err != nil {
if err := unix.Unmount(tmpDir, unix.MNT_DETACH); err != nil {
logrus.Warnf("tmpcopyup: failed to unmount tmpdir on error: %v", err)
if err := unmount(tmpDir, unix.MNT_DETACH); err != nil {
logrus.Warnf("tmpcopyup: %v", err)
}
}
}()
@@ -369,8 +387,8 @@ func doTmpfsCopyUp(m *configs.Mount, rootfs, mountLabel string) (Err error) {
return fmt.Errorf("tmpcopyup: failed to copy %s to %s (%s): %w", m.Destination, procfd, tmpDir, err)
}
// Now move the mount into the container.
if err := unix.Mount(tmpDir, procfd, "", unix.MS_MOVE, ""); err != nil {
return fmt.Errorf("tmpcopyup: failed to move mount %s to %s (%s): %w", tmpDir, procfd, m.Destination, err)
if err := mount(tmpDir, m.Destination, procfd, "", unix.MS_MOVE, ""); err != nil {
return fmt.Errorf("tmpcopyup: failed to move mount: %w", err)
}
return nil
})
@@ -379,6 +397,7 @@ func doTmpfsCopyUp(m *configs.Mount, rootfs, mountLabel string) (Err error) {
func mountToRootfs(m *configs.Mount, c *mountConfig) error {
rootfs := c.root
mountLabel := c.label
mountFd := c.fd
dest, err := securejoin.SecureJoin(rootfs, m.Destination)
if err != nil {
return err
@@ -402,12 +421,12 @@ func mountToRootfs(m *configs.Mount, c *mountConfig) error {
return err
}
// Selinux kernels do not support labeling of /proc or /sys
return mountPropagate(m, rootfs, "")
return mountPropagate(m, rootfs, "", nil)
case "mqueue":
if err := os.MkdirAll(dest, 0o755); err != nil {
return err
}
if err := mountPropagate(m, rootfs, ""); err != nil {
if err := mountPropagate(m, rootfs, "", nil); err != nil {
return err
}
return label.SetFileLabel(dest, mountLabel)
@@ -422,11 +441,13 @@ func mountToRootfs(m *configs.Mount, c *mountConfig) error {
if m.Extensions&configs.EXT_COPYUP == configs.EXT_COPYUP {
err = doTmpfsCopyUp(m, rootfs, mountLabel)
} else {
err = mountPropagate(m, rootfs, mountLabel)
err = mountPropagate(m, rootfs, mountLabel, nil)
}
if err != nil {
return err
}
if stat != nil {
if err = os.Chmod(dest, stat.Mode()); err != nil {
return err
@@ -434,17 +455,17 @@ func mountToRootfs(m *configs.Mount, c *mountConfig) error {
}
return nil
case "bind":
if err := prepareBindMount(m, rootfs); err != nil {
if err := prepareBindMount(m, rootfs, mountFd); err != nil {
return err
}
if err := mountPropagate(m, rootfs, mountLabel); err != nil {
if err := mountPropagate(m, rootfs, mountLabel, mountFd); err != nil {
return err
}
// bind mount won't change mount options, we need remount to make mount options effective.
// first check that we have non-default options required before attempting a remount
if m.Flags&^(unix.MS_REC|unix.MS_REMOUNT|unix.MS_BIND) != 0 {
// only remount if unique mount options are set
if err := remount(m, rootfs); err != nil {
if err := remount(m, rootfs, mountFd); err != nil {
return err
}
}
@@ -470,7 +491,10 @@ func mountToRootfs(m *configs.Mount, c *mountConfig) error {
if err := os.MkdirAll(dest, 0o755); err != nil {
return err
}
return mountPropagate(m, rootfs, mountLabel)
return mountPropagate(m, rootfs, mountLabel, mountFd)
}
if err := setRecAttr(m, rootfs); err != nil {
return err
}
return nil
}
@@ -570,7 +594,7 @@ func checkProcMount(rootfs, dest, source string) error {
func isProc(path string) (bool, error) {
var s unix.Statfs_t
if err := unix.Statfs(path, &s); err != nil {
return false, err
return false, &os.PathError{Op: "statfs", Path: path, Err: err}
}
return s.Type == unix.PROC_SUPER_MAGIC, nil
}
@@ -593,7 +617,7 @@ func setupDevSymlinks(rootfs string) error {
dst = filepath.Join(rootfs, link[1])
)
if err := os.Symlink(src, dst); err != nil && !os.IsExist(err) {
return fmt.Errorf("symlink %s %s %s", src, dst, err)
return err
}
}
return nil
@@ -607,20 +631,24 @@ func reOpenDevNull() error {
var stat, devNullStat unix.Stat_t
file, err := os.OpenFile("/dev/null", os.O_RDWR, 0)
if err != nil {
return fmt.Errorf("Failed to open /dev/null - %s", err)
return err
}
defer file.Close() //nolint: errcheck
if err := unix.Fstat(int(file.Fd()), &devNullStat); err != nil {
return err
return &os.PathError{Op: "fstat", Path: file.Name(), Err: err}
}
for fd := 0; fd < 3; fd++ {
if err := unix.Fstat(fd, &stat); err != nil {
return err
return &os.PathError{Op: "fstat", Path: "fd " + strconv.Itoa(fd), Err: err}
}
if stat.Rdev == devNullStat.Rdev {
// Close and re-open the fd.
if err := unix.Dup3(int(file.Fd()), fd, 0); err != nil {
return err
return &os.PathError{
Op: "dup3",
Path: "fd " + strconv.Itoa(int(file.Fd())),
Err: err,
}
}
}
}
@@ -658,7 +686,7 @@ func bindMountDeviceNode(rootfs, dest string, node *devices.Device) error {
_ = f.Close()
}
return utils.WithProcfd(rootfs, dest, func(procfd string) error {
return unix.Mount(node.Path, procfd, "bind", unix.MS_BIND, "")
return mount(node.Path, dest, procfd, "bind", unix.MS_BIND, "")
})
}
@@ -679,9 +707,9 @@ func createDeviceNode(rootfs string, node *devices.Device, bind bool) error {
return bindMountDeviceNode(rootfs, dest, node)
}
if err := mknodDevice(dest, node); err != nil {
if os.IsExist(err) {
if errors.Is(err, os.ErrExist) {
return nil
} else if os.IsPermission(err) {
} else if errors.Is(err, os.ErrPermission) {
return bindMountDeviceNode(rootfs, dest, node)
}
return err
@@ -706,9 +734,9 @@ func mknodDevice(dest string, node *devices.Device) error {
return err
}
if err := unix.Mknod(dest, uint32(fileMode), int(dev)); err != nil {
return err
return &os.PathError{Op: "mknod", Path: dest, Err: err}
}
return unix.Chown(dest, int(node.Uid), int(node.Gid))
return os.Chown(dest, int(node.Uid), int(node.Gid))
}
// Get the parent mount point of directory passed in as argument. Also return
@@ -755,7 +783,7 @@ func rootfsParentMountPrivate(rootfs string) error {
// shared. Secondly when we bind mount rootfs it will propagate to
// parent namespace and we don't want that to happen.
if sharedMount {
return unix.Mount("", parentMount, "", unix.MS_PRIVATE, "")
return mount("", parentMount, "", "", unix.MS_PRIVATE, "")
}
return nil
@@ -766,7 +794,7 @@ func prepareRoot(config *configs.Config) error {
if config.RootPropagation != 0 {
flag = config.RootPropagation
}
if err := unix.Mount("", "/", "", uintptr(flag), ""); err != nil {
if err := mount("", "/", "", "", uintptr(flag), ""); err != nil {
return err
}
@@ -777,13 +805,13 @@ func prepareRoot(config *configs.Config) error {
return err
}
return unix.Mount(config.Rootfs, config.Rootfs, "bind", unix.MS_BIND|unix.MS_REC, "")
return mount(config.Rootfs, config.Rootfs, "", "bind", unix.MS_BIND|unix.MS_REC, "")
}
func setReadonly() error {
flags := uintptr(unix.MS_BIND | unix.MS_REMOUNT | unix.MS_RDONLY)
err := unix.Mount("", "/", "", flags, "")
err := mount("", "/", "", "", flags, "")
if err == nil {
return nil
}
@@ -792,7 +820,7 @@ func setReadonly() error {
return &os.PathError{Op: "statfs", Path: "/", Err: err}
}
flags |= uintptr(s.Flags)
return unix.Mount("", "/", "", flags, "")
return mount("", "/", "", "", flags, "")
}
func setupPtmx(config *configs.Config) error {
@@ -801,7 +829,7 @@ func setupPtmx(config *configs.Config) error {
return err
}
if err := os.Symlink("pts/ptmx", ptmx); err != nil {
return fmt.Errorf("symlink dev ptmx %s", err)
return err
}
return nil
}
@@ -817,23 +845,23 @@ func pivotRoot(rootfs string) error {
oldroot, err := unix.Open("/", unix.O_DIRECTORY|unix.O_RDONLY, 0)
if err != nil {
return err
return &os.PathError{Op: "open", Path: "/", Err: err}
}
defer unix.Close(oldroot) //nolint: errcheck
newroot, err := unix.Open(rootfs, unix.O_DIRECTORY|unix.O_RDONLY, 0)
if err != nil {
return err
return &os.PathError{Op: "open", Path: rootfs, Err: err}
}
defer unix.Close(newroot) //nolint: errcheck
// Change to the new root so that the pivot_root actually acts on it.
if err := unix.Fchdir(newroot); err != nil {
return err
return &os.PathError{Op: "fchdir", Path: "fd " + strconv.Itoa(newroot), Err: err}
}
if err := unix.PivotRoot(".", "."); err != nil {
return fmt.Errorf("pivot_root %s", err)
return &os.PathError{Op: "pivot_root", Path: ".", Err: err}
}
// Currently our "." is oldroot (according to the current kernel code).
@@ -842,7 +870,7 @@ func pivotRoot(rootfs string) error {
// pivot_root(2).
if err := unix.Fchdir(oldroot); err != nil {
return err
return &os.PathError{Op: "fchdir", Path: "fd " + strconv.Itoa(oldroot), Err: err}
}
// Make oldroot rslave to make sure our unmounts don't propagate to the
@@ -850,17 +878,17 @@ func pivotRoot(rootfs string) error {
// known to cause issues due to races where we still have a reference to a
// mount while a process in the host namespace are trying to operate on
// something they think has no mounts (devicemapper in particular).
if err := unix.Mount("", ".", "", unix.MS_SLAVE|unix.MS_REC, ""); err != nil {
if err := mount("", ".", "", "", unix.MS_SLAVE|unix.MS_REC, ""); err != nil {
return err
}
// Preform the unmount. MNT_DETACH allows us to unmount /proc/self/cwd.
if err := unix.Unmount(".", unix.MNT_DETACH); err != nil {
// Perform the unmount. MNT_DETACH allows us to unmount /proc/self/cwd.
if err := unmount(".", unix.MNT_DETACH); err != nil {
return err
}
// Switch back to our shiny new root.
if err := unix.Chdir("/"); err != nil {
return fmt.Errorf("chdir / %s", err)
return &os.PathError{Op: "chdir", Path: "/", Err: err}
}
return nil
}
@@ -899,8 +927,8 @@ func msMoveRoot(rootfs string) error {
for _, info := range mountinfos {
p := info.Mountpoint
// Be sure umount events are not propagated to the host.
if err := unix.Mount("", p, "", unix.MS_SLAVE|unix.MS_REC, ""); err != nil {
if err == unix.ENOENT {
if err := mount("", p, "", "", unix.MS_SLAVE|unix.MS_REC, ""); err != nil {
if errors.Is(err, unix.ENOENT) {
// If the mountpoint doesn't exist that means that we've
// already blasted away some parent directory of the mountpoint
// and so we don't care about this error.
@@ -908,13 +936,13 @@ func msMoveRoot(rootfs string) error {
}
return err
}
if err := unix.Unmount(p, unix.MNT_DETACH); err != nil {
if err != unix.EINVAL && err != unix.EPERM {
if err := unmount(p, unix.MNT_DETACH); err != nil {
if !errors.Is(err, unix.EINVAL) && !errors.Is(err, unix.EPERM) {
return err
} else {
// If we have not privileges for umounting (e.g. rootless), then
// cover the path.
if err := unix.Mount("tmpfs", p, "tmpfs", 0, ""); err != nil {
if err := mount("tmpfs", p, "", "tmpfs", 0, ""); err != nil {
return err
}
}
@@ -922,7 +950,7 @@ func msMoveRoot(rootfs string) error {
}
// Move the rootfs on top of "/" in our mount namespace.
if err := unix.Mount(rootfs, "/", "", unix.MS_MOVE, ""); err != nil {
if err := mount(rootfs, "/", "", "", unix.MS_MOVE, ""); err != nil {
return err
}
return chroot()
@@ -930,9 +958,12 @@ func msMoveRoot(rootfs string) error {
func chroot() error {
if err := unix.Chroot("."); err != nil {
return err
return &os.PathError{Op: "chroot", Path: ".", Err: err}
}
return unix.Chdir("/")
if err := unix.Chdir("/"); err != nil {
return &os.PathError{Op: "chdir", Path: "/", Err: err}
}
return nil
}
// createIfNotExists creates a file or a directory only if it does not already exist.
@@ -957,11 +988,11 @@ func createIfNotExists(path string, isDir bool) error {
// readonlyPath will make a path read only.
func readonlyPath(path string) error {
if err := unix.Mount(path, path, "", unix.MS_BIND|unix.MS_REC, ""); err != nil {
if os.IsNotExist(err) {
if err := mount(path, path, "", "", unix.MS_BIND|unix.MS_REC, ""); err != nil {
if errors.Is(err, os.ErrNotExist) {
return nil
}
return &os.PathError{Op: "bind-mount", Path: path, Err: err}
return err
}
var s unix.Statfs_t
@@ -970,8 +1001,8 @@ func readonlyPath(path string) error {
}
flags := uintptr(s.Flags) & (unix.MS_NOSUID | unix.MS_NODEV | unix.MS_NOEXEC)
if err := unix.Mount(path, path, "", flags|unix.MS_BIND|unix.MS_REMOUNT|unix.MS_RDONLY, ""); err != nil {
return &os.PathError{Op: "bind-mount-ro", Path: path, Err: err}
if err := mount(path, path, "", "", flags|unix.MS_BIND|unix.MS_REMOUNT|unix.MS_RDONLY, ""); err != nil {
return err
}
return nil
@@ -991,14 +1022,12 @@ func remountReadonly(m *configs.Mount) error {
// nosuid, etc.). So, let's use that case so that we can do
// this re-mount without failing in a userns.
flags |= unix.MS_REMOUNT | unix.MS_BIND | unix.MS_RDONLY
if err := unix.Mount("", dest, "", uintptr(flags), ""); err != nil {
switch err {
case unix.EBUSY:
if err := mount("", dest, "", "", uintptr(flags), ""); err != nil {
if errors.Is(err, unix.EBUSY) {
time.Sleep(100 * time.Millisecond)
continue
default:
return err
}
return err
}
return nil
}
@@ -1011,9 +1040,9 @@ func remountReadonly(m *configs.Mount) error {
// For files, maskPath bind mounts /dev/null over the top of the specified path.
// For directories, maskPath mounts read-only tmpfs over the top of the specified path.
func maskPath(path string, mountLabel string) error {
if err := unix.Mount("/dev/null", path, "", unix.MS_BIND, ""); err != nil && !os.IsNotExist(err) {
if err == unix.ENOTDIR {
return unix.Mount("tmpfs", path, "tmpfs", unix.MS_RDONLY, label.FormatMountLabel("", mountLabel))
if err := mount("/dev/null", path, "", "", unix.MS_BIND, ""); err != nil && !errors.Is(err, os.ErrNotExist) {
if errors.Is(err, unix.ENOTDIR) {
return mount("tmpfs", path, "", "tmpfs", unix.MS_RDONLY, label.FormatMountLabel("", mountLabel))
}
return err
}
@@ -1024,33 +1053,38 @@ func maskPath(path string, mountLabel string) error {
// For e.g. net.ipv4.ip_forward translated to /proc/sys/net/ipv4/ip_forward.
func writeSystemProperty(key, value string) error {
keyPath := strings.Replace(key, ".", "/", -1)
return ioutil.WriteFile(path.Join("/proc/sys", keyPath), []byte(value), 0o644)
return os.WriteFile(path.Join("/proc/sys", keyPath), []byte(value), 0o644)
}
func remount(m *configs.Mount, rootfs string) error {
func remount(m *configs.Mount, rootfs string, mountFd *int) error {
source := m.Source
if mountFd != nil {
source = "/proc/self/fd/" + strconv.Itoa(*mountFd)
}
return utils.WithProcfd(rootfs, m.Destination, func(procfd string) error {
flags := uintptr(m.Flags | unix.MS_REMOUNT)
err := unix.Mount(m.Source, procfd, m.Device, flags, "")
err := mount(source, m.Destination, procfd, m.Device, flags, "")
if err == nil {
return nil
}
// Check if the source has ro flag...
var s unix.Statfs_t
if err := unix.Statfs(m.Source, &s); err != nil {
return &os.PathError{Op: "statfs", Path: m.Source, Err: err}
if err := unix.Statfs(source, &s); err != nil {
return &os.PathError{Op: "statfs", Path: source, Err: err}
}
if s.Flags&unix.MS_RDONLY != unix.MS_RDONLY {
return err
}
// ... and retry the mount with ro flag set.
flags |= unix.MS_RDONLY
return unix.Mount(m.Source, procfd, m.Device, flags, "")
return mount(source, m.Destination, procfd, m.Device, flags, "")
})
}
// Do the mount operation followed by additional mounts required to take care
// of propagation flags. This will always be scoped inside the container rootfs.
func mountPropagate(m *configs.Mount, rootfs string, mountLabel string) error {
func mountPropagate(m *configs.Mount, rootfs string, mountLabel string, mountFd *int) error {
var (
data = label.FormatMountLabel(m.Data, mountLabel)
flags = m.Flags
@@ -1067,17 +1101,22 @@ func mountPropagate(m *configs.Mount, rootfs string, mountLabel string) error {
// mutating underneath us, we verify that we are actually going to mount
// inside the container with WithProcfd() -- mounting through a procfd
// mounts on the target.
source := m.Source
if mountFd != nil {
source = "/proc/self/fd/" + strconv.Itoa(*mountFd)
}
if err := utils.WithProcfd(rootfs, m.Destination, func(procfd string) error {
return unix.Mount(m.Source, procfd, m.Device, uintptr(flags), data)
return mount(source, m.Destination, procfd, m.Device, uintptr(flags), data)
}); err != nil {
return fmt.Errorf("mount through procfd: %w", err)
return err
}
// We have to apply mount propagation flags in a separate WithProcfd() call
// because the previous call invalidates the passed procfd -- the mount
// target needs to be re-opened.
if err := utils.WithProcfd(rootfs, m.Destination, func(procfd string) error {
for _, pflag := range m.PropagationFlags {
if err := unix.Mount("", procfd, "", uintptr(pflag), ""); err != nil {
if err := mount("", m.Destination, procfd, "", uintptr(pflag), ""); err != nil {
return err
}
}
@@ -1087,3 +1126,12 @@ func mountPropagate(m *configs.Mount, rootfs string, mountLabel string) error {
}
return nil
}
func setRecAttr(m *configs.Mount, rootfs string) error {
if m.RecAttr == nil {
return nil
}
return utils.WithProcfd(rootfs, m.Destination, func(procfd string) error {
return unix.MountSetattr(-1, procfd, unix.AT_RECURSIVE, m.RecAttr)
})
}

View File

@@ -2,6 +2,7 @@ package seccomp
import (
"fmt"
"sort"
"github.com/opencontainers/runc/libcontainer/configs"
)
@@ -16,13 +17,36 @@ var operators = map[string]configs.Operator{
"SCMP_CMP_MASKED_EQ": configs.MaskEqualTo,
}
// KnownOperators returns the list of the known operations.
// Used by `runc features`.
func KnownOperators() []string {
var res []string
for k := range operators {
res = append(res, k)
}
sort.Strings(res)
return res
}
var actions = map[string]configs.Action{
"SCMP_ACT_KILL": configs.Kill,
"SCMP_ACT_ERRNO": configs.Errno,
"SCMP_ACT_TRAP": configs.Trap,
"SCMP_ACT_ALLOW": configs.Allow,
"SCMP_ACT_TRACE": configs.Trace,
"SCMP_ACT_LOG": configs.Log,
"SCMP_ACT_KILL": configs.Kill,
"SCMP_ACT_ERRNO": configs.Errno,
"SCMP_ACT_TRAP": configs.Trap,
"SCMP_ACT_ALLOW": configs.Allow,
"SCMP_ACT_TRACE": configs.Trace,
"SCMP_ACT_LOG": configs.Log,
"SCMP_ACT_NOTIFY": configs.Notify,
}
// KnownActions returns the list of the known actions.
// Used by `runc features`.
func KnownActions() []string {
var res []string
for k := range actions {
res = append(res, k)
}
sort.Strings(res)
return res
}
var archs = map[string]string{
@@ -44,6 +68,17 @@ var archs = map[string]string{
"SCMP_ARCH_S390X": "s390x",
}
// KnownArchs returns the list of the known archs.
// Used by `runc features`.
func KnownArchs() []string {
var res []string
for k := range archs {
res = append(res, k)
}
sort.Strings(res)
return res
}
// ConvertStringToOperator converts a string into a Seccomp comparison operator.
// Comparison operators use the names they are assigned by Libseccomp's header.
// Attempting to convert a string that is not a valid operator results in an
@@ -56,9 +91,7 @@ func ConvertStringToOperator(in string) (configs.Operator, error) {
}
// ConvertStringToAction converts a string into a Seccomp rule match action.
// Actions use the names they are assigned in Libseccomp's header, though some
// (notable, SCMP_ACT_TRACE) are not available in this implementation and will
// return errors.
// Actions use the names they are assigned in Libseccomp's header.
// Attempting to convert a string that is not a valid action results in an
// error.
func ConvertStringToAction(in string) (configs.Action, error) {

View File

@@ -1,23 +1,25 @@
// +build linux,cgo,seccomp
//go:build cgo && seccomp
// +build cgo,seccomp
package patchbpf
import (
"bytes"
"encoding/binary"
"errors"
"fmt"
"io"
"os"
"runtime"
"unsafe"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/utils"
"github.com/pkg/errors"
libseccomp "github.com/seccomp/libseccomp-golang"
"github.com/sirupsen/logrus"
"golang.org/x/net/bpf"
"golang.org/x/sys/unix"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/utils"
)
// #cgo pkg-config: libseccomp
@@ -41,6 +43,11 @@ const uintptr_t C_SET_MODE_FILTER = SECCOMP_SET_MODE_FILTER;
#endif
const uintptr_t C_FILTER_FLAG_LOG = SECCOMP_FILTER_FLAG_LOG;
#ifndef SECCOMP_FILTER_FLAG_NEW_LISTENER
# define SECCOMP_FILTER_FLAG_NEW_LISTENER (1UL << 3)
#endif
const uintptr_t C_FILTER_FLAG_NEW_LISTENER = SECCOMP_FILTER_FLAG_NEW_LISTENER;
// We use the AUDIT_ARCH_* values because those are the ones used by the kernel
// and SCMP_ARCH_* sometimes has fake values (such as SCMP_ARCH_X32). But we
// use <seccomp.h> so we get libseccomp's fallback definitions of AUDIT_ARCH_*.
@@ -85,17 +92,16 @@ loop:
// seccomp_export_bpf outputs the program in *host* endian-ness.
var insn unix.SockFilter
if err := binary.Read(rdr, utils.NativeEndian, &insn); err != nil {
switch err {
case io.EOF:
if errors.Is(err, io.EOF) {
// Parsing complete.
break loop
case io.ErrUnexpectedEOF:
// Parsing stopped mid-instruction.
return nil, errors.Wrap(err, "program parsing halted mid-instruction")
default:
// All other errors.
return nil, errors.Wrap(err, "parsing instructions")
}
if errors.Is(err, io.ErrUnexpectedEOF) {
// Parsing stopped mid-instruction.
return nil, fmt.Errorf("program parsing halted mid-instruction: %w", err)
}
// All other errors.
return nil, fmt.Errorf("error parsing instructions: %w", err)
}
program = append(program, bpf.RawInstruction{
Op: insn.Code,
@@ -110,7 +116,7 @@ loop:
func disassembleFilter(filter *libseccomp.ScmpFilter) ([]bpf.Instruction, error) {
rdr, wtr, err := os.Pipe()
if err != nil {
return nil, errors.Wrap(err, "creating scratch pipe")
return nil, fmt.Errorf("error creating scratch pipe: %w", err)
}
defer wtr.Close()
defer rdr.Close()
@@ -124,23 +130,23 @@ func disassembleFilter(filter *libseccomp.ScmpFilter) ([]bpf.Instruction, error)
}()
if err := filter.ExportBPF(wtr); err != nil {
return nil, errors.Wrap(err, "exporting BPF")
return nil, fmt.Errorf("error exporting BPF: %w", err)
}
// Close so that the reader actually gets EOF.
_ = wtr.Close()
if copyErr := <-errChan; copyErr != nil {
return nil, errors.Wrap(copyErr, "reading from ExportBPF pipe")
return nil, fmt.Errorf("error reading from ExportBPF pipe: %w", copyErr)
}
// Parse the instructions.
rawProgram, err := parseProgram(readerBuffer)
if err != nil {
return nil, errors.Wrap(err, "parsing generated BPF filter")
return nil, fmt.Errorf("parsing generated BPF filter: %w", err)
}
program, ok := bpf.Disassemble(rawProgram)
if !ok {
return nil, errors.Errorf("could not disassemble entire BPF filter")
return nil, errors.New("could not disassemble entire BPF filter")
}
return program, nil
}
@@ -155,7 +161,7 @@ func archToNative(arch libseccomp.ScmpArch) (nativeArch, error) {
// Convert to actual native architecture.
arch, err := libseccomp.GetNativeArch()
if err != nil {
return invalidArch, errors.Wrap(err, "get native arch")
return invalidArch, fmt.Errorf("unable to get native arch: %w", err)
}
return archToNative(arch)
case libseccomp.ArchX86:
@@ -192,7 +198,7 @@ func archToNative(arch libseccomp.ScmpArch) (nativeArch, error) {
case libseccomp.ArchS390X:
return nativeArch(C.C_AUDIT_ARCH_S390X), nil
default:
return invalidArch, errors.Errorf("unknown architecture: %v", arch)
return invalidArch, fmt.Errorf("unknown architecture: %v", arch)
}
}
@@ -209,7 +215,7 @@ func findLastSyscalls(config *configs.Seccomp) (lastSyscallMap, error) {
for _, ociArch := range config.Architectures {
arch, err := libseccomp.GetArchFromString(ociArch)
if err != nil {
return nil, errors.Wrap(err, "validating seccomp architecture")
return nil, fmt.Errorf("unable to validate seccomp architecture: %w", err)
}
// Map native architecture to a real architecture value to avoid
@@ -217,7 +223,7 @@ func findLastSyscalls(config *configs.Seccomp) (lastSyscallMap, error) {
if arch == libseccomp.ArchNative {
nativeArch, err := libseccomp.GetNativeArch()
if err != nil {
return nil, errors.Wrap(err, "get native arch")
return nil, fmt.Errorf("unable to get native architecture: %w", err)
}
arch = nativeArch
}
@@ -225,7 +231,7 @@ func findLastSyscalls(config *configs.Seccomp) (lastSyscallMap, error) {
// Figure out native architecture representation of the architecture.
nativeArch, err := archToNative(arch)
if err != nil {
return nil, errors.Wrapf(err, "cannot map architecture %v to AUDIT_ARCH_ constant", arch)
return nil, fmt.Errorf("cannot map architecture %v to AUDIT_ARCH_ constant: %w", arch, err)
}
if _, ok := lastSyscalls[nativeArch]; !ok {
@@ -370,7 +376,7 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error)
},
}, sectionTail...)
default:
return nil, errors.Errorf("unknown amd64 native architecture %#x", scmpArch)
return nil, fmt.Errorf("unknown amd64 native architecture %#x", scmpArch)
}
}
@@ -378,16 +384,16 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error)
case 2:
// x32 and x86_64 are a unique case, we can't handle any others.
if uint32(nativeArch) != uint32(C.C_AUDIT_ARCH_X86_64) {
return nil, errors.Errorf("unknown architecture overlap on native arch %#x", nativeArch)
return nil, fmt.Errorf("unknown architecture overlap on native arch %#x", nativeArch)
}
x32sysno, ok := maxSyscalls[libseccomp.ArchX32]
if !ok {
return nil, errors.Errorf("missing %v in overlapping x86_64 arch: %v", libseccomp.ArchX32, maxSyscalls)
return nil, fmt.Errorf("missing %v in overlapping x86_64 arch: %v", libseccomp.ArchX32, maxSyscalls)
}
x86sysno, ok := maxSyscalls[libseccomp.ArchAMD64]
if !ok {
return nil, errors.Errorf("missing %v in overlapping x86_64 arch: %v", libseccomp.ArchAMD64, maxSyscalls)
return nil, fmt.Errorf("missing %v in overlapping x86_64 arch: %v", libseccomp.ArchAMD64, maxSyscalls)
}
// The x32 ABI indicates that a syscall is being made by an x32
@@ -448,7 +454,7 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error)
}...)
}
default:
return nil, errors.Errorf("invalid number of architecture overlaps: %v", len(maxSyscalls))
return nil, fmt.Errorf("invalid number of architecture overlaps: %v", len(maxSyscalls))
}
// Prepend this section to the tail.
@@ -517,7 +523,7 @@ func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error)
func assemble(program []bpf.Instruction) ([]unix.SockFilter, error) {
rawProgram, err := bpf.Assemble(program)
if err != nil {
return nil, errors.Wrap(err, "assembling program")
return nil, fmt.Errorf("error assembling program: %w", err)
}
// Convert to []unix.SockFilter for unix.SockFilter.
@@ -547,11 +553,11 @@ func generatePatch(config *configs.Seccomp) ([]bpf.Instruction, error) {
lastSyscalls, err := findLastSyscalls(config)
if err != nil {
return nil, errors.Wrap(err, "finding last syscalls for -ENOSYS stub")
return nil, fmt.Errorf("error finding last syscalls for -ENOSYS stub: %w", err)
}
stubProgram, err := generateEnosysStub(lastSyscalls)
if err != nil {
return nil, errors.Wrap(err, "generating -ENOSYS stub")
return nil, fmt.Errorf("error generating -ENOSYS stub: %w", err)
}
return stubProgram, nil
}
@@ -559,12 +565,12 @@ func generatePatch(config *configs.Seccomp) ([]bpf.Instruction, error) {
func enosysPatchFilter(config *configs.Seccomp, filter *libseccomp.ScmpFilter) ([]unix.SockFilter, error) {
program, err := disassembleFilter(filter)
if err != nil {
return nil, errors.Wrap(err, "disassembling original filter")
return nil, fmt.Errorf("error disassembling original filter: %w", err)
}
patch, err := generatePatch(config)
if err != nil {
return nil, errors.Wrap(err, "generating patch for filter")
return nil, fmt.Errorf("error generating patch for filter: %w", err)
}
fullProgram := append(patch, program...)
@@ -576,49 +582,61 @@ func enosysPatchFilter(config *configs.Seccomp, filter *libseccomp.ScmpFilter) (
fprog, err := assemble(fullProgram)
if err != nil {
return nil, errors.Wrap(err, "assembling modified filter")
return nil, fmt.Errorf("error assembling modified filter: %w", err)
}
return fprog, nil
}
func filterFlags(filter *libseccomp.ScmpFilter) (flags uint, noNewPrivs bool, err error) {
func filterFlags(config *configs.Seccomp, filter *libseccomp.ScmpFilter) (flags uint, noNewPrivs bool, err error) {
// Ignore the error since pre-2.4 libseccomp is treated as API level 0.
apiLevel, _ := libseccomp.GetApi()
apiLevel, _ := libseccomp.GetAPI()
noNewPrivs, err = filter.GetNoNewPrivsBit()
if err != nil {
return 0, false, errors.Wrap(err, "fetch no_new_privs filter bit")
return 0, false, fmt.Errorf("unable to fetch no_new_privs filter bit: %w", err)
}
if apiLevel >= 3 {
if logBit, err := filter.GetLogBit(); err != nil {
return 0, false, errors.Wrap(err, "fetch SECCOMP_FILTER_FLAG_LOG bit")
return 0, false, fmt.Errorf("unable to fetch SECCOMP_FILTER_FLAG_LOG bit: %w", err)
} else if logBit {
flags |= uint(C.C_FILTER_FLAG_LOG)
}
}
// TODO: Support seccomp flags not yet added to libseccomp-golang...
for _, call := range config.Syscalls {
if call.Action == configs.Notify {
flags |= uint(C.C_FILTER_FLAG_NEW_LISTENER)
break
}
}
return
}
func sysSeccompSetFilter(flags uint, filter []unix.SockFilter) (err error) {
func sysSeccompSetFilter(flags uint, filter []unix.SockFilter) (fd int, err error) {
fprog := unix.SockFprog{
Len: uint16(len(filter)),
Filter: &filter[0],
}
fd = -1 // only return a valid fd when C_FILTER_FLAG_NEW_LISTENER is set
// If no seccomp flags were requested we can use the old-school prctl(2).
if flags == 0 {
err = unix.Prctl(unix.PR_SET_SECCOMP,
unix.SECCOMP_MODE_FILTER,
uintptr(unsafe.Pointer(&fprog)), 0, 0)
} else {
_, _, errno := unix.RawSyscall(unix.SYS_SECCOMP,
fdptr, _, errno := unix.RawSyscall(unix.SYS_SECCOMP,
uintptr(C.C_SET_MODE_FILTER),
uintptr(flags), uintptr(unsafe.Pointer(&fprog)))
if errno != 0 {
err = errno
}
if flags&uint(C.C_FILTER_FLAG_NEW_LISTENER) != 0 {
fd = int(fdptr)
}
}
runtime.KeepAlive(filter)
runtime.KeepAlive(fprog)
@@ -630,17 +648,17 @@ func sysSeccompSetFilter(flags uint, filter []unix.SockFilter) (err error) {
// patches said filter to handle -ENOSYS in a much nicer manner than the
// default libseccomp default action behaviour, and loads the patched filter
// into the kernel for the current process.
func PatchAndLoad(config *configs.Seccomp, filter *libseccomp.ScmpFilter) error {
func PatchAndLoad(config *configs.Seccomp, filter *libseccomp.ScmpFilter) (int, error) {
// Generate a patched filter.
fprog, err := enosysPatchFilter(config, filter)
if err != nil {
return errors.Wrap(err, "patching filter")
return -1, fmt.Errorf("error patching filter: %w", err)
}
// Get the set of libseccomp flags set.
seccompFlags, noNewPrivs, err := filterFlags(filter)
seccompFlags, noNewPrivs, err := filterFlags(config, filter)
if err != nil {
return errors.Wrap(err, "fetch seccomp filter flags")
return -1, fmt.Errorf("unable to fetch seccomp filter flags: %w", err)
}
// Set no_new_privs if it was requested, though in runc we handle
@@ -648,13 +666,15 @@ func PatchAndLoad(config *configs.Seccomp, filter *libseccomp.ScmpFilter) error
if noNewPrivs {
logrus.Warnf("potentially misconfigured filter -- setting no_new_privs in seccomp path")
if err := unix.Prctl(unix.PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil {
return errors.Wrap(err, "enable no_new_privs bit")
return -1, fmt.Errorf("error enabling no_new_privs bit: %w", err)
}
}
// Finally, load the filter.
if err := sysSeccompSetFilter(seccompFlags, fprog); err != nil {
return errors.Wrap(err, "loading seccomp filter")
fd, err := sysSeccompSetFilter(seccompFlags, fprog)
if err != nil {
return -1, fmt.Errorf("error loading seccomp filter: %w", err)
}
return nil
return fd, nil
}

View File

@@ -1,3 +1,4 @@
//go:build !linux || !cgo || !seccomp
// +build !linux !cgo !seccomp
package patchbpf

View File

@@ -1,4 +1,5 @@
// +build linux,cgo,seccomp
//go:build cgo && seccomp
// +build cgo,seccomp
package seccomp
@@ -6,19 +7,16 @@ import (
"errors"
"fmt"
libseccomp "github.com/seccomp/libseccomp-golang"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/seccomp/patchbpf"
libseccomp "github.com/seccomp/libseccomp-golang"
"golang.org/x/sys/unix"
)
var (
actAllow = libseccomp.ActAllow
actTrap = libseccomp.ActTrap
actKill = libseccomp.ActKill
actTrace = libseccomp.ActTrace.SetReturnCode(int16(unix.EPERM))
actLog = libseccomp.ActLog
actErrno = libseccomp.ActErrno.SetReturnCode(int16(unix.EPERM))
)
@@ -27,77 +25,118 @@ const (
syscallMaxArguments int = 6
)
// Filters given syscalls in a container, preventing them from being used
// Started in the container init process, and carried over to all child processes
// Setns calls, however, require a separate invocation, as they are not children
// of the init until they join the namespace
func InitSeccomp(config *configs.Seccomp) error {
// InitSeccomp installs the seccomp filters to be used in the container as
// specified in config.
// Returns the seccomp file descriptor if any of the filters include a
// SCMP_ACT_NOTIFY action, otherwise returns -1.
func InitSeccomp(config *configs.Seccomp) (int, error) {
if config == nil {
return errors.New("cannot initialize Seccomp - nil config passed")
return -1, errors.New("cannot initialize Seccomp - nil config passed")
}
defaultAction, err := getAction(config.DefaultAction, config.DefaultErrnoRet)
if err != nil {
return errors.New("error initializing seccomp - invalid default action")
return -1, errors.New("error initializing seccomp - invalid default action")
}
// Ignore the error since pre-2.4 libseccomp is treated as API level 0.
apiLevel, _ := libseccomp.GetAPI()
for _, call := range config.Syscalls {
if call.Action == configs.Notify {
if apiLevel < 6 {
return -1, fmt.Errorf("seccomp notify unsupported: API level: got %d, want at least 6. Please try with libseccomp >= 2.5.0 and Linux >= 5.7", apiLevel)
}
// We can't allow the write syscall to notify to the seccomp agent.
// After InitSeccomp() is called, we need to syncParentSeccomp() to write the seccomp fd plain
// number, so the parent sends it to the seccomp agent. If we use SCMP_ACT_NOTIFY on write, we
// never can write the seccomp fd to the parent and therefore the seccomp agent never receives
// the seccomp fd and runc is hang during initialization.
//
// Note that read()/close(), that are also used in syncParentSeccomp(), _can_ use SCMP_ACT_NOTIFY.
// Because we write the seccomp fd on the pipe to the parent, the parent is able to proceed and
// send the seccomp fd to the agent (it is another process and not subject to the seccomp
// filter). We will be blocked on read()/close() inside syncParentSeccomp() but if the seccomp
// agent allows those syscalls to proceed, initialization works just fine and the agent can
// handle future read()/close() syscalls as it wanted.
if call.Name == "write" {
return -1, errors.New("SCMP_ACT_NOTIFY cannot be used for the write syscall")
}
}
}
// See comment on why write is not allowed. The same reason applies, as this can mean handling write too.
if defaultAction == libseccomp.ActNotify {
return -1, errors.New("SCMP_ACT_NOTIFY cannot be used as default action")
}
filter, err := libseccomp.NewFilter(defaultAction)
if err != nil {
return fmt.Errorf("error creating filter: %s", err)
return -1, fmt.Errorf("error creating filter: %w", err)
}
// Add extra architectures
for _, arch := range config.Architectures {
scmpArch, err := libseccomp.GetArchFromString(arch)
if err != nil {
return fmt.Errorf("error validating Seccomp architecture: %s", err)
return -1, fmt.Errorf("error validating Seccomp architecture: %w", err)
}
if err := filter.AddArch(scmpArch); err != nil {
return fmt.Errorf("error adding architecture to seccomp filter: %s", err)
return -1, fmt.Errorf("error adding architecture to seccomp filter: %w", err)
}
}
// Unset no new privs bit
if err := filter.SetNoNewPrivsBit(false); err != nil {
return fmt.Errorf("error setting no new privileges: %s", err)
return -1, fmt.Errorf("error setting no new privileges: %w", err)
}
// Add a rule for each syscall
for _, call := range config.Syscalls {
if call == nil {
return errors.New("encountered nil syscall while initializing Seccomp")
return -1, errors.New("encountered nil syscall while initializing Seccomp")
}
if err := matchCall(filter, call, defaultAction); err != nil {
return err
return -1, err
}
}
if err := patchbpf.PatchAndLoad(config, filter); err != nil {
return fmt.Errorf("error loading seccomp filter into kernel: %s", err)
seccompFd, err := patchbpf.PatchAndLoad(config, filter)
if err != nil {
return -1, fmt.Errorf("error loading seccomp filter into kernel: %w", err)
}
return nil
return seccompFd, nil
}
// Convert Libcontainer Action to Libseccomp ScmpAction
func getAction(act configs.Action, errnoRet *uint) (libseccomp.ScmpAction, error) {
switch act {
case configs.Kill:
return actKill, nil
return libseccomp.ActKill, nil
case configs.Errno:
if errnoRet != nil {
return libseccomp.ActErrno.SetReturnCode(int16(*errnoRet)), nil
}
return actErrno, nil
case configs.Trap:
return actTrap, nil
return libseccomp.ActTrap, nil
case configs.Allow:
return actAllow, nil
return libseccomp.ActAllow, nil
case configs.Trace:
if errnoRet != nil {
return libseccomp.ActTrace.SetReturnCode(int16(*errnoRet)), nil
}
return actTrace, nil
case configs.Log:
return actLog, nil
return libseccomp.ActLog, nil
case configs.Notify:
return libseccomp.ActNotify, nil
case configs.KillThread:
return libseccomp.ActKillThread, nil
case configs.KillProcess:
return libseccomp.ActKillProcess, nil
default:
return libseccomp.ActInvalid, errors.New("invalid action, cannot use in rule")
}
@@ -162,17 +201,18 @@ func matchCall(filter *libseccomp.ScmpFilter, call *configs.Syscall, defAct libs
return nil
}
// If we can't resolve the syscall, assume it's not supported on this kernel
// Ignore it, don't error out
// If we can't resolve the syscall, assume it is not supported
// by this kernel. Warn about it, don't error out.
callNum, err := libseccomp.GetSyscallFromName(call.Name)
if err != nil {
logrus.Debugf("unknown seccomp syscall %q ignored", call.Name)
return nil
}
// Unconditional match - just add the rule
if len(call.Args) == 0 {
if err := filter.AddRule(callNum, callAct); err != nil {
return fmt.Errorf("error adding seccomp filter rule for syscall %s: %s", call.Name, err)
return fmt.Errorf("error adding seccomp filter rule for syscall %s: %w", call.Name, err)
}
} else {
// If two or more arguments have the same condition,
@@ -183,7 +223,7 @@ func matchCall(filter *libseccomp.ScmpFilter, call *configs.Syscall, defAct libs
for _, cond := range call.Args {
newCond, err := getCondition(cond)
if err != nil {
return fmt.Errorf("error creating seccomp syscall condition for syscall %s: %s", call.Name, err)
return fmt.Errorf("error creating seccomp syscall condition for syscall %s: %w", call.Name, err)
}
argCounts[cond.Index] += 1
@@ -206,14 +246,14 @@ func matchCall(filter *libseccomp.ScmpFilter, call *configs.Syscall, defAct libs
condArr := []libseccomp.ScmpCondition{cond}
if err := filter.AddRuleConditional(callNum, callAct, condArr); err != nil {
return fmt.Errorf("error adding seccomp rule for syscall %s: %s", call.Name, err)
return fmt.Errorf("error adding seccomp rule for syscall %s: %w", call.Name, err)
}
}
} else {
// No conditions share same argument
// Use new, proper behavior
if err := filter.AddRuleConditional(callNum, callAct, conditions); err != nil {
return fmt.Errorf("error adding seccomp rule for syscall %s: %s", call.Name, err)
return fmt.Errorf("error adding seccomp rule for syscall %s: %w", call.Name, err)
}
}
}
@@ -225,3 +265,6 @@ func matchCall(filter *libseccomp.ScmpFilter, call *configs.Syscall, defAct libs
func Version() (uint, uint, uint) {
return libseccomp.GetLibraryVersion()
}
// Enabled is true if seccomp support is compiled in.
const Enabled = true

View File

@@ -1,3 +1,4 @@
//go:build !linux || !cgo || !seccomp
// +build !linux !cgo !seccomp
package seccomp
@@ -11,14 +12,17 @@ import (
var ErrSeccompNotEnabled = errors.New("seccomp: config provided but seccomp not supported")
// InitSeccomp does nothing because seccomp is not supported.
func InitSeccomp(config *configs.Seccomp) error {
func InitSeccomp(config *configs.Seccomp) (int, error) {
if config != nil {
return ErrSeccompNotEnabled
return -1, ErrSeccompNotEnabled
}
return nil
return -1, nil
}
// Version returns major, minor, and micro.
func Version() (uint, uint, uint) {
return 0, 0, 0
}
// Enabled is true if seccomp support is compiled in.
const Enabled = false

View File

@@ -1,19 +1,19 @@
// +build linux
package libcontainer
import (
"errors"
"fmt"
"os"
"runtime"
"strconv"
"github.com/opencontainers/selinux/go-selinux"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
"github.com/opencontainers/runc/libcontainer/apparmor"
"github.com/opencontainers/runc/libcontainer/keys"
"github.com/opencontainers/runc/libcontainer/seccomp"
"github.com/opencontainers/runc/libcontainer/system"
"github.com/opencontainers/selinux/go-selinux"
"github.com/pkg/errors"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
)
// linuxSetnsInit performs the container's initialization for running a new process
@@ -30,9 +30,6 @@ func (l *linuxSetnsInit) getSessionRingName() string {
}
func (l *linuxSetnsInit) Init() error {
runtime.LockOSThread()
defer runtime.UnlockOSThread()
if !l.config.Config.NoNewKeyring {
if err := selinux.SetKeyLabel(l.config.ProcessLabel); err != nil {
return err
@@ -44,8 +41,8 @@ func (l *linuxSetnsInit) Init() error {
// don't bail on ENOSYS.
//
// TODO(cyphar): And we should have logging here too.
if errors.Cause(err) != unix.ENOSYS {
return errors.Wrap(err, "join session keyring")
if !errors.Is(err, unix.ENOSYS) {
return fmt.Errorf("unable to join session keyring: %w", err)
}
}
}
@@ -70,7 +67,12 @@ func (l *linuxSetnsInit) Init() error {
// do this before dropping capabilities; otherwise do it as late as possible
// just before execve so as few syscalls take place after it as possible.
if l.config.Config.Seccomp != nil && !l.config.NoNewPrivileges {
if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
seccompFd, err := seccomp.InitSeccomp(l.config.Config.Seccomp)
if err != nil {
return err
}
if err := syncParentSeccomp(l.pipe, seccompFd); err != nil {
return err
}
}
@@ -84,14 +86,19 @@ func (l *linuxSetnsInit) Init() error {
// place afterward (reducing the amount of syscalls that users need to
// enable in their seccomp profiles).
if l.config.Config.Seccomp != nil && l.config.NoNewPrivileges {
if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
return newSystemErrorWithCause(err, "init seccomp")
seccompFd, err := seccomp.InitSeccomp(l.config.Config.Seccomp)
if err != nil {
return fmt.Errorf("unable to init seccomp: %w", err)
}
if err := syncParentSeccomp(l.pipe, seccompFd); err != nil {
return err
}
}
logrus.Debugf("setns_init: about to exec")
// Close the log pipe fd so the parent's ForwardLogs can exit.
if err := unix.Close(l.logFd); err != nil {
return newSystemErrorWithCause(err, "closing log pipe fd")
return &os.PathError{Op: "close log pipe", Path: "fd " + strconv.Itoa(l.logFd), Err: err}
}
return system.Execv(l.config.Args[0], l.config.Args[0:], os.Environ())

View File

@@ -1,27 +0,0 @@
package stacktrace
import "runtime"
// Capture captures a stacktrace for the current calling go program
//
// skip is the number of frames to skip
func Capture(userSkip int) Stacktrace {
var (
skip = userSkip + 1 // add one for our own function
frames []Frame
prevPc uintptr
)
for i := skip; ; i++ {
pc, file, line, ok := runtime.Caller(i)
// detect if caller is repeated to avoid loop, gccgo
// currently runs into a loop without this check
if !ok || pc == prevPc {
break
}
frames = append(frames, NewFrame(pc, file, line))
prevPc = pc
}
return Stacktrace{
Frames: frames,
}
}

View File

@@ -1,38 +0,0 @@
package stacktrace
import (
"path/filepath"
"runtime"
"strings"
)
// NewFrame returns a new stack frame for the provided information
func NewFrame(pc uintptr, file string, line int) Frame {
fn := runtime.FuncForPC(pc)
if fn == nil {
return Frame{}
}
pack, name := parseFunctionName(fn.Name())
return Frame{
Line: line,
File: filepath.Base(file),
Package: pack,
Function: name,
}
}
func parseFunctionName(name string) (string, string) {
i := strings.LastIndex(name, ".")
if i == -1 {
return "", name
}
return name[:i], name[i+1:]
}
// Frame contains all the information for a stack frame within a go program
type Frame struct {
File string
Function string
Package string
Line int
}

View File

@@ -1,5 +0,0 @@
package stacktrace
type Stacktrace struct {
Frames []Frame
}

View File

@@ -1,23 +1,22 @@
// +build linux
package libcontainer
import (
"errors"
"fmt"
"os"
"os/exec"
"runtime"
"strconv"
"github.com/opencontainers/runtime-spec/specs-go"
"github.com/opencontainers/selinux/go-selinux"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
"github.com/opencontainers/runc/libcontainer/apparmor"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/keys"
"github.com/opencontainers/runc/libcontainer/seccomp"
"github.com/opencontainers/runc/libcontainer/system"
"github.com/opencontainers/runtime-spec/specs-go"
"github.com/opencontainers/selinux/go-selinux"
"github.com/pkg/errors"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
)
type linuxStandardInit struct {
@@ -26,6 +25,7 @@ type linuxStandardInit struct {
parentPid int
fifoFd int
logFd int
mountFds []int
config *initConfig
}
@@ -46,8 +46,6 @@ func (l *linuxStandardInit) getSessionRingParams() (string, uint32, uint32) {
}
func (l *linuxStandardInit) Init() error {
runtime.LockOSThread()
defer runtime.UnlockOSThread()
if !l.config.Config.NoNewKeyring {
if err := selinux.SetKeyLabel(l.config.ProcessLabel); err != nil {
return err
@@ -65,15 +63,15 @@ func (l *linuxStandardInit) Init() error {
//
// TODO(cyphar): Log this so people know what's going on, once we
// have proper logging in 'runc init'.
if errors.Cause(err) != unix.ENOSYS {
return errors.Wrap(err, "join session keyring")
if !errors.Is(err, unix.ENOSYS) {
return fmt.Errorf("unable to join session keyring: %w", err)
}
} else {
// Make session keyring searcheable. If we've gotten this far we
// Make session keyring searchable. If we've gotten this far we
// bail on any error -- we don't want to have a keyring with bad
// permissions.
if err := keys.ModKeyringPerm(sessKeyId, keepperms, newperms); err != nil {
return errors.Wrap(err, "mod keyring permissions")
return fmt.Errorf("unable to mod keyring permissions: %w", err)
}
}
}
@@ -87,9 +85,23 @@ func (l *linuxStandardInit) Init() error {
// initialises the labeling system
selinux.GetEnabled()
if err := prepareRootfs(l.pipe, l.config); err != nil {
// We don't need the mountFds after prepareRootfs() nor if it fails.
err := prepareRootfs(l.pipe, l.config, l.mountFds)
for _, m := range l.mountFds {
if m == -1 {
continue
}
if err := unix.Close(m); err != nil {
return fmt.Errorf("Unable to close mountFds fds: %w", err)
}
}
if err != nil {
return err
}
// Set up the console. This has to be done *before* we finalize the rootfs,
// but *after* we've given the user the chance to set up all of the mounts
// they wanted.
@@ -98,7 +110,7 @@ func (l *linuxStandardInit) Init() error {
return err
}
if err := system.Setctty(); err != nil {
return errors.Wrap(err, "setctty")
return &os.SyscallError{Syscall: "ioctl(setctty)", Err: err}
}
}
@@ -111,52 +123,57 @@ func (l *linuxStandardInit) Init() error {
if hostname := l.config.Config.Hostname; hostname != "" {
if err := unix.Sethostname([]byte(hostname)); err != nil {
return errors.Wrap(err, "sethostname")
return &os.SyscallError{Syscall: "sethostname", Err: err}
}
}
if err := apparmor.ApplyProfile(l.config.AppArmorProfile); err != nil {
return errors.Wrap(err, "apply apparmor profile")
return fmt.Errorf("unable to apply apparmor profile: %w", err)
}
for key, value := range l.config.Config.Sysctl {
if err := writeSystemProperty(key, value); err != nil {
return errors.Wrapf(err, "write sysctl key %s", key)
return err
}
}
for _, path := range l.config.Config.ReadonlyPaths {
if err := readonlyPath(path); err != nil {
return errors.Wrapf(err, "readonly path %s", path)
return fmt.Errorf("can't make %q read-only: %w", path, err)
}
}
for _, path := range l.config.Config.MaskPaths {
if err := maskPath(path, l.config.Config.MountLabel); err != nil {
return errors.Wrapf(err, "mask path %s", path)
return fmt.Errorf("can't mask path %s: %w", path, err)
}
}
pdeath, err := system.GetParentDeathSignal()
if err != nil {
return errors.Wrap(err, "get pdeath signal")
return fmt.Errorf("can't get pdeath signal: %w", err)
}
if l.config.NoNewPrivileges {
if err := unix.Prctl(unix.PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil {
return errors.Wrap(err, "set nonewprivileges")
return &os.SyscallError{Syscall: "prctl(SET_NO_NEW_PRIVS)", Err: err}
}
}
// Tell our parent that we're ready to Execv. This must be done before the
// Seccomp rules have been applied, because we need to be able to read and
// write to a socket.
if err := syncParentReady(l.pipe); err != nil {
return errors.Wrap(err, "sync ready")
return fmt.Errorf("sync ready: %w", err)
}
if err := selinux.SetExecLabel(l.config.ProcessLabel); err != nil {
return errors.Wrap(err, "set process label")
return fmt.Errorf("can't set process label: %w", err)
}
defer selinux.SetExecLabel("") //nolint: errcheck
// Without NoNewPrivileges seccomp is a privileged operation, so we need to
// do this before dropping capabilities; otherwise do it as late as possible
// just before execve so as few syscalls take place after it as possible.
if l.config.Config.Seccomp != nil && !l.config.NoNewPrivileges {
if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
seccompFd, err := seccomp.InitSeccomp(l.config.Config.Seccomp)
if err != nil {
return err
}
if err := syncParentSeccomp(l.pipe, seccompFd); err != nil {
return err
}
}
@@ -166,7 +183,7 @@ func (l *linuxStandardInit) Init() error {
// finalizeNamespace can change user/group which clears the parent death
// signal, so we restore it here.
if err := pdeath.Restore(); err != nil {
return errors.Wrap(err, "restore pdeath signal")
return fmt.Errorf("can't restore pdeath signal: %w", err)
}
// Compare the parent from the initial start of the init process and make
// sure that it did not change. if the parent changes that means it died
@@ -181,26 +198,43 @@ func (l *linuxStandardInit) Init() error {
if err != nil {
return err
}
// Set seccomp as close to execve as possible, so as few syscalls take
// place afterward (reducing the amount of syscalls that users need to
// enable in their seccomp profiles). However, this needs to be done
// before closing the pipe since we need it to pass the seccompFd to
// the parent.
if l.config.Config.Seccomp != nil && l.config.NoNewPrivileges {
seccompFd, err := seccomp.InitSeccomp(l.config.Config.Seccomp)
if err != nil {
return fmt.Errorf("unable to init seccomp: %w", err)
}
if err := syncParentSeccomp(l.pipe, seccompFd); err != nil {
return err
}
}
// Close the pipe to signal that we have completed our init.
logrus.Debugf("init: closing the pipe to signal completion")
_ = l.pipe.Close()
// Close the log pipe fd so the parent's ForwardLogs can exit.
if err := unix.Close(l.logFd); err != nil {
return newSystemErrorWithCause(err, "closing log pipe fd")
return &os.PathError{Op: "close log pipe", Path: "fd " + strconv.Itoa(l.logFd), Err: err}
}
// Wait for the FIFO to be opened on the other side before exec-ing the
// user process. We open it through /proc/self/fd/$fd, because the fd that
// was given to us was an O_PATH fd to the fifo itself. Linux allows us to
// re-open an O_PATH fd through /proc.
fd, err := unix.Open("/proc/self/fd/"+strconv.Itoa(l.fifoFd), unix.O_WRONLY|unix.O_CLOEXEC, 0)
fifoPath := "/proc/self/fd/" + strconv.Itoa(l.fifoFd)
fd, err := unix.Open(fifoPath, unix.O_WRONLY|unix.O_CLOEXEC, 0)
if err != nil {
return newSystemErrorWithCause(err, "open exec fifo")
return &os.PathError{Op: "open exec fifo", Path: fifoPath, Err: err}
}
if _, err := unix.Write(fd, []byte("0")); err != nil {
return newSystemErrorWithCause(err, "write 0 exec fifo")
return &os.PathError{Op: "write exec fifo", Path: fifoPath, Err: err}
}
// Close the O_PATH fifofd fd before exec because the kernel resets
// dumpable in the wrong order. This has been fixed in newer kernels, but
// we keep this to ensure CVE-2016-9962 doesn't re-emerge on older kernels.
@@ -208,14 +242,6 @@ func (l *linuxStandardInit) Init() error {
// since been resolved.
// https://github.com/torvalds/linux/blob/v4.9/fs/exec.c#L1290-L1318
_ = unix.Close(l.fifoFd)
// Set seccomp as close to execve as possible, so as few syscalls take
// place afterward (reducing the amount of syscalls that users need to
// enable in their seccomp profiles).
if l.config.Config.Seccomp != nil && l.config.NoNewPrivileges {
if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
return newSystemErrorWithCause(err, "init seccomp")
}
}
s := l.config.SpecState
s.Pid = unix.Getpid()
@@ -224,8 +250,5 @@ func (l *linuxStandardInit) Init() error {
return err
}
if err := system.Exec(name, l.config.Args[0:], os.Environ()); err != nil {
return newSystemErrorWithCause(err, "exec user process")
}
return nil
return system.Exec(name, l.config.Args[0:], os.Environ())
}

View File

@@ -1,5 +1,3 @@
// +build linux
package libcontainer
import (
@@ -117,7 +115,7 @@ func (r *runningState) transition(s containerState) error {
switch s.(type) {
case *stoppedState:
if r.c.runType() == Running {
return newGenericError(fmt.Errorf("container still running"), ContainerNotStopped)
return ErrRunning
}
r.c.state = s
return nil
@@ -132,7 +130,7 @@ func (r *runningState) transition(s containerState) error {
func (r *runningState) destroy() error {
if r.c.runType() == Running {
return newGenericError(fmt.Errorf("container is not destroyed"), ContainerNotStopped)
return ErrRunning
}
return destroy(r.c)
}
@@ -190,7 +188,7 @@ func (p *pausedState) destroy() error {
}
return destroy(p.c)
}
return newGenericError(fmt.Errorf("container is paused"), ContainerPaused)
return ErrPaused
}
// restoredState is the same as the running state but also has associated checkpoint

Some files were not shown because too many files have changed in this diff Show More