Merge pull request #9045 from thaJeztah/less_libcontainer

remove uses of github.com/runc/libcontainer/cgroups
This commit is contained in:
Akihiro Suda 2023-09-02 07:56:41 +09:00 committed by GitHub
commit 74705ae4f4
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
83 changed files with 62 additions and 9459 deletions

1
go.mod
View File

@ -88,7 +88,6 @@ require (
github.com/containerd/typeurl v1.0.2 // indirect
github.com/containers/ocicrypt v1.1.6 // indirect
github.com/cpuguy83/go-md2man/v2 v2.0.2 // indirect
github.com/cyphar/filepath-securejoin v0.2.3 // indirect
github.com/go-logr/logr v1.2.4 // indirect
github.com/go-logr/stdr v1.2.2 // indirect
github.com/godbus/dbus/v5 v5.1.0 // indirect

2
go.sum
View File

@ -342,7 +342,6 @@ github.com/creack/pty v1.1.7/go.mod h1:lj5s0c3V2DBrqTV7llrYr5NG6My20zk30Fl46Y7Do
github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E=
github.com/creack/pty v1.1.11/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E=
github.com/cyphar/filepath-securejoin v0.2.2/go.mod h1:FpkQEhXnPnOthhzymB7CGsFk2G9VLXONKD9G7QGMM+4=
github.com/cyphar/filepath-securejoin v0.2.3 h1:YX6ebbZCZP7VkM3scTTokDgBL2TY741X51MTk3ycuNI=
github.com/cyphar/filepath-securejoin v0.2.3/go.mod h1:aPGpWjXOXUn2NCNjFvBE6aRxGGx79pTxQpKOJNYHHl4=
github.com/d2g/dhcp4 v0.0.0-20170904100407-a1d1b6c41b1c/go.mod h1:Ct2BUK8SB0YC1SMSibvLzxjeJLnrYEVLULFNiHY9YfQ=
github.com/d2g/dhcp4client v1.0.0/go.mod h1:j0hNfjhrt2SxUOw55nL0ATM/z4Yt3t2Kd1mW34z5W5s=
@ -885,7 +884,6 @@ github.com/sclevine/spec v1.2.0/go.mod h1:W4J29eT/Kzv7/b9IWLB055Z+qvVC9vt0Arko24
github.com/sean-/seed v0.0.0-20170313163322-e2103e2c3529/go.mod h1:DxrIzT+xaE7yg65j358z/aeFdxmN0P9QXhEzd20vsDc=
github.com/seccomp/libseccomp-golang v0.9.1/go.mod h1:GbW5+tmTXfcxTToHLXlScSlAvWlF4P2Ca7zGrPiEpWo=
github.com/seccomp/libseccomp-golang v0.9.2-0.20210429002308-3879420cc921/go.mod h1:JA8cRccbGaA1s33RQf7Y1+q9gHmZX1yB/z9WDN1C6fg=
github.com/seccomp/libseccomp-golang v0.9.2-0.20220502022130-f33da4d89646 h1:RpforrEYXWkmGwJHIGnLZ3tTWStkjVVstwzNGqxX2Ds=
github.com/shurcooL/sanitized_anchor_name v1.0.0/go.mod h1:1NzhyTcUVG4SuEtjjoZeVRXNmyL/1OwPU0+IJeTBvfc=
github.com/sirupsen/logrus v1.0.4-0.20170822132746-89742aefa4b2/go.mod h1:pMByvHTf9Beacp5x1UXfOR9xyW/9antXMhjMPG0dEzc=
github.com/sirupsen/logrus v1.0.6/go.mod h1:pMByvHTf9Beacp5x1UXfOR9xyW/9antXMhjMPG0dEzc=

View File

@ -21,8 +21,8 @@ import (
"sort"
"github.com/containerd/containerd/log"
"github.com/containerd/containerd/pkg/systemd"
runcoptions "github.com/containerd/containerd/runtime/v2/runc/options"
"github.com/opencontainers/runc/libcontainer/cgroups/systemd"
runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
)

View File

@ -21,8 +21,8 @@ import (
"testing"
criconfig "github.com/containerd/containerd/pkg/cri/config"
"github.com/containerd/containerd/pkg/systemd"
"github.com/containerd/containerd/plugin"
"github.com/opencontainers/runc/libcontainer/cgroups/systemd"
"github.com/stretchr/testify/assert"
runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
)

View File

@ -21,8 +21,8 @@ import (
"sort"
"github.com/containerd/containerd/log"
"github.com/containerd/containerd/pkg/systemd"
runcoptions "github.com/containerd/containerd/runtime/v2/runc/options"
"github.com/opencontainers/runc/libcontainer/cgroups/systemd"
runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
)

View File

@ -21,8 +21,8 @@ import (
"testing"
criconfig "github.com/containerd/containerd/pkg/cri/config"
"github.com/containerd/containerd/pkg/systemd"
"github.com/containerd/containerd/plugin"
"github.com/opencontainers/runc/libcontainer/cgroups/systemd"
"github.com/stretchr/testify/assert"
runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
)

58
pkg/systemd/util.go Normal file
View File

@ -0,0 +1,58 @@
//go:build linux
/*
Copyright The containerd Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
// Copyright 2015 CoreOS, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package systemd
import (
"os"
"sync"
)
var (
runningSystemd bool
detectSystemd sync.Once
)
// IsRunningSystemd checks whether the host was booted with systemd as its init
// system. This functions similarly to systemd's `sd_booted(3)`: internally, it
// checks whether /run/systemd/system/ exists and is a directory.
// https://github.com/coreos/go-systemd/blob/d843340ab4bd3815fda02e648f9b09ae2dc722a7/util/util.go#L68-L78
func IsRunningSystemd() bool {
detectSystemd.Do(func() {
fi, err := os.Lstat("/run/systemd/system")
if err != nil {
return
}
runningSystemd = fi.IsDir()
})
return runningSystemd
}

View File

@ -1,21 +0,0 @@
# Copyright (C) 2017 SUSE LLC. All rights reserved.
# Use of this source code is governed by a BSD-style
# license that can be found in the LICENSE file.
language: go
go:
- 1.13.x
- 1.16.x
- tip
arch:
- AMD64
- ppc64le
os:
- linux
- osx
script:
- go test -cover -v ./...
notifications:
email: false

View File

@ -1,28 +0,0 @@
Copyright (C) 2014-2015 Docker Inc & Go Authors. All rights reserved.
Copyright (C) 2017 SUSE LLC. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following disclaimer
in the documentation and/or other materials provided with the
distribution.
* Neither the name of Google Inc. nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

View File

@ -1,79 +0,0 @@
## `filepath-securejoin` ##
[![Build Status](https://travis-ci.org/cyphar/filepath-securejoin.svg?branch=master)](https://travis-ci.org/cyphar/filepath-securejoin)
An implementation of `SecureJoin`, a [candidate for inclusion in the Go
standard library][go#20126]. The purpose of this function is to be a "secure"
alternative to `filepath.Join`, and in particular it provides certain
guarantees that are not provided by `filepath.Join`.
> **NOTE**: This code is *only* safe if you are not at risk of other processes
> modifying path components after you've used `SecureJoin`. If it is possible
> for a malicious process to modify path components of the resolved path, then
> you will be vulnerable to some fairly trivial TOCTOU race conditions. [There
> are some Linux kernel patches I'm working on which might allow for a better
> solution.][lwn-obeneath]
>
> In addition, with a slightly modified API it might be possible to use
> `O_PATH` and verify that the opened path is actually the resolved one -- but
> I have not done that yet. I might add it in the future as a helper function
> to help users verify the path (we can't just return `/proc/self/fd/<foo>`
> because that doesn't always work transparently for all users).
This is the function prototype:
```go
func SecureJoin(root, unsafePath string) (string, error)
```
This library **guarantees** the following:
* If no error is set, the resulting string **must** be a child path of
`root` and will not contain any symlink path components (they will all be
expanded).
* When expanding symlinks, all symlink path components **must** be resolved
relative to the provided root. In particular, this can be considered a
userspace implementation of how `chroot(2)` operates on file paths. Note that
these symlinks will **not** be expanded lexically (`filepath.Clean` is not
called on the input before processing).
* Non-existent path components are unaffected by `SecureJoin` (similar to
`filepath.EvalSymlinks`'s semantics).
* The returned path will always be `filepath.Clean`ed and thus not contain any
`..` components.
A (trivial) implementation of this function on GNU/Linux systems could be done
with the following (note that this requires root privileges and is far more
opaque than the implementation in this library, and also requires that
`readlink` is inside the `root` path):
```go
package securejoin
import (
"os/exec"
"path/filepath"
)
func SecureJoin(root, unsafePath string) (string, error) {
unsafePath = string(filepath.Separator) + unsafePath
cmd := exec.Command("chroot", root,
"readlink", "--canonicalize-missing", "--no-newline", unsafePath)
output, err := cmd.CombinedOutput()
if err != nil {
return "", err
}
expanded := string(output)
return filepath.Join(root, expanded), nil
}
```
[lwn-obeneath]: https://lwn.net/Articles/767547/
[go#20126]: https://github.com/golang/go/issues/20126
### License ###
The license of this project is the same as Go, which is a BSD 3-clause license
available in the `LICENSE` file.

View File

@ -1 +0,0 @@
0.2.3

View File

@ -1,115 +0,0 @@
// Copyright (C) 2014-2015 Docker Inc & Go Authors. All rights reserved.
// Copyright (C) 2017 SUSE LLC. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package securejoin is an implementation of the hopefully-soon-to-be-included
// SecureJoin helper that is meant to be part of the "path/filepath" package.
// The purpose of this project is to provide a PoC implementation to make the
// SecureJoin proposal (https://github.com/golang/go/issues/20126) more
// tangible.
package securejoin
import (
"bytes"
"errors"
"os"
"path/filepath"
"strings"
"syscall"
)
// IsNotExist tells you if err is an error that implies that either the path
// accessed does not exist (or path components don't exist). This is
// effectively a more broad version of os.IsNotExist.
func IsNotExist(err error) bool {
// Check that it's not actually an ENOTDIR, which in some cases is a more
// convoluted case of ENOENT (usually involving weird paths).
return errors.Is(err, os.ErrNotExist) || errors.Is(err, syscall.ENOTDIR) || errors.Is(err, syscall.ENOENT)
}
// SecureJoinVFS joins the two given path components (similar to Join) except
// that the returned path is guaranteed to be scoped inside the provided root
// path (when evaluated). Any symbolic links in the path are evaluated with the
// given root treated as the root of the filesystem, similar to a chroot. The
// filesystem state is evaluated through the given VFS interface (if nil, the
// standard os.* family of functions are used).
//
// Note that the guarantees provided by this function only apply if the path
// components in the returned string are not modified (in other words are not
// replaced with symlinks on the filesystem) after this function has returned.
// Such a symlink race is necessarily out-of-scope of SecureJoin.
func SecureJoinVFS(root, unsafePath string, vfs VFS) (string, error) {
// Use the os.* VFS implementation if none was specified.
if vfs == nil {
vfs = osVFS{}
}
var path bytes.Buffer
n := 0
for unsafePath != "" {
if n > 255 {
return "", &os.PathError{Op: "SecureJoin", Path: root + "/" + unsafePath, Err: syscall.ELOOP}
}
// Next path component, p.
i := strings.IndexRune(unsafePath, filepath.Separator)
var p string
if i == -1 {
p, unsafePath = unsafePath, ""
} else {
p, unsafePath = unsafePath[:i], unsafePath[i+1:]
}
// Create a cleaned path, using the lexical semantics of /../a, to
// create a "scoped" path component which can safely be joined to fullP
// for evaluation. At this point, path.String() doesn't contain any
// symlink components.
cleanP := filepath.Clean(string(filepath.Separator) + path.String() + p)
if cleanP == string(filepath.Separator) {
path.Reset()
continue
}
fullP := filepath.Clean(root + cleanP)
// Figure out whether the path is a symlink.
fi, err := vfs.Lstat(fullP)
if err != nil && !IsNotExist(err) {
return "", err
}
// Treat non-existent path components the same as non-symlinks (we
// can't do any better here).
if IsNotExist(err) || fi.Mode()&os.ModeSymlink == 0 {
path.WriteString(p)
path.WriteRune(filepath.Separator)
continue
}
// Only increment when we actually dereference a link.
n++
// It's a symlink, expand it by prepending it to the yet-unparsed path.
dest, err := vfs.Readlink(fullP)
if err != nil {
return "", err
}
// Absolute symlinks reset any work we've already done.
if filepath.IsAbs(dest) {
path.Reset()
}
unsafePath = dest + string(filepath.Separator) + unsafePath
}
// We have to clean path.String() here because it may contain '..'
// components that are entirely lexical, but would be misleading otherwise.
// And finally do a final clean to ensure that root is also lexically
// clean.
fullP := filepath.Clean(string(filepath.Separator) + path.String())
return filepath.Clean(root + fullP), nil
}
// SecureJoin is a wrapper around SecureJoinVFS that just uses the os.* library
// of functions as the VFS. If in doubt, use this function over SecureJoinVFS.
func SecureJoin(root, unsafePath string) (string, error) {
return SecureJoinVFS(root, unsafePath, nil)
}

View File

@ -1,41 +0,0 @@
// Copyright (C) 2017 SUSE LLC. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package securejoin
import "os"
// In future this should be moved into a separate package, because now there
// are several projects (umoci and go-mtree) that are using this sort of
// interface.
// VFS is the minimal interface necessary to use SecureJoinVFS. A nil VFS is
// equivalent to using the standard os.* family of functions. This is mainly
// used for the purposes of mock testing, but also can be used to otherwise use
// SecureJoin with VFS-like system.
type VFS interface {
// Lstat returns a FileInfo describing the named file. If the file is a
// symbolic link, the returned FileInfo describes the symbolic link. Lstat
// makes no attempt to follow the link. These semantics are identical to
// os.Lstat.
Lstat(name string) (os.FileInfo, error)
// Readlink returns the destination of the named symbolic link. These
// semantics are identical to os.Readlink.
Readlink(name string) (string, error)
}
// osVFS is the "nil" VFS, in that it just passes everything through to the os
// module.
type osVFS struct{}
// Lstat returns a FileInfo describing the named file. If the file is a
// symbolic link, the returned FileInfo describes the symbolic link. Lstat
// makes no attempt to follow the link. These semantics are identical to
// os.Lstat.
func (o osVFS) Lstat(name string) (os.FileInfo, error) { return os.Lstat(name) }
// Readlink returns the destination of the named symbolic link. These
// semantics are identical to os.Readlink.
func (o osVFS) Readlink(name string) (string, error) { return os.Readlink(name) }

View File

@ -1,59 +0,0 @@
package cgroups
import (
"github.com/opencontainers/runc/libcontainer/configs"
)
type Manager interface {
// Apply creates a cgroup, if not yet created, and adds a process
// with the specified pid into that cgroup. A special value of -1
// can be used to merely create a cgroup.
Apply(pid int) error
// GetPids returns the PIDs of all processes inside the cgroup.
GetPids() ([]int, error)
// GetAllPids returns the PIDs of all processes inside the cgroup
// any all its sub-cgroups.
GetAllPids() ([]int, error)
// GetStats returns cgroups statistics.
GetStats() (*Stats, error)
// Freeze sets the freezer cgroup to the specified state.
Freeze(state configs.FreezerState) error
// Destroy removes cgroup.
Destroy() error
// Path returns a cgroup path to the specified controller/subsystem.
// For cgroupv2, the argument is unused and can be empty.
Path(string) string
// Set sets cgroup resources parameters/limits. If the argument is nil,
// the resources specified during Manager creation (or the previous call
// to Set) are used.
Set(r *configs.Resources) error
// GetPaths returns cgroup path(s) to save in a state file in order to
// restore later.
//
// For cgroup v1, a key is cgroup subsystem name, and the value is the
// path to the cgroup for this subsystem.
//
// For cgroup v2 unified hierarchy, a key is "", and the value is the
// unified path.
GetPaths() map[string]string
// GetCgroups returns the cgroup data as configured.
GetCgroups() (*configs.Cgroup, error)
// GetFreezerState retrieves the current FreezerState of the cgroup.
GetFreezerState() (configs.FreezerState, error)
// Exists returns whether the cgroup path exists or not.
Exists() bool
// OOMKillCount reports OOM kill count for the cgroup.
OOMKillCount() (uint64, error)
}

View File

@ -1,386 +0,0 @@
// SPDX-License-Identifier: Apache-2.0
/*
* Copyright (C) 2020 Aleksa Sarai <cyphar@cyphar.com>
* Copyright (C) 2020 SUSE LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package devices
import (
"bufio"
"fmt"
"io"
"sort"
"strconv"
"strings"
"github.com/opencontainers/runc/libcontainer/devices"
)
// deviceMeta is a Rule without the Allow or Permissions fields, and no
// wildcard-type support. It's effectively the "match" portion of a metadata
// rule, for the purposes of our emulation.
type deviceMeta struct {
node devices.Type
major int64
minor int64
}
// deviceRule is effectively the tuple (deviceMeta, Permissions).
type deviceRule struct {
meta deviceMeta
perms devices.Permissions
}
// deviceRules is a mapping of device metadata rules to the associated
// permissions in the ruleset.
type deviceRules map[deviceMeta]devices.Permissions
func (r deviceRules) orderedEntries() []deviceRule {
var rules []deviceRule
for meta, perms := range r {
rules = append(rules, deviceRule{meta: meta, perms: perms})
}
sort.Slice(rules, func(i, j int) bool {
// Sort by (major, minor, type).
a, b := rules[i].meta, rules[j].meta
return a.major < b.major ||
(a.major == b.major && a.minor < b.minor) ||
(a.major == b.major && a.minor == b.minor && a.node < b.node)
})
return rules
}
type Emulator struct {
defaultAllow bool
rules deviceRules
}
func (e *Emulator) IsBlacklist() bool {
return e.defaultAllow
}
func (e *Emulator) IsAllowAll() bool {
return e.IsBlacklist() && len(e.rules) == 0
}
func parseLine(line string) (*deviceRule, error) {
// Input: node major:minor perms.
fields := strings.FieldsFunc(line, func(r rune) bool {
return r == ' ' || r == ':'
})
if len(fields) != 4 {
return nil, fmt.Errorf("malformed devices.list rule %s", line)
}
var (
rule deviceRule
node = fields[0]
major = fields[1]
minor = fields[2]
perms = fields[3]
)
// Parse the node type.
switch node {
case "a":
// Super-special case -- "a" always means every device with every
// access mode. In fact, for devices.list this actually indicates that
// the cgroup is in black-list mode.
// TODO: Double-check that the entire file is "a *:* rwm".
return nil, nil
case "b":
rule.meta.node = devices.BlockDevice
case "c":
rule.meta.node = devices.CharDevice
default:
return nil, fmt.Errorf("unknown device type %q", node)
}
// Parse the major number.
if major == "*" {
rule.meta.major = devices.Wildcard
} else {
val, err := strconv.ParseUint(major, 10, 32)
if err != nil {
return nil, fmt.Errorf("invalid major number: %w", err)
}
rule.meta.major = int64(val)
}
// Parse the minor number.
if minor == "*" {
rule.meta.minor = devices.Wildcard
} else {
val, err := strconv.ParseUint(minor, 10, 32)
if err != nil {
return nil, fmt.Errorf("invalid minor number: %w", err)
}
rule.meta.minor = int64(val)
}
// Parse the access permissions.
rule.perms = devices.Permissions(perms)
if !rule.perms.IsValid() || rule.perms.IsEmpty() {
return nil, fmt.Errorf("parse access mode: contained unknown modes or is empty: %q", perms)
}
return &rule, nil
}
func (e *Emulator) addRule(rule deviceRule) error { //nolint:unparam
if e.rules == nil {
e.rules = make(map[deviceMeta]devices.Permissions)
}
// Merge with any pre-existing permissions.
oldPerms := e.rules[rule.meta]
newPerms := rule.perms.Union(oldPerms)
e.rules[rule.meta] = newPerms
return nil
}
func (e *Emulator) rmRule(rule deviceRule) error {
// Give an error if any of the permissions requested to be removed are
// present in a partially-matching wildcard rule, because such rules will
// be ignored by cgroupv1.
//
// This is a diversion from cgroupv1, but is necessary to avoid leading
// users into a false sense of security. cgroupv1 will silently(!) ignore
// requests to remove partial exceptions, but we really shouldn't do that.
//
// It may seem like we could just "split" wildcard rules which hit this
// issue, but unfortunately there are 2^32 possible major and minor
// numbers, which would exhaust kernel memory quickly if we did this. Not
// to mention it'd be really slow (the kernel side is implemented as a
// linked-list of exceptions).
for _, partialMeta := range []deviceMeta{
{node: rule.meta.node, major: devices.Wildcard, minor: rule.meta.minor},
{node: rule.meta.node, major: rule.meta.major, minor: devices.Wildcard},
{node: rule.meta.node, major: devices.Wildcard, minor: devices.Wildcard},
} {
// This wildcard rule is equivalent to the requested rule, so skip it.
if rule.meta == partialMeta {
continue
}
// Only give an error if the set of permissions overlap.
partialPerms := e.rules[partialMeta]
if !partialPerms.Intersection(rule.perms).IsEmpty() {
return fmt.Errorf("requested rule [%v %v] not supported by devices cgroupv1 (cannot punch hole in existing wildcard rule [%v %v])", rule.meta, rule.perms, partialMeta, partialPerms)
}
}
// Subtract all of the permissions listed from the full match rule. If the
// rule didn't exist, all of this is a no-op.
newPerms := e.rules[rule.meta].Difference(rule.perms)
if newPerms.IsEmpty() {
delete(e.rules, rule.meta)
} else {
e.rules[rule.meta] = newPerms
}
// TODO: The actual cgroup code doesn't care if an exception didn't exist
// during removal, so not erroring out here is /accurate/ but quite
// worrying. Maybe we should do additional validation, but again we
// have to worry about backwards-compatibility.
return nil
}
func (e *Emulator) allow(rule *deviceRule) error {
// This cgroup is configured as a black-list. Reset the entire emulator,
// and put is into black-list mode.
if rule == nil || rule.meta.node == devices.WildcardDevice {
*e = Emulator{
defaultAllow: true,
rules: nil,
}
return nil
}
var err error
if e.defaultAllow {
err = wrapErr(e.rmRule(*rule), "unable to remove 'deny' exception")
} else {
err = wrapErr(e.addRule(*rule), "unable to add 'allow' exception")
}
return err
}
func (e *Emulator) deny(rule *deviceRule) error {
// This cgroup is configured as a white-list. Reset the entire emulator,
// and put is into white-list mode.
if rule == nil || rule.meta.node == devices.WildcardDevice {
*e = Emulator{
defaultAllow: false,
rules: nil,
}
return nil
}
var err error
if e.defaultAllow {
err = wrapErr(e.addRule(*rule), "unable to add 'deny' exception")
} else {
err = wrapErr(e.rmRule(*rule), "unable to remove 'allow' exception")
}
return err
}
func (e *Emulator) Apply(rule devices.Rule) error {
if !rule.Type.CanCgroup() {
return fmt.Errorf("cannot add rule [%#v] with non-cgroup type %q", rule, rule.Type)
}
innerRule := &deviceRule{
meta: deviceMeta{
node: rule.Type,
major: rule.Major,
minor: rule.Minor,
},
perms: rule.Permissions,
}
if innerRule.meta.node == devices.WildcardDevice {
innerRule = nil
}
if rule.Allow {
return e.allow(innerRule)
}
return e.deny(innerRule)
}
// EmulatorFromList takes a reader to a "devices.list"-like source, and returns
// a new Emulator that represents the state of the devices cgroup. Note that
// black-list devices cgroups cannot be fully reconstructed, due to limitations
// in the devices cgroup API. Instead, such cgroups are always treated as
// "allow all" cgroups.
func EmulatorFromList(list io.Reader) (*Emulator, error) {
// Normally cgroups are in black-list mode by default, but the way we
// figure out the current mode is whether or not devices.list has an
// allow-all rule. So we default to a white-list, and the existence of an
// "a *:* rwm" entry will tell us otherwise.
e := &Emulator{
defaultAllow: false,
}
// Parse the "devices.list".
s := bufio.NewScanner(list)
for s.Scan() {
line := s.Text()
deviceRule, err := parseLine(line)
if err != nil {
return nil, fmt.Errorf("error parsing line %q: %w", line, err)
}
// "devices.list" is an allow list. Note that this means that in
// black-list mode, we have no idea what rules are in play. As a
// result, we need to be very careful in Transition().
if err := e.allow(deviceRule); err != nil {
return nil, fmt.Errorf("error adding devices.list rule: %w", err)
}
}
if err := s.Err(); err != nil {
return nil, fmt.Errorf("error reading devices.list lines: %w", err)
}
return e, nil
}
// Transition calculates what is the minimally-disruptive set of rules need to
// be applied to a devices cgroup in order to transition to the given target.
// This means that any already-existing rules will not be applied, and
// disruptive rules (like denying all device access) will only be applied if
// necessary.
//
// This function is the sole reason for all of Emulator -- to allow us
// to figure out how to update a containers' cgroups without causing spurious
// device errors (if possible).
func (source *Emulator) Transition(target *Emulator) ([]*devices.Rule, error) {
var transitionRules []*devices.Rule
oldRules := source.rules
// If the default policy doesn't match, we need to include a "disruptive"
// rule (either allow-all or deny-all) in order to switch the cgroup to the
// correct default policy.
//
// However, due to a limitation in "devices.list" we cannot be sure what
// deny rules are in place in a black-list cgroup. Thus if the source is a
// black-list we also have to include a disruptive rule.
if source.IsBlacklist() || source.defaultAllow != target.defaultAllow {
transitionRules = append(transitionRules, &devices.Rule{
Type: 'a',
Major: -1,
Minor: -1,
Permissions: devices.Permissions("rwm"),
Allow: target.defaultAllow,
})
// The old rules are only relevant if we aren't starting out with a
// disruptive rule.
oldRules = nil
}
// NOTE: We traverse through the rules in a sorted order so we always write
// the same set of rules (this is to aid testing).
// First, we create inverse rules for any old rules not in the new set.
// This includes partial-inverse rules for specific permissions. This is a
// no-op if we added a disruptive rule, since oldRules will be empty.
for _, rule := range oldRules.orderedEntries() {
meta, oldPerms := rule.meta, rule.perms
newPerms := target.rules[meta]
droppedPerms := oldPerms.Difference(newPerms)
if !droppedPerms.IsEmpty() {
transitionRules = append(transitionRules, &devices.Rule{
Type: meta.node,
Major: meta.major,
Minor: meta.minor,
Permissions: droppedPerms,
Allow: target.defaultAllow,
})
}
}
// Add any additional rules which weren't in the old set. We happen to
// filter out rules which are present in both sets, though this isn't
// strictly necessary.
for _, rule := range target.rules.orderedEntries() {
meta, newPerms := rule.meta, rule.perms
oldPerms := oldRules[meta]
gainedPerms := newPerms.Difference(oldPerms)
if !gainedPerms.IsEmpty() {
transitionRules = append(transitionRules, &devices.Rule{
Type: meta.node,
Major: meta.major,
Minor: meta.minor,
Permissions: gainedPerms,
Allow: !target.defaultAllow,
})
}
}
return transitionRules, nil
}
// Rules returns the minimum set of rules necessary to convert a *deny-all*
// cgroup to the emulated filter state (note that this is not the same as a
// default cgroupv1 cgroup -- which is allow-all). This is effectively just a
// wrapper around Transition() with the source emulator being an empty cgroup.
func (e *Emulator) Rules() ([]*devices.Rule, error) {
defaultCgroup := &Emulator{defaultAllow: false}
return defaultCgroup.Transition(e)
}
func wrapErr(err error, text string) error {
if err == nil {
return nil
}
return fmt.Errorf(text+": %w", err)
}

View File

@ -1,208 +0,0 @@
// Package devicefilter contains eBPF device filter program
//
// The implementation is based on https://github.com/containers/crun/blob/0.10.2/src/libcrun/ebpf.c
//
// Although ebpf.c is originally licensed under LGPL-3.0-or-later, the author (Giuseppe Scrivano)
// agreed to relicense the file in Apache License 2.0: https://github.com/opencontainers/runc/issues/2144#issuecomment-543116397
package devicefilter
import (
"errors"
"fmt"
"math"
"strconv"
"github.com/cilium/ebpf/asm"
devicesemulator "github.com/opencontainers/runc/libcontainer/cgroups/devices"
"github.com/opencontainers/runc/libcontainer/devices"
"golang.org/x/sys/unix"
)
const (
// license string format is same as kernel MODULE_LICENSE macro
license = "Apache"
)
// DeviceFilter returns eBPF device filter program and its license string
func DeviceFilter(rules []*devices.Rule) (asm.Instructions, string, error) {
// Generate the minimum ruleset for the device rules we are given. While we
// don't care about minimum transitions in cgroupv2, using the emulator
// gives us a guarantee that the behaviour of devices filtering is the same
// as cgroupv1, including security hardenings to avoid misconfiguration
// (such as punching holes in wildcard rules).
emu := new(devicesemulator.Emulator)
for _, rule := range rules {
if err := emu.Apply(*rule); err != nil {
return nil, "", err
}
}
cleanRules, err := emu.Rules()
if err != nil {
return nil, "", err
}
p := &program{
defaultAllow: emu.IsBlacklist(),
}
p.init()
for idx, rule := range cleanRules {
if rule.Type == devices.WildcardDevice {
// We can safely skip over wildcard entries because there should
// only be one (at most) at the very start to instruct cgroupv1 to
// go into allow-list mode. However we do double-check this here.
if idx != 0 || rule.Allow != emu.IsBlacklist() {
return nil, "", fmt.Errorf("[internal error] emulated cgroupv2 devices ruleset had bad wildcard at idx %v (%s)", idx, rule.CgroupString())
}
continue
}
if rule.Allow == p.defaultAllow {
// There should be no rules which have an action equal to the
// default action, the emulator removes those.
return nil, "", fmt.Errorf("[internal error] emulated cgroupv2 devices ruleset had no-op rule at idx %v (%s)", idx, rule.CgroupString())
}
if err := p.appendRule(rule); err != nil {
return nil, "", err
}
}
return p.finalize(), license, nil
}
type program struct {
insts asm.Instructions
defaultAllow bool
blockID int
}
func (p *program) init() {
// struct bpf_cgroup_dev_ctx: https://elixir.bootlin.com/linux/v5.3.6/source/include/uapi/linux/bpf.h#L3423
/*
u32 access_type
u32 major
u32 minor
*/
// R2 <- type (lower 16 bit of u32 access_type at R1[0])
p.insts = append(p.insts,
asm.LoadMem(asm.R2, asm.R1, 0, asm.Word),
asm.And.Imm32(asm.R2, 0xFFFF))
// R3 <- access (upper 16 bit of u32 access_type at R1[0])
p.insts = append(p.insts,
asm.LoadMem(asm.R3, asm.R1, 0, asm.Word),
// RSh: bitwise shift right
asm.RSh.Imm32(asm.R3, 16))
// R4 <- major (u32 major at R1[4])
p.insts = append(p.insts,
asm.LoadMem(asm.R4, asm.R1, 4, asm.Word))
// R5 <- minor (u32 minor at R1[8])
p.insts = append(p.insts,
asm.LoadMem(asm.R5, asm.R1, 8, asm.Word))
}
// appendRule rule converts an OCI rule to the relevant eBPF block and adds it
// to the in-progress filter program. In order to operate properly, it must be
// called with a "clean" rule list (generated by devices.Emulator.Rules() --
// with any "a" rules removed).
func (p *program) appendRule(rule *devices.Rule) error {
if p.blockID < 0 {
return errors.New("the program is finalized")
}
var bpfType int32
switch rule.Type {
case devices.CharDevice:
bpfType = int32(unix.BPF_DEVCG_DEV_CHAR)
case devices.BlockDevice:
bpfType = int32(unix.BPF_DEVCG_DEV_BLOCK)
default:
// We do not permit 'a', nor any other types we don't know about.
return fmt.Errorf("invalid type %q", string(rule.Type))
}
if rule.Major > math.MaxUint32 {
return fmt.Errorf("invalid major %d", rule.Major)
}
if rule.Minor > math.MaxUint32 {
return fmt.Errorf("invalid minor %d", rule.Major)
}
hasMajor := rule.Major >= 0 // if not specified in OCI json, major is set to -1
hasMinor := rule.Minor >= 0
bpfAccess := int32(0)
for _, r := range rule.Permissions {
switch r {
case 'r':
bpfAccess |= unix.BPF_DEVCG_ACC_READ
case 'w':
bpfAccess |= unix.BPF_DEVCG_ACC_WRITE
case 'm':
bpfAccess |= unix.BPF_DEVCG_ACC_MKNOD
default:
return fmt.Errorf("unknown device access %v", r)
}
}
// If the access is rwm, skip the check.
hasAccess := bpfAccess != (unix.BPF_DEVCG_ACC_READ | unix.BPF_DEVCG_ACC_WRITE | unix.BPF_DEVCG_ACC_MKNOD)
var (
blockSym = "block-" + strconv.Itoa(p.blockID)
nextBlockSym = "block-" + strconv.Itoa(p.blockID+1)
prevBlockLastIdx = len(p.insts) - 1
)
p.insts = append(p.insts,
// if (R2 != bpfType) goto next
asm.JNE.Imm(asm.R2, bpfType, nextBlockSym),
)
if hasAccess {
p.insts = append(p.insts,
// if (R3 & bpfAccess != R3 /* use R1 as a temp var */) goto next
asm.Mov.Reg32(asm.R1, asm.R3),
asm.And.Imm32(asm.R1, bpfAccess),
asm.JNE.Reg(asm.R1, asm.R3, nextBlockSym),
)
}
if hasMajor {
p.insts = append(p.insts,
// if (R4 != major) goto next
asm.JNE.Imm(asm.R4, int32(rule.Major), nextBlockSym),
)
}
if hasMinor {
p.insts = append(p.insts,
// if (R5 != minor) goto next
asm.JNE.Imm(asm.R5, int32(rule.Minor), nextBlockSym),
)
}
p.insts = append(p.insts, acceptBlock(rule.Allow)...)
// set blockSym to the first instruction we added in this iteration
p.insts[prevBlockLastIdx+1] = p.insts[prevBlockLastIdx+1].Sym(blockSym)
p.blockID++
return nil
}
func (p *program) finalize() asm.Instructions {
var v int32
if p.defaultAllow {
v = 1
}
blockSym := "block-" + strconv.Itoa(p.blockID)
p.insts = append(p.insts,
// R0 <- v
asm.Mov.Imm32(asm.R0, v).Sym(blockSym),
asm.Return(),
)
p.blockID = -1
return p.insts
}
func acceptBlock(accept bool) asm.Instructions {
var v int32
if accept {
v = 1
}
return []asm.Instruction{
// R0 <- v
asm.Mov.Imm32(asm.R0, v),
asm.Return(),
}
}

View File

@ -1,253 +0,0 @@
package ebpf
import (
"errors"
"fmt"
"os"
"runtime"
"sync"
"unsafe"
"github.com/cilium/ebpf"
"github.com/cilium/ebpf/asm"
"github.com/cilium/ebpf/link"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
)
func nilCloser() error {
return nil
}
func findAttachedCgroupDeviceFilters(dirFd int) ([]*ebpf.Program, error) {
type bpfAttrQuery struct {
TargetFd uint32
AttachType uint32
QueryType uint32
AttachFlags uint32
ProgIds uint64 // __aligned_u64
ProgCnt uint32
}
// Currently you can only have 64 eBPF programs attached to a cgroup.
size := 64
retries := 0
for retries < 10 {
progIds := make([]uint32, size)
query := bpfAttrQuery{
TargetFd: uint32(dirFd),
AttachType: uint32(unix.BPF_CGROUP_DEVICE),
ProgIds: uint64(uintptr(unsafe.Pointer(&progIds[0]))),
ProgCnt: uint32(len(progIds)),
}
// Fetch the list of program ids.
_, _, errno := unix.Syscall(unix.SYS_BPF,
uintptr(unix.BPF_PROG_QUERY),
uintptr(unsafe.Pointer(&query)),
unsafe.Sizeof(query))
size = int(query.ProgCnt)
runtime.KeepAlive(query)
if errno != 0 {
// On ENOSPC we get the correct number of programs.
if errno == unix.ENOSPC {
retries++
continue
}
return nil, fmt.Errorf("bpf_prog_query(BPF_CGROUP_DEVICE) failed: %w", errno)
}
// Convert the ids to program handles.
progIds = progIds[:size]
programs := make([]*ebpf.Program, 0, len(progIds))
for _, progId := range progIds {
program, err := ebpf.NewProgramFromID(ebpf.ProgramID(progId))
if err != nil {
// We skip over programs that give us -EACCES or -EPERM. This
// is necessary because there may be BPF programs that have
// been attached (such as with --systemd-cgroup) which have an
// LSM label that blocks us from interacting with the program.
//
// Because additional BPF_CGROUP_DEVICE programs only can add
// restrictions, there's no real issue with just ignoring these
// programs (and stops runc from breaking on distributions with
// very strict SELinux policies).
if errors.Is(err, os.ErrPermission) {
logrus.Debugf("ignoring existing CGROUP_DEVICE program (prog_id=%v) which cannot be accessed by runc -- likely due to LSM policy: %v", progId, err)
continue
}
return nil, fmt.Errorf("cannot fetch program from id: %w", err)
}
programs = append(programs, program)
}
runtime.KeepAlive(progIds)
return programs, nil
}
return nil, errors.New("could not get complete list of CGROUP_DEVICE programs")
}
var (
haveBpfProgReplaceBool bool
haveBpfProgReplaceOnce sync.Once
)
// Loosely based on the BPF_F_REPLACE support check in
// https://github.com/cilium/ebpf/blob/v0.6.0/link/syscalls.go.
//
// TODO: move this logic to cilium/ebpf
func haveBpfProgReplace() bool {
haveBpfProgReplaceOnce.Do(func() {
prog, err := ebpf.NewProgram(&ebpf.ProgramSpec{
Type: ebpf.CGroupDevice,
License: "MIT",
Instructions: asm.Instructions{
asm.Mov.Imm(asm.R0, 0),
asm.Return(),
},
})
if err != nil {
logrus.Debugf("checking for BPF_F_REPLACE support: ebpf.NewProgram failed: %v", err)
return
}
defer prog.Close()
devnull, err := os.Open("/dev/null")
if err != nil {
logrus.Debugf("checking for BPF_F_REPLACE support: open dummy target fd: %v", err)
return
}
defer devnull.Close()
// We know that we have BPF_PROG_ATTACH since we can load
// BPF_CGROUP_DEVICE programs. If passing BPF_F_REPLACE gives us EINVAL
// we know that the feature isn't present.
err = link.RawAttachProgram(link.RawAttachProgramOptions{
// We rely on this fd being checked after attachFlags.
Target: int(devnull.Fd()),
// Attempt to "replace" bad fds with this program.
Program: prog,
Attach: ebpf.AttachCGroupDevice,
Flags: unix.BPF_F_ALLOW_MULTI | unix.BPF_F_REPLACE,
})
if errors.Is(err, unix.EINVAL) {
// not supported
return
}
// attach_flags test succeeded.
if !errors.Is(err, unix.EBADF) {
logrus.Debugf("checking for BPF_F_REPLACE: got unexpected (not EBADF or EINVAL) error: %v", err)
}
haveBpfProgReplaceBool = true
})
return haveBpfProgReplaceBool
}
// LoadAttachCgroupDeviceFilter installs eBPF device filter program to /sys/fs/cgroup/<foo> directory.
//
// Requires the system to be running in cgroup2 unified-mode with kernel >= 4.15 .
//
// https://github.com/torvalds/linux/commit/ebc614f687369f9df99828572b1d85a7c2de3d92
func LoadAttachCgroupDeviceFilter(insts asm.Instructions, license string, dirFd int) (func() error, error) {
// Increase `ulimit -l` limit to avoid BPF_PROG_LOAD error (#2167).
// This limit is not inherited into the container.
memlockLimit := &unix.Rlimit{
Cur: unix.RLIM_INFINITY,
Max: unix.RLIM_INFINITY,
}
_ = unix.Setrlimit(unix.RLIMIT_MEMLOCK, memlockLimit)
// Get the list of existing programs.
oldProgs, err := findAttachedCgroupDeviceFilters(dirFd)
if err != nil {
return nilCloser, err
}
useReplaceProg := haveBpfProgReplace() && len(oldProgs) == 1
// Generate new program.
spec := &ebpf.ProgramSpec{
Type: ebpf.CGroupDevice,
Instructions: insts,
License: license,
}
prog, err := ebpf.NewProgram(spec)
if err != nil {
return nilCloser, err
}
// If there is only one old program, we can just replace it directly.
var (
replaceProg *ebpf.Program
attachFlags uint32 = unix.BPF_F_ALLOW_MULTI
)
if useReplaceProg {
replaceProg = oldProgs[0]
attachFlags |= unix.BPF_F_REPLACE
}
err = link.RawAttachProgram(link.RawAttachProgramOptions{
Target: dirFd,
Program: prog,
Replace: replaceProg,
Attach: ebpf.AttachCGroupDevice,
Flags: attachFlags,
})
if err != nil {
return nilCloser, fmt.Errorf("failed to call BPF_PROG_ATTACH (BPF_CGROUP_DEVICE, BPF_F_ALLOW_MULTI): %w", err)
}
closer := func() error {
err = link.RawDetachProgram(link.RawDetachProgramOptions{
Target: dirFd,
Program: prog,
Attach: ebpf.AttachCGroupDevice,
})
if err != nil {
return fmt.Errorf("failed to call BPF_PROG_DETACH (BPF_CGROUP_DEVICE): %w", err)
}
// TODO: Should we attach the old filters back in this case? Otherwise
// we fail-open on a security feature, which is a bit scary.
return nil
}
if !useReplaceProg {
logLevel := logrus.DebugLevel
// If there was more than one old program, give a warning (since this
// really shouldn't happen with runc-managed cgroups) and then detach
// all the old programs.
if len(oldProgs) > 1 {
// NOTE: Ideally this should be a warning but it turns out that
// systemd-managed cgroups trigger this warning (apparently
// systemd doesn't delete old non-systemd programs when
// setting properties).
logrus.Infof("found more than one filter (%d) attached to a cgroup -- removing extra filters!", len(oldProgs))
logLevel = logrus.InfoLevel
}
for idx, oldProg := range oldProgs {
// Output some extra debug info.
if info, err := oldProg.Info(); err == nil {
fields := logrus.Fields{
"type": info.Type.String(),
"tag": info.Tag,
"name": info.Name,
}
if id, ok := info.ID(); ok {
fields["id"] = id
}
if runCount, ok := info.RunCount(); ok {
fields["run_count"] = runCount
}
if runtime, ok := info.Runtime(); ok {
fields["runtime"] = runtime.String()
}
logrus.WithFields(fields).Logf(logLevel, "removing old filter %d from cgroup", idx)
}
err = link.RawDetachProgram(link.RawDetachProgramOptions{
Target: dirFd,
Program: oldProg,
Attach: ebpf.AttachCGroupDevice,
})
if err != nil {
return closer, fmt.Errorf("failed to call BPF_PROG_DETACH (BPF_CGROUP_DEVICE) on old filter program: %w", err)
}
}
}
return closer, nil
}

View File

@ -1,190 +0,0 @@
package cgroups
import (
"bytes"
"errors"
"fmt"
"os"
"path"
"strconv"
"strings"
"sync"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
)
// OpenFile opens a cgroup file in a given dir with given flags.
// It is supposed to be used for cgroup files only, and returns
// an error if the file is not a cgroup file.
//
// Arguments dir and file are joined together to form an absolute path
// to a file being opened.
func OpenFile(dir, file string, flags int) (*os.File, error) {
if dir == "" {
return nil, fmt.Errorf("no directory specified for %s", file)
}
return openFile(dir, file, flags)
}
// ReadFile reads data from a cgroup file in dir.
// It is supposed to be used for cgroup files only.
func ReadFile(dir, file string) (string, error) {
fd, err := OpenFile(dir, file, unix.O_RDONLY)
if err != nil {
return "", err
}
defer fd.Close()
var buf bytes.Buffer
_, err = buf.ReadFrom(fd)
return buf.String(), err
}
// WriteFile writes data to a cgroup file in dir.
// It is supposed to be used for cgroup files only.
func WriteFile(dir, file, data string) error {
fd, err := OpenFile(dir, file, unix.O_WRONLY)
if err != nil {
return err
}
defer fd.Close()
if err := retryingWriteFile(fd, data); err != nil {
// Having data in the error message helps in debugging.
return fmt.Errorf("failed to write %q: %w", data, err)
}
return nil
}
func retryingWriteFile(fd *os.File, data string) error {
for {
_, err := fd.Write([]byte(data))
if errors.Is(err, unix.EINTR) {
logrus.Infof("interrupted while writing %s to %s", data, fd.Name())
continue
}
return err
}
}
const (
cgroupfsDir = "/sys/fs/cgroup"
cgroupfsPrefix = cgroupfsDir + "/"
)
var (
// TestMode is set to true by unit tests that need "fake" cgroupfs.
TestMode bool
cgroupFd int = -1
prepOnce sync.Once
prepErr error
resolveFlags uint64
)
func prepareOpenat2() error {
prepOnce.Do(func() {
fd, err := unix.Openat2(-1, cgroupfsDir, &unix.OpenHow{
Flags: unix.O_DIRECTORY | unix.O_PATH,
})
if err != nil {
prepErr = &os.PathError{Op: "openat2", Path: cgroupfsDir, Err: err}
if err != unix.ENOSYS { //nolint:errorlint // unix errors are bare
logrus.Warnf("falling back to securejoin: %s", prepErr)
} else {
logrus.Debug("openat2 not available, falling back to securejoin")
}
return
}
var st unix.Statfs_t
if err = unix.Fstatfs(fd, &st); err != nil {
prepErr = &os.PathError{Op: "statfs", Path: cgroupfsDir, Err: err}
logrus.Warnf("falling back to securejoin: %s", prepErr)
return
}
cgroupFd = fd
resolveFlags = unix.RESOLVE_BENEATH | unix.RESOLVE_NO_MAGICLINKS
if st.Type == unix.CGROUP2_SUPER_MAGIC {
// cgroupv2 has a single mountpoint and no "cpu,cpuacct" symlinks
resolveFlags |= unix.RESOLVE_NO_XDEV | unix.RESOLVE_NO_SYMLINKS
}
})
return prepErr
}
func openFile(dir, file string, flags int) (*os.File, error) {
mode := os.FileMode(0)
if TestMode && flags&os.O_WRONLY != 0 {
// "emulate" cgroup fs for unit tests
flags |= os.O_TRUNC | os.O_CREATE
mode = 0o600
}
path := path.Join(dir, file)
if prepareOpenat2() != nil {
return openFallback(path, flags, mode)
}
relPath := strings.TrimPrefix(path, cgroupfsPrefix)
if len(relPath) == len(path) { // non-standard path, old system?
return openFallback(path, flags, mode)
}
fd, err := unix.Openat2(cgroupFd, relPath,
&unix.OpenHow{
Resolve: resolveFlags,
Flags: uint64(flags) | unix.O_CLOEXEC,
Mode: uint64(mode),
})
if err != nil {
err = &os.PathError{Op: "openat2", Path: path, Err: err}
// Check if cgroupFd is still opened to cgroupfsDir
// (happens when this package is incorrectly used
// across the chroot/pivot_root/mntns boundary, or
// when /sys/fs/cgroup is remounted).
//
// TODO: if such usage will ever be common, amend this
// to reopen cgroupFd and retry openat2.
fdStr := strconv.Itoa(cgroupFd)
fdDest, _ := os.Readlink("/proc/self/fd/" + fdStr)
if fdDest != cgroupfsDir {
// Wrap the error so it is clear that cgroupFd
// is opened to an unexpected/wrong directory.
err = fmt.Errorf("cgroupFd %s unexpectedly opened to %s != %s: %w",
fdStr, fdDest, cgroupfsDir, err)
}
return nil, err
}
return os.NewFile(uintptr(fd), path), nil
}
var errNotCgroupfs = errors.New("not a cgroup file")
// Can be changed by unit tests.
var openFallback = openAndCheck
// openAndCheck is used when openat2(2) is not available. It checks the opened
// file is on cgroupfs, returning an error otherwise.
func openAndCheck(path string, flags int, mode os.FileMode) (*os.File, error) {
fd, err := os.OpenFile(path, flags, mode)
if err != nil {
return nil, err
}
if TestMode {
return fd, nil
}
// Check this is a cgroupfs file.
var st unix.Statfs_t
if err := unix.Fstatfs(int(fd.Fd()), &st); err != nil {
_ = fd.Close()
return nil, &os.PathError{Op: "statfs", Path: path, Err: err}
}
if st.Type != unix.CGROUP_SUPER_MAGIC && st.Type != unix.CGROUP2_SUPER_MAGIC {
_ = fd.Close()
return nil, &os.PathError{Op: "open", Path: path, Err: errNotCgroupfs}
}
return fd, nil
}

View File

@ -1,311 +0,0 @@
package fs
import (
"bufio"
"os"
"path/filepath"
"strconv"
"strings"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
)
type BlkioGroup struct {
weightFilename string
weightDeviceFilename string
}
func (s *BlkioGroup) Name() string {
return "blkio"
}
func (s *BlkioGroup) Apply(path string, _ *configs.Resources, pid int) error {
return apply(path, pid)
}
func (s *BlkioGroup) Set(path string, r *configs.Resources) error {
s.detectWeightFilenames(path)
if r.BlkioWeight != 0 {
if err := cgroups.WriteFile(path, s.weightFilename, strconv.FormatUint(uint64(r.BlkioWeight), 10)); err != nil {
return err
}
}
if r.BlkioLeafWeight != 0 {
if err := cgroups.WriteFile(path, "blkio.leaf_weight", strconv.FormatUint(uint64(r.BlkioLeafWeight), 10)); err != nil {
return err
}
}
for _, wd := range r.BlkioWeightDevice {
if wd.Weight != 0 {
if err := cgroups.WriteFile(path, s.weightDeviceFilename, wd.WeightString()); err != nil {
return err
}
}
if wd.LeafWeight != 0 {
if err := cgroups.WriteFile(path, "blkio.leaf_weight_device", wd.LeafWeightString()); err != nil {
return err
}
}
}
for _, td := range r.BlkioThrottleReadBpsDevice {
if err := cgroups.WriteFile(path, "blkio.throttle.read_bps_device", td.String()); err != nil {
return err
}
}
for _, td := range r.BlkioThrottleWriteBpsDevice {
if err := cgroups.WriteFile(path, "blkio.throttle.write_bps_device", td.String()); err != nil {
return err
}
}
for _, td := range r.BlkioThrottleReadIOPSDevice {
if err := cgroups.WriteFile(path, "blkio.throttle.read_iops_device", td.String()); err != nil {
return err
}
}
for _, td := range r.BlkioThrottleWriteIOPSDevice {
if err := cgroups.WriteFile(path, "blkio.throttle.write_iops_device", td.String()); err != nil {
return err
}
}
return nil
}
/*
examples:
blkio.sectors
8:0 6792
blkio.io_service_bytes
8:0 Read 1282048
8:0 Write 2195456
8:0 Sync 2195456
8:0 Async 1282048
8:0 Total 3477504
Total 3477504
blkio.io_serviced
8:0 Read 124
8:0 Write 104
8:0 Sync 104
8:0 Async 124
8:0 Total 228
Total 228
blkio.io_queued
8:0 Read 0
8:0 Write 0
8:0 Sync 0
8:0 Async 0
8:0 Total 0
Total 0
*/
func splitBlkioStatLine(r rune) bool {
return r == ' ' || r == ':'
}
func getBlkioStat(dir, file string) ([]cgroups.BlkioStatEntry, error) {
var blkioStats []cgroups.BlkioStatEntry
f, err := cgroups.OpenFile(dir, file, os.O_RDONLY)
if err != nil {
if os.IsNotExist(err) {
return blkioStats, nil
}
return nil, err
}
defer f.Close()
sc := bufio.NewScanner(f)
for sc.Scan() {
// format: dev type amount
fields := strings.FieldsFunc(sc.Text(), splitBlkioStatLine)
if len(fields) < 3 {
if len(fields) == 2 && fields[0] == "Total" {
// skip total line
continue
} else {
return nil, malformedLine(dir, file, sc.Text())
}
}
v, err := strconv.ParseUint(fields[0], 10, 64)
if err != nil {
return nil, &parseError{Path: dir, File: file, Err: err}
}
major := v
v, err = strconv.ParseUint(fields[1], 10, 64)
if err != nil {
return nil, &parseError{Path: dir, File: file, Err: err}
}
minor := v
op := ""
valueField := 2
if len(fields) == 4 {
op = fields[2]
valueField = 3
}
v, err = strconv.ParseUint(fields[valueField], 10, 64)
if err != nil {
return nil, &parseError{Path: dir, File: file, Err: err}
}
blkioStats = append(blkioStats, cgroups.BlkioStatEntry{Major: major, Minor: minor, Op: op, Value: v})
}
if err := sc.Err(); err != nil {
return nil, &parseError{Path: dir, File: file, Err: err}
}
return blkioStats, nil
}
func (s *BlkioGroup) GetStats(path string, stats *cgroups.Stats) error {
type blkioStatInfo struct {
filename string
blkioStatEntriesPtr *[]cgroups.BlkioStatEntry
}
bfqDebugStats := []blkioStatInfo{
{
filename: "blkio.bfq.sectors_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.SectorsRecursive,
},
{
filename: "blkio.bfq.io_service_time_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoServiceTimeRecursive,
},
{
filename: "blkio.bfq.io_wait_time_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoWaitTimeRecursive,
},
{
filename: "blkio.bfq.io_merged_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoMergedRecursive,
},
{
filename: "blkio.bfq.io_queued_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoQueuedRecursive,
},
{
filename: "blkio.bfq.time_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoTimeRecursive,
},
{
filename: "blkio.bfq.io_serviced_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoServicedRecursive,
},
{
filename: "blkio.bfq.io_service_bytes_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoServiceBytesRecursive,
},
}
bfqStats := []blkioStatInfo{
{
filename: "blkio.bfq.io_serviced_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoServicedRecursive,
},
{
filename: "blkio.bfq.io_service_bytes_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoServiceBytesRecursive,
},
}
cfqStats := []blkioStatInfo{
{
filename: "blkio.sectors_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.SectorsRecursive,
},
{
filename: "blkio.io_service_time_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoServiceTimeRecursive,
},
{
filename: "blkio.io_wait_time_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoWaitTimeRecursive,
},
{
filename: "blkio.io_merged_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoMergedRecursive,
},
{
filename: "blkio.io_queued_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoQueuedRecursive,
},
{
filename: "blkio.time_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoTimeRecursive,
},
{
filename: "blkio.io_serviced_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoServicedRecursive,
},
{
filename: "blkio.io_service_bytes_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoServiceBytesRecursive,
},
}
throttleRecursiveStats := []blkioStatInfo{
{
filename: "blkio.throttle.io_serviced_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoServicedRecursive,
},
{
filename: "blkio.throttle.io_service_bytes_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoServiceBytesRecursive,
},
}
baseStats := []blkioStatInfo{
{
filename: "blkio.throttle.io_serviced",
blkioStatEntriesPtr: &stats.BlkioStats.IoServicedRecursive,
},
{
filename: "blkio.throttle.io_service_bytes",
blkioStatEntriesPtr: &stats.BlkioStats.IoServiceBytesRecursive,
},
}
orderedStats := [][]blkioStatInfo{
bfqDebugStats,
bfqStats,
cfqStats,
throttleRecursiveStats,
baseStats,
}
var blkioStats []cgroups.BlkioStatEntry
var err error
for _, statGroup := range orderedStats {
for i, statInfo := range statGroup {
if blkioStats, err = getBlkioStat(path, statInfo.filename); err != nil || blkioStats == nil {
// if error occurs on first file, move to next group
if i == 0 {
break
}
return err
}
*statInfo.blkioStatEntriesPtr = blkioStats
// finish if all stats are gathered
if i == len(statGroup)-1 {
return nil
}
}
}
return nil
}
func (s *BlkioGroup) detectWeightFilenames(path string) {
if s.weightFilename != "" {
// Already detected.
return
}
if cgroups.PathExists(filepath.Join(path, "blkio.weight")) {
s.weightFilename = "blkio.weight"
s.weightDeviceFilename = "blkio.weight_device"
} else {
s.weightFilename = "blkio.bfq.weight"
s.weightDeviceFilename = "blkio.bfq.weight_device"
}
}

View File

@ -1,129 +0,0 @@
package fs
import (
"bufio"
"errors"
"fmt"
"os"
"strconv"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"github.com/opencontainers/runc/libcontainer/configs"
"golang.org/x/sys/unix"
)
type CpuGroup struct{}
func (s *CpuGroup) Name() string {
return "cpu"
}
func (s *CpuGroup) Apply(path string, r *configs.Resources, pid int) error {
if err := os.MkdirAll(path, 0o755); err != nil {
return err
}
// We should set the real-Time group scheduling settings before moving
// in the process because if the process is already in SCHED_RR mode
// and no RT bandwidth is set, adding it will fail.
if err := s.SetRtSched(path, r); err != nil {
return err
}
// Since we are not using apply(), we need to place the pid
// into the procs file.
return cgroups.WriteCgroupProc(path, pid)
}
func (s *CpuGroup) SetRtSched(path string, r *configs.Resources) error {
if r.CpuRtPeriod != 0 {
if err := cgroups.WriteFile(path, "cpu.rt_period_us", strconv.FormatUint(r.CpuRtPeriod, 10)); err != nil {
return err
}
}
if r.CpuRtRuntime != 0 {
if err := cgroups.WriteFile(path, "cpu.rt_runtime_us", strconv.FormatInt(r.CpuRtRuntime, 10)); err != nil {
return err
}
}
return nil
}
func (s *CpuGroup) Set(path string, r *configs.Resources) error {
if r.CpuShares != 0 {
shares := r.CpuShares
if err := cgroups.WriteFile(path, "cpu.shares", strconv.FormatUint(shares, 10)); err != nil {
return err
}
// read it back
sharesRead, err := fscommon.GetCgroupParamUint(path, "cpu.shares")
if err != nil {
return err
}
// ... and check
if shares > sharesRead {
return fmt.Errorf("the maximum allowed cpu-shares is %d", sharesRead)
} else if shares < sharesRead {
return fmt.Errorf("the minimum allowed cpu-shares is %d", sharesRead)
}
}
var period string
if r.CpuPeriod != 0 {
period = strconv.FormatUint(r.CpuPeriod, 10)
if err := cgroups.WriteFile(path, "cpu.cfs_period_us", period); err != nil {
// Sometimes when the period to be set is smaller
// than the current one, it is rejected by the kernel
// (EINVAL) as old_quota/new_period exceeds the parent
// cgroup quota limit. If this happens and the quota is
// going to be set, ignore the error for now and retry
// after setting the quota.
if !errors.Is(err, unix.EINVAL) || r.CpuQuota == 0 {
return err
}
} else {
period = ""
}
}
if r.CpuQuota != 0 {
if err := cgroups.WriteFile(path, "cpu.cfs_quota_us", strconv.FormatInt(r.CpuQuota, 10)); err != nil {
return err
}
if period != "" {
if err := cgroups.WriteFile(path, "cpu.cfs_period_us", period); err != nil {
return err
}
}
}
return s.SetRtSched(path, r)
}
func (s *CpuGroup) GetStats(path string, stats *cgroups.Stats) error {
const file = "cpu.stat"
f, err := cgroups.OpenFile(path, file, os.O_RDONLY)
if err != nil {
if os.IsNotExist(err) {
return nil
}
return err
}
defer f.Close()
sc := bufio.NewScanner(f)
for sc.Scan() {
t, v, err := fscommon.ParseKeyValue(sc.Text())
if err != nil {
return &parseError{Path: path, File: file, Err: err}
}
switch t {
case "nr_periods":
stats.CpuStats.ThrottlingData.Periods = v
case "nr_throttled":
stats.CpuStats.ThrottlingData.ThrottledPeriods = v
case "throttled_time":
stats.CpuStats.ThrottlingData.ThrottledTime = v
}
}
return nil
}

View File

@ -1,166 +0,0 @@
package fs
import (
"bufio"
"os"
"strconv"
"strings"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"github.com/opencontainers/runc/libcontainer/configs"
)
const (
cgroupCpuacctStat = "cpuacct.stat"
cgroupCpuacctUsageAll = "cpuacct.usage_all"
nanosecondsInSecond = 1000000000
userModeColumn = 1
kernelModeColumn = 2
cuacctUsageAllColumnsNumber = 3
// The value comes from `C.sysconf(C._SC_CLK_TCK)`, and
// on Linux it's a constant which is safe to be hard coded,
// so we can avoid using cgo here. For details, see:
// https://github.com/containerd/cgroups/pull/12
clockTicks uint64 = 100
)
type CpuacctGroup struct{}
func (s *CpuacctGroup) Name() string {
return "cpuacct"
}
func (s *CpuacctGroup) Apply(path string, _ *configs.Resources, pid int) error {
return apply(path, pid)
}
func (s *CpuacctGroup) Set(_ string, _ *configs.Resources) error {
return nil
}
func (s *CpuacctGroup) GetStats(path string, stats *cgroups.Stats) error {
if !cgroups.PathExists(path) {
return nil
}
userModeUsage, kernelModeUsage, err := getCpuUsageBreakdown(path)
if err != nil {
return err
}
totalUsage, err := fscommon.GetCgroupParamUint(path, "cpuacct.usage")
if err != nil {
return err
}
percpuUsage, err := getPercpuUsage(path)
if err != nil {
return err
}
percpuUsageInKernelmode, percpuUsageInUsermode, err := getPercpuUsageInModes(path)
if err != nil {
return err
}
stats.CpuStats.CpuUsage.TotalUsage = totalUsage
stats.CpuStats.CpuUsage.PercpuUsage = percpuUsage
stats.CpuStats.CpuUsage.PercpuUsageInKernelmode = percpuUsageInKernelmode
stats.CpuStats.CpuUsage.PercpuUsageInUsermode = percpuUsageInUsermode
stats.CpuStats.CpuUsage.UsageInUsermode = userModeUsage
stats.CpuStats.CpuUsage.UsageInKernelmode = kernelModeUsage
return nil
}
// Returns user and kernel usage breakdown in nanoseconds.
func getCpuUsageBreakdown(path string) (uint64, uint64, error) {
var userModeUsage, kernelModeUsage uint64
const (
userField = "user"
systemField = "system"
file = cgroupCpuacctStat
)
// Expected format:
// user <usage in ticks>
// system <usage in ticks>
data, err := cgroups.ReadFile(path, file)
if err != nil {
return 0, 0, err
}
// TODO: use strings.SplitN instead.
fields := strings.Fields(data)
if len(fields) < 4 || fields[0] != userField || fields[2] != systemField {
return 0, 0, malformedLine(path, file, data)
}
if userModeUsage, err = strconv.ParseUint(fields[1], 10, 64); err != nil {
return 0, 0, &parseError{Path: path, File: file, Err: err}
}
if kernelModeUsage, err = strconv.ParseUint(fields[3], 10, 64); err != nil {
return 0, 0, &parseError{Path: path, File: file, Err: err}
}
return (userModeUsage * nanosecondsInSecond) / clockTicks, (kernelModeUsage * nanosecondsInSecond) / clockTicks, nil
}
func getPercpuUsage(path string) ([]uint64, error) {
const file = "cpuacct.usage_percpu"
percpuUsage := []uint64{}
data, err := cgroups.ReadFile(path, file)
if err != nil {
return percpuUsage, err
}
// TODO: use strings.SplitN instead.
for _, value := range strings.Fields(data) {
value, err := strconv.ParseUint(value, 10, 64)
if err != nil {
return percpuUsage, &parseError{Path: path, File: file, Err: err}
}
percpuUsage = append(percpuUsage, value)
}
return percpuUsage, nil
}
func getPercpuUsageInModes(path string) ([]uint64, []uint64, error) {
usageKernelMode := []uint64{}
usageUserMode := []uint64{}
const file = cgroupCpuacctUsageAll
fd, err := cgroups.OpenFile(path, file, os.O_RDONLY)
if os.IsNotExist(err) {
return usageKernelMode, usageUserMode, nil
} else if err != nil {
return nil, nil, err
}
defer fd.Close()
scanner := bufio.NewScanner(fd)
scanner.Scan() // skipping header line
for scanner.Scan() {
lineFields := strings.SplitN(scanner.Text(), " ", cuacctUsageAllColumnsNumber+1)
if len(lineFields) != cuacctUsageAllColumnsNumber {
continue
}
usageInKernelMode, err := strconv.ParseUint(lineFields[kernelModeColumn], 10, 64)
if err != nil {
return nil, nil, &parseError{Path: path, File: file, Err: err}
}
usageKernelMode = append(usageKernelMode, usageInKernelMode)
usageInUserMode, err := strconv.ParseUint(lineFields[userModeColumn], 10, 64)
if err != nil {
return nil, nil, &parseError{Path: path, File: file, Err: err}
}
usageUserMode = append(usageUserMode, usageInUserMode)
}
if err := scanner.Err(); err != nil {
return nil, nil, &parseError{Path: path, File: file, Err: err}
}
return usageKernelMode, usageUserMode, nil
}

View File

@ -1,245 +0,0 @@
package fs
import (
"errors"
"os"
"path/filepath"
"strconv"
"strings"
"golang.org/x/sys/unix"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"github.com/opencontainers/runc/libcontainer/configs"
)
type CpusetGroup struct{}
func (s *CpusetGroup) Name() string {
return "cpuset"
}
func (s *CpusetGroup) Apply(path string, r *configs.Resources, pid int) error {
return s.ApplyDir(path, r, pid)
}
func (s *CpusetGroup) Set(path string, r *configs.Resources) error {
if r.CpusetCpus != "" {
if err := cgroups.WriteFile(path, "cpuset.cpus", r.CpusetCpus); err != nil {
return err
}
}
if r.CpusetMems != "" {
if err := cgroups.WriteFile(path, "cpuset.mems", r.CpusetMems); err != nil {
return err
}
}
return nil
}
func getCpusetStat(path string, file string) ([]uint16, error) {
var extracted []uint16
fileContent, err := fscommon.GetCgroupParamString(path, file)
if err != nil {
return extracted, err
}
if len(fileContent) == 0 {
return extracted, &parseError{Path: path, File: file, Err: errors.New("empty file")}
}
for _, s := range strings.Split(fileContent, ",") {
sp := strings.SplitN(s, "-", 3)
switch len(sp) {
case 3:
return extracted, &parseError{Path: path, File: file, Err: errors.New("extra dash")}
case 2:
min, err := strconv.ParseUint(sp[0], 10, 16)
if err != nil {
return extracted, &parseError{Path: path, File: file, Err: err}
}
max, err := strconv.ParseUint(sp[1], 10, 16)
if err != nil {
return extracted, &parseError{Path: path, File: file, Err: err}
}
if min > max {
return extracted, &parseError{Path: path, File: file, Err: errors.New("invalid values, min > max")}
}
for i := min; i <= max; i++ {
extracted = append(extracted, uint16(i))
}
case 1:
value, err := strconv.ParseUint(s, 10, 16)
if err != nil {
return extracted, &parseError{Path: path, File: file, Err: err}
}
extracted = append(extracted, uint16(value))
}
}
return extracted, nil
}
func (s *CpusetGroup) GetStats(path string, stats *cgroups.Stats) error {
var err error
stats.CPUSetStats.CPUs, err = getCpusetStat(path, "cpuset.cpus")
if err != nil && !errors.Is(err, os.ErrNotExist) {
return err
}
stats.CPUSetStats.CPUExclusive, err = fscommon.GetCgroupParamUint(path, "cpuset.cpu_exclusive")
if err != nil && !errors.Is(err, os.ErrNotExist) {
return err
}
stats.CPUSetStats.Mems, err = getCpusetStat(path, "cpuset.mems")
if err != nil && !errors.Is(err, os.ErrNotExist) {
return err
}
stats.CPUSetStats.MemHardwall, err = fscommon.GetCgroupParamUint(path, "cpuset.mem_hardwall")
if err != nil && !errors.Is(err, os.ErrNotExist) {
return err
}
stats.CPUSetStats.MemExclusive, err = fscommon.GetCgroupParamUint(path, "cpuset.mem_exclusive")
if err != nil && !errors.Is(err, os.ErrNotExist) {
return err
}
stats.CPUSetStats.MemoryMigrate, err = fscommon.GetCgroupParamUint(path, "cpuset.memory_migrate")
if err != nil && !errors.Is(err, os.ErrNotExist) {
return err
}
stats.CPUSetStats.MemorySpreadPage, err = fscommon.GetCgroupParamUint(path, "cpuset.memory_spread_page")
if err != nil && !errors.Is(err, os.ErrNotExist) {
return err
}
stats.CPUSetStats.MemorySpreadSlab, err = fscommon.GetCgroupParamUint(path, "cpuset.memory_spread_slab")
if err != nil && !errors.Is(err, os.ErrNotExist) {
return err
}
stats.CPUSetStats.MemoryPressure, err = fscommon.GetCgroupParamUint(path, "cpuset.memory_pressure")
if err != nil && !errors.Is(err, os.ErrNotExist) {
return err
}
stats.CPUSetStats.SchedLoadBalance, err = fscommon.GetCgroupParamUint(path, "cpuset.sched_load_balance")
if err != nil && !errors.Is(err, os.ErrNotExist) {
return err
}
stats.CPUSetStats.SchedRelaxDomainLevel, err = fscommon.GetCgroupParamInt(path, "cpuset.sched_relax_domain_level")
if err != nil && !errors.Is(err, os.ErrNotExist) {
return err
}
return nil
}
func (s *CpusetGroup) ApplyDir(dir string, r *configs.Resources, pid int) error {
// This might happen if we have no cpuset cgroup mounted.
// Just do nothing and don't fail.
if dir == "" {
return nil
}
// 'ensureParent' start with parent because we don't want to
// explicitly inherit from parent, it could conflict with
// 'cpuset.cpu_exclusive'.
if err := cpusetEnsureParent(filepath.Dir(dir)); err != nil {
return err
}
if err := os.Mkdir(dir, 0o755); err != nil && !os.IsExist(err) {
return err
}
// We didn't inherit cpuset configs from parent, but we have
// to ensure cpuset configs are set before moving task into the
// cgroup.
// The logic is, if user specified cpuset configs, use these
// specified configs, otherwise, inherit from parent. This makes
// cpuset configs work correctly with 'cpuset.cpu_exclusive', and
// keep backward compatibility.
if err := s.ensureCpusAndMems(dir, r); err != nil {
return err
}
// Since we are not using apply(), we need to place the pid
// into the procs file.
return cgroups.WriteCgroupProc(dir, pid)
}
func getCpusetSubsystemSettings(parent string) (cpus, mems string, err error) {
if cpus, err = cgroups.ReadFile(parent, "cpuset.cpus"); err != nil {
return
}
if mems, err = cgroups.ReadFile(parent, "cpuset.mems"); err != nil {
return
}
return cpus, mems, nil
}
// cpusetEnsureParent makes sure that the parent directories of current
// are created and populated with the proper cpus and mems files copied
// from their respective parent. It does that recursively, starting from
// the top of the cpuset hierarchy (i.e. cpuset cgroup mount point).
func cpusetEnsureParent(current string) error {
var st unix.Statfs_t
parent := filepath.Dir(current)
err := unix.Statfs(parent, &st)
if err == nil && st.Type != unix.CGROUP_SUPER_MAGIC {
return nil
}
// Treat non-existing directory as cgroupfs as it will be created,
// and the root cpuset directory obviously exists.
if err != nil && err != unix.ENOENT { //nolint:errorlint // unix errors are bare
return &os.PathError{Op: "statfs", Path: parent, Err: err}
}
if err := cpusetEnsureParent(parent); err != nil {
return err
}
if err := os.Mkdir(current, 0o755); err != nil && !os.IsExist(err) {
return err
}
return cpusetCopyIfNeeded(current, parent)
}
// cpusetCopyIfNeeded copies the cpuset.cpus and cpuset.mems from the parent
// directory to the current directory if the file's contents are 0
func cpusetCopyIfNeeded(current, parent string) error {
currentCpus, currentMems, err := getCpusetSubsystemSettings(current)
if err != nil {
return err
}
parentCpus, parentMems, err := getCpusetSubsystemSettings(parent)
if err != nil {
return err
}
if isEmptyCpuset(currentCpus) {
if err := cgroups.WriteFile(current, "cpuset.cpus", parentCpus); err != nil {
return err
}
}
if isEmptyCpuset(currentMems) {
if err := cgroups.WriteFile(current, "cpuset.mems", parentMems); err != nil {
return err
}
}
return nil
}
func isEmptyCpuset(str string) bool {
return str == "" || str == "\n"
}
func (s *CpusetGroup) ensureCpusAndMems(path string, r *configs.Resources) error {
if err := s.Set(path, r); err != nil {
return err
}
return cpusetCopyIfNeeded(path, filepath.Dir(path))
}

View File

@ -1,109 +0,0 @@
package fs
import (
"bytes"
"errors"
"reflect"
"github.com/opencontainers/runc/libcontainer/cgroups"
cgroupdevices "github.com/opencontainers/runc/libcontainer/cgroups/devices"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/devices"
"github.com/opencontainers/runc/libcontainer/userns"
)
type DevicesGroup struct {
TestingSkipFinalCheck bool
}
func (s *DevicesGroup) Name() string {
return "devices"
}
func (s *DevicesGroup) Apply(path string, r *configs.Resources, pid int) error {
if r.SkipDevices {
return nil
}
if path == "" {
// Return error here, since devices cgroup
// is a hard requirement for container's security.
return errSubsystemDoesNotExist
}
return apply(path, pid)
}
func loadEmulator(path string) (*cgroupdevices.Emulator, error) {
list, err := cgroups.ReadFile(path, "devices.list")
if err != nil {
return nil, err
}
return cgroupdevices.EmulatorFromList(bytes.NewBufferString(list))
}
func buildEmulator(rules []*devices.Rule) (*cgroupdevices.Emulator, error) {
// This defaults to a white-list -- which is what we want!
emu := &cgroupdevices.Emulator{}
for _, rule := range rules {
if err := emu.Apply(*rule); err != nil {
return nil, err
}
}
return emu, nil
}
func (s *DevicesGroup) Set(path string, r *configs.Resources) error {
if userns.RunningInUserNS() || r.SkipDevices {
return nil
}
// Generate two emulators, one for the current state of the cgroup and one
// for the requested state by the user.
current, err := loadEmulator(path)
if err != nil {
return err
}
target, err := buildEmulator(r.Devices)
if err != nil {
return err
}
// Compute the minimal set of transition rules needed to achieve the
// requested state.
transitionRules, err := current.Transition(target)
if err != nil {
return err
}
for _, rule := range transitionRules {
file := "devices.deny"
if rule.Allow {
file = "devices.allow"
}
if err := cgroups.WriteFile(path, file, rule.CgroupString()); err != nil {
return err
}
}
// Final safety check -- ensure that the resulting state is what was
// requested. This is only really correct for white-lists, but for
// black-lists we can at least check that the cgroup is in the right mode.
//
// This safety-check is skipped for the unit tests because we cannot
// currently mock devices.list correctly.
if !s.TestingSkipFinalCheck {
currentAfter, err := loadEmulator(path)
if err != nil {
return err
}
if !target.IsBlacklist() && !reflect.DeepEqual(currentAfter, target) {
return errors.New("resulting devices cgroup doesn't precisely match target")
} else if target.IsBlacklist() != currentAfter.IsBlacklist() {
return errors.New("resulting devices cgroup doesn't match target mode")
}
}
return nil
}
func (s *DevicesGroup) GetStats(path string, stats *cgroups.Stats) error {
return nil
}

View File

@ -1,15 +0,0 @@
package fs
import (
"fmt"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
)
type parseError = fscommon.ParseError
// malformedLine is used by all cgroupfs file parsers that expect a line
// in a particular format but get some garbage instead.
func malformedLine(path, file, line string) error {
return &parseError{Path: path, File: file, Err: fmt.Errorf("malformed line: %s", line)}
}

View File

@ -1,158 +0,0 @@
package fs
import (
"errors"
"fmt"
"os"
"strings"
"time"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
)
type FreezerGroup struct{}
func (s *FreezerGroup) Name() string {
return "freezer"
}
func (s *FreezerGroup) Apply(path string, _ *configs.Resources, pid int) error {
return apply(path, pid)
}
func (s *FreezerGroup) Set(path string, r *configs.Resources) (Err error) {
switch r.Freezer {
case configs.Frozen:
defer func() {
if Err != nil {
// Freezing failed, and it is bad and dangerous
// to leave the cgroup in FROZEN or FREEZING
// state, so (try to) thaw it back.
_ = cgroups.WriteFile(path, "freezer.state", string(configs.Thawed))
}
}()
// As per older kernel docs (freezer-subsystem.txt before
// kernel commit ef9fe980c6fcc1821), if FREEZING is seen,
// userspace should either retry or thaw. While current
// kernel cgroup v1 docs no longer mention a need to retry,
// even a recent kernel (v5.4, Ubuntu 20.04) can't reliably
// freeze a cgroup v1 while new processes keep appearing in it
// (either via fork/clone or by writing new PIDs to
// cgroup.procs).
//
// The numbers below are empirically chosen to have a decent
// chance to succeed in various scenarios ("runc pause/unpause
// with parallel runc exec" and "bare freeze/unfreeze on a very
// slow system"), tested on RHEL7 and Ubuntu 20.04 kernels.
//
// Adding any amount of sleep in between retries did not
// increase the chances of successful freeze in "pause/unpause
// with parallel exec" reproducer. OTOH, adding an occasional
// sleep helped for the case where the system is extremely slow
// (CentOS 7 VM on GHA CI).
//
// Alas, this is still a game of chances, since the real fix
// belong to the kernel (cgroup v2 do not have this bug).
for i := 0; i < 1000; i++ {
if i%50 == 49 {
// Occasional thaw and sleep improves
// the chances to succeed in freezing
// in case new processes keep appearing
// in the cgroup.
_ = cgroups.WriteFile(path, "freezer.state", string(configs.Thawed))
time.Sleep(10 * time.Millisecond)
}
if err := cgroups.WriteFile(path, "freezer.state", string(configs.Frozen)); err != nil {
return err
}
if i%25 == 24 {
// Occasional short sleep before reading
// the state back also improves the chances to
// succeed in freezing in case of a very slow
// system.
time.Sleep(10 * time.Microsecond)
}
state, err := cgroups.ReadFile(path, "freezer.state")
if err != nil {
return err
}
state = strings.TrimSpace(state)
switch state {
case "FREEZING":
continue
case string(configs.Frozen):
if i > 1 {
logrus.Debugf("frozen after %d retries", i)
}
return nil
default:
// should never happen
return fmt.Errorf("unexpected state %s while freezing", strings.TrimSpace(state))
}
}
// Despite our best efforts, it got stuck in FREEZING.
return errors.New("unable to freeze")
case configs.Thawed:
return cgroups.WriteFile(path, "freezer.state", string(configs.Thawed))
case configs.Undefined:
return nil
default:
return fmt.Errorf("Invalid argument '%s' to freezer.state", string(r.Freezer))
}
}
func (s *FreezerGroup) GetStats(path string, stats *cgroups.Stats) error {
return nil
}
func (s *FreezerGroup) GetState(path string) (configs.FreezerState, error) {
for {
state, err := cgroups.ReadFile(path, "freezer.state")
if err != nil {
// If the kernel is too old, then we just treat the freezer as
// being in an "undefined" state.
if os.IsNotExist(err) || errors.Is(err, unix.ENODEV) {
err = nil
}
return configs.Undefined, err
}
switch strings.TrimSpace(state) {
case "THAWED":
return configs.Thawed, nil
case "FROZEN":
// Find out whether the cgroup is frozen directly,
// or indirectly via an ancestor.
self, err := cgroups.ReadFile(path, "freezer.self_freezing")
if err != nil {
// If the kernel is too old, then we just treat
// it as being frozen.
if errors.Is(err, os.ErrNotExist) || errors.Is(err, unix.ENODEV) {
err = nil
}
return configs.Frozen, err
}
switch self {
case "0\n":
return configs.Thawed, nil
case "1\n":
return configs.Frozen, nil
default:
return configs.Undefined, fmt.Errorf(`unknown "freezer.self_freezing" state: %q`, self)
}
case "FREEZING":
// Make sure we get a stable freezer state, so retry if the cgroup
// is still undergoing freezing. This should be a temporary delay.
time.Sleep(1 * time.Millisecond)
continue
default:
return configs.Undefined, fmt.Errorf("unknown freezer.state %q", state)
}
}
}

View File

@ -1,265 +0,0 @@
package fs
import (
"errors"
"fmt"
"os"
"sync"
"golang.org/x/sys/unix"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"github.com/opencontainers/runc/libcontainer/configs"
)
var subsystems = []subsystem{
&CpusetGroup{},
&DevicesGroup{},
&MemoryGroup{},
&CpuGroup{},
&CpuacctGroup{},
&PidsGroup{},
&BlkioGroup{},
&HugetlbGroup{},
&NetClsGroup{},
&NetPrioGroup{},
&PerfEventGroup{},
&FreezerGroup{},
&RdmaGroup{},
&NameGroup{GroupName: "name=systemd", Join: true},
&NameGroup{GroupName: "misc", Join: true},
}
var errSubsystemDoesNotExist = errors.New("cgroup: subsystem does not exist")
func init() {
// If using cgroups-hybrid mode then add a "" controller indicating
// it should join the cgroups v2.
if cgroups.IsCgroup2HybridMode() {
subsystems = append(subsystems, &NameGroup{GroupName: "", Join: true})
}
}
type subsystem interface {
// Name returns the name of the subsystem.
Name() string
// GetStats fills in the stats for the subsystem.
GetStats(path string, stats *cgroups.Stats) error
// Apply creates and joins a cgroup, adding pid into it. Some
// subsystems use resources to pre-configure the cgroup parents
// before creating or joining it.
Apply(path string, r *configs.Resources, pid int) error
// Set sets the cgroup resources.
Set(path string, r *configs.Resources) error
}
type manager struct {
mu sync.Mutex
cgroups *configs.Cgroup
paths map[string]string
}
func NewManager(cg *configs.Cgroup, paths map[string]string) (cgroups.Manager, error) {
// Some v1 controllers (cpu, cpuset, and devices) expect
// cgroups.Resources to not be nil in Apply.
if cg.Resources == nil {
return nil, errors.New("cgroup v1 manager needs configs.Resources to be set during manager creation")
}
if cg.Resources.Unified != nil {
return nil, cgroups.ErrV1NoUnified
}
if paths == nil {
var err error
paths, err = initPaths(cg)
if err != nil {
return nil, err
}
}
return &manager{
cgroups: cg,
paths: paths,
}, nil
}
// isIgnorableError returns whether err is a permission error (in the loose
// sense of the word). This includes EROFS (which for an unprivileged user is
// basically a permission error) and EACCES (for similar reasons) as well as
// the normal EPERM.
func isIgnorableError(rootless bool, err error) bool {
// We do not ignore errors if we are root.
if !rootless {
return false
}
// Is it an ordinary EPERM?
if errors.Is(err, os.ErrPermission) {
return true
}
// Handle some specific syscall errors.
var errno unix.Errno
if errors.As(err, &errno) {
return errno == unix.EROFS || errno == unix.EPERM || errno == unix.EACCES
}
return false
}
func (m *manager) Apply(pid int) (err error) {
m.mu.Lock()
defer m.mu.Unlock()
c := m.cgroups
for _, sys := range subsystems {
name := sys.Name()
p, ok := m.paths[name]
if !ok {
continue
}
if err := sys.Apply(p, c.Resources, pid); err != nil {
// In the case of rootless (including euid=0 in userns), where an
// explicit cgroup path hasn't been set, we don't bail on error in
// case of permission problems here, but do delete the path from
// the m.paths map, since it is either non-existent and could not
// be created, or the pid could not be added to it.
//
// Cases where limits for the subsystem have been set are handled
// later by Set, which fails with a friendly error (see
// if path == "" in Set).
if isIgnorableError(c.Rootless, err) && c.Path == "" {
delete(m.paths, name)
continue
}
return err
}
}
return nil
}
func (m *manager) Destroy() error {
m.mu.Lock()
defer m.mu.Unlock()
return cgroups.RemovePaths(m.paths)
}
func (m *manager) Path(subsys string) string {
m.mu.Lock()
defer m.mu.Unlock()
return m.paths[subsys]
}
func (m *manager) GetStats() (*cgroups.Stats, error) {
m.mu.Lock()
defer m.mu.Unlock()
stats := cgroups.NewStats()
for _, sys := range subsystems {
path := m.paths[sys.Name()]
if path == "" {
continue
}
if err := sys.GetStats(path, stats); err != nil {
return nil, err
}
}
return stats, nil
}
func (m *manager) Set(r *configs.Resources) error {
if r == nil {
return nil
}
if r.Unified != nil {
return cgroups.ErrV1NoUnified
}
m.mu.Lock()
defer m.mu.Unlock()
for _, sys := range subsystems {
path := m.paths[sys.Name()]
if err := sys.Set(path, r); err != nil {
// When rootless is true, errors from the device subsystem
// are ignored, as it is really not expected to work.
if m.cgroups.Rootless && sys.Name() == "devices" {
continue
}
// However, errors from other subsystems are not ignored.
// see @test "runc create (rootless + limits + no cgrouppath + no permission) fails with informative error"
if path == "" {
// We never created a path for this cgroup, so we cannot set
// limits for it (though we have already tried at this point).
return fmt.Errorf("cannot set %s limit: container could not join or create cgroup", sys.Name())
}
return err
}
}
return nil
}
// Freeze toggles the container's freezer cgroup depending on the state
// provided
func (m *manager) Freeze(state configs.FreezerState) error {
path := m.Path("freezer")
if path == "" {
return errors.New("cannot toggle freezer: cgroups not configured for container")
}
prevState := m.cgroups.Resources.Freezer
m.cgroups.Resources.Freezer = state
freezer := &FreezerGroup{}
if err := freezer.Set(path, m.cgroups.Resources); err != nil {
m.cgroups.Resources.Freezer = prevState
return err
}
return nil
}
func (m *manager) GetPids() ([]int, error) {
return cgroups.GetPids(m.Path("devices"))
}
func (m *manager) GetAllPids() ([]int, error) {
return cgroups.GetAllPids(m.Path("devices"))
}
func (m *manager) GetPaths() map[string]string {
m.mu.Lock()
defer m.mu.Unlock()
return m.paths
}
func (m *manager) GetCgroups() (*configs.Cgroup, error) {
return m.cgroups, nil
}
func (m *manager) GetFreezerState() (configs.FreezerState, error) {
dir := m.Path("freezer")
// If the container doesn't have the freezer cgroup, say it's undefined.
if dir == "" {
return configs.Undefined, nil
}
freezer := &FreezerGroup{}
return freezer.GetState(dir)
}
func (m *manager) Exists() bool {
return cgroups.PathExists(m.Path("devices"))
}
func OOMKillCount(path string) (uint64, error) {
return fscommon.GetValueByKey(path, "memory.oom_control", "oom_kill")
}
func (m *manager) OOMKillCount() (uint64, error) {
c, err := OOMKillCount(m.Path("memory"))
// Ignore ENOENT when rootless as it couldn't create cgroup.
if err != nil && m.cgroups.Rootless && os.IsNotExist(err) {
err = nil
}
return c, err
}

View File

@ -1,62 +0,0 @@
package fs
import (
"strconv"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"github.com/opencontainers/runc/libcontainer/configs"
)
type HugetlbGroup struct{}
func (s *HugetlbGroup) Name() string {
return "hugetlb"
}
func (s *HugetlbGroup) Apply(path string, _ *configs.Resources, pid int) error {
return apply(path, pid)
}
func (s *HugetlbGroup) Set(path string, r *configs.Resources) error {
for _, hugetlb := range r.HugetlbLimit {
if err := cgroups.WriteFile(path, "hugetlb."+hugetlb.Pagesize+".limit_in_bytes", strconv.FormatUint(hugetlb.Limit, 10)); err != nil {
return err
}
}
return nil
}
func (s *HugetlbGroup) GetStats(path string, stats *cgroups.Stats) error {
if !cgroups.PathExists(path) {
return nil
}
hugetlbStats := cgroups.HugetlbStats{}
for _, pageSize := range cgroups.HugePageSizes() {
usage := "hugetlb." + pageSize + ".usage_in_bytes"
value, err := fscommon.GetCgroupParamUint(path, usage)
if err != nil {
return err
}
hugetlbStats.Usage = value
maxUsage := "hugetlb." + pageSize + ".max_usage_in_bytes"
value, err = fscommon.GetCgroupParamUint(path, maxUsage)
if err != nil {
return err
}
hugetlbStats.MaxUsage = value
failcnt := "hugetlb." + pageSize + ".failcnt"
value, err = fscommon.GetCgroupParamUint(path, failcnt)
if err != nil {
return err
}
hugetlbStats.Failcnt = value
stats.HugetlbStats[pageSize] = hugetlbStats
}
return nil
}

View File

@ -1,348 +0,0 @@
package fs
import (
"bufio"
"errors"
"fmt"
"math"
"os"
"path/filepath"
"strconv"
"strings"
"golang.org/x/sys/unix"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"github.com/opencontainers/runc/libcontainer/configs"
)
const (
cgroupMemorySwapLimit = "memory.memsw.limit_in_bytes"
cgroupMemoryLimit = "memory.limit_in_bytes"
cgroupMemoryUsage = "memory.usage_in_bytes"
cgroupMemoryMaxUsage = "memory.max_usage_in_bytes"
)
type MemoryGroup struct{}
func (s *MemoryGroup) Name() string {
return "memory"
}
func (s *MemoryGroup) Apply(path string, _ *configs.Resources, pid int) error {
return apply(path, pid)
}
func setMemory(path string, val int64) error {
if val == 0 {
return nil
}
err := cgroups.WriteFile(path, cgroupMemoryLimit, strconv.FormatInt(val, 10))
if !errors.Is(err, unix.EBUSY) {
return err
}
// EBUSY means the kernel can't set new limit as it's too low
// (lower than the current usage). Return more specific error.
usage, err := fscommon.GetCgroupParamUint(path, cgroupMemoryUsage)
if err != nil {
return err
}
max, err := fscommon.GetCgroupParamUint(path, cgroupMemoryMaxUsage)
if err != nil {
return err
}
return fmt.Errorf("unable to set memory limit to %d (current usage: %d, peak usage: %d)", val, usage, max)
}
func setSwap(path string, val int64) error {
if val == 0 {
return nil
}
return cgroups.WriteFile(path, cgroupMemorySwapLimit, strconv.FormatInt(val, 10))
}
func setMemoryAndSwap(path string, r *configs.Resources) error {
// If the memory update is set to -1 and the swap is not explicitly
// set, we should also set swap to -1, it means unlimited memory.
if r.Memory == -1 && r.MemorySwap == 0 {
// Only set swap if it's enabled in kernel
if cgroups.PathExists(filepath.Join(path, cgroupMemorySwapLimit)) {
r.MemorySwap = -1
}
}
// When memory and swap memory are both set, we need to handle the cases
// for updating container.
if r.Memory != 0 && r.MemorySwap != 0 {
curLimit, err := fscommon.GetCgroupParamUint(path, cgroupMemoryLimit)
if err != nil {
return err
}
// When update memory limit, we should adapt the write sequence
// for memory and swap memory, so it won't fail because the new
// value and the old value don't fit kernel's validation.
if r.MemorySwap == -1 || curLimit < uint64(r.MemorySwap) {
if err := setSwap(path, r.MemorySwap); err != nil {
return err
}
if err := setMemory(path, r.Memory); err != nil {
return err
}
return nil
}
}
if err := setMemory(path, r.Memory); err != nil {
return err
}
if err := setSwap(path, r.MemorySwap); err != nil {
return err
}
return nil
}
func (s *MemoryGroup) Set(path string, r *configs.Resources) error {
if err := setMemoryAndSwap(path, r); err != nil {
return err
}
// ignore KernelMemory and KernelMemoryTCP
if r.MemoryReservation != 0 {
if err := cgroups.WriteFile(path, "memory.soft_limit_in_bytes", strconv.FormatInt(r.MemoryReservation, 10)); err != nil {
return err
}
}
if r.OomKillDisable {
if err := cgroups.WriteFile(path, "memory.oom_control", "1"); err != nil {
return err
}
}
if r.MemorySwappiness == nil || int64(*r.MemorySwappiness) == -1 {
return nil
} else if *r.MemorySwappiness <= 100 {
if err := cgroups.WriteFile(path, "memory.swappiness", strconv.FormatUint(*r.MemorySwappiness, 10)); err != nil {
return err
}
} else {
return fmt.Errorf("invalid memory swappiness value: %d (valid range is 0-100)", *r.MemorySwappiness)
}
return nil
}
func (s *MemoryGroup) GetStats(path string, stats *cgroups.Stats) error {
const file = "memory.stat"
statsFile, err := cgroups.OpenFile(path, file, os.O_RDONLY)
if err != nil {
if os.IsNotExist(err) {
return nil
}
return err
}
defer statsFile.Close()
sc := bufio.NewScanner(statsFile)
for sc.Scan() {
t, v, err := fscommon.ParseKeyValue(sc.Text())
if err != nil {
return &parseError{Path: path, File: file, Err: err}
}
stats.MemoryStats.Stats[t] = v
}
stats.MemoryStats.Cache = stats.MemoryStats.Stats["cache"]
memoryUsage, err := getMemoryData(path, "")
if err != nil {
return err
}
stats.MemoryStats.Usage = memoryUsage
swapUsage, err := getMemoryData(path, "memsw")
if err != nil {
return err
}
stats.MemoryStats.SwapUsage = swapUsage
kernelUsage, err := getMemoryData(path, "kmem")
if err != nil {
return err
}
stats.MemoryStats.KernelUsage = kernelUsage
kernelTCPUsage, err := getMemoryData(path, "kmem.tcp")
if err != nil {
return err
}
stats.MemoryStats.KernelTCPUsage = kernelTCPUsage
value, err := fscommon.GetCgroupParamUint(path, "memory.use_hierarchy")
if err != nil {
return err
}
if value == 1 {
stats.MemoryStats.UseHierarchy = true
}
pagesByNUMA, err := getPageUsageByNUMA(path)
if err != nil {
return err
}
stats.MemoryStats.PageUsageByNUMA = pagesByNUMA
return nil
}
func getMemoryData(path, name string) (cgroups.MemoryData, error) {
memoryData := cgroups.MemoryData{}
moduleName := "memory"
if name != "" {
moduleName = "memory." + name
}
var (
usage = moduleName + ".usage_in_bytes"
maxUsage = moduleName + ".max_usage_in_bytes"
failcnt = moduleName + ".failcnt"
limit = moduleName + ".limit_in_bytes"
)
value, err := fscommon.GetCgroupParamUint(path, usage)
if err != nil {
if name != "" && os.IsNotExist(err) {
// Ignore ENOENT as swap and kmem controllers
// are optional in the kernel.
return cgroups.MemoryData{}, nil
}
return cgroups.MemoryData{}, err
}
memoryData.Usage = value
value, err = fscommon.GetCgroupParamUint(path, maxUsage)
if err != nil {
return cgroups.MemoryData{}, err
}
memoryData.MaxUsage = value
value, err = fscommon.GetCgroupParamUint(path, failcnt)
if err != nil {
return cgroups.MemoryData{}, err
}
memoryData.Failcnt = value
value, err = fscommon.GetCgroupParamUint(path, limit)
if err != nil {
return cgroups.MemoryData{}, err
}
memoryData.Limit = value
return memoryData, nil
}
func getPageUsageByNUMA(path string) (cgroups.PageUsageByNUMA, error) {
const (
maxColumns = math.MaxUint8 + 1
file = "memory.numa_stat"
)
stats := cgroups.PageUsageByNUMA{}
fd, err := cgroups.OpenFile(path, file, os.O_RDONLY)
if os.IsNotExist(err) {
return stats, nil
} else if err != nil {
return stats, err
}
defer fd.Close()
// File format is documented in linux/Documentation/cgroup-v1/memory.txt
// and it looks like this:
//
// total=<total pages> N0=<node 0 pages> N1=<node 1 pages> ...
// file=<total file pages> N0=<node 0 pages> N1=<node 1 pages> ...
// anon=<total anon pages> N0=<node 0 pages> N1=<node 1 pages> ...
// unevictable=<total anon pages> N0=<node 0 pages> N1=<node 1 pages> ...
// hierarchical_<counter>=<counter pages> N0=<node 0 pages> N1=<node 1 pages> ...
scanner := bufio.NewScanner(fd)
for scanner.Scan() {
var field *cgroups.PageStats
line := scanner.Text()
columns := strings.SplitN(line, " ", maxColumns)
for i, column := range columns {
byNode := strings.SplitN(column, "=", 2)
// Some custom kernels have non-standard fields, like
// numa_locality 0 0 0 0 0 0 0 0 0 0
// numa_exectime 0
if len(byNode) < 2 {
if i == 0 {
// Ignore/skip those.
break
} else {
// The first column was already validated,
// so be strict to the rest.
return stats, malformedLine(path, file, line)
}
}
key, val := byNode[0], byNode[1]
if i == 0 { // First column: key is name, val is total.
field = getNUMAField(&stats, key)
if field == nil { // unknown field (new kernel?)
break
}
field.Total, err = strconv.ParseUint(val, 0, 64)
if err != nil {
return stats, &parseError{Path: path, File: file, Err: err}
}
field.Nodes = map[uint8]uint64{}
} else { // Subsequent columns: key is N<id>, val is usage.
if len(key) < 2 || key[0] != 'N' {
// This is definitely an error.
return stats, malformedLine(path, file, line)
}
n, err := strconv.ParseUint(key[1:], 10, 8)
if err != nil {
return stats, &parseError{Path: path, File: file, Err: err}
}
usage, err := strconv.ParseUint(val, 10, 64)
if err != nil {
return stats, &parseError{Path: path, File: file, Err: err}
}
field.Nodes[uint8(n)] = usage
}
}
}
if err := scanner.Err(); err != nil {
return cgroups.PageUsageByNUMA{}, &parseError{Path: path, File: file, Err: err}
}
return stats, nil
}
func getNUMAField(stats *cgroups.PageUsageByNUMA, name string) *cgroups.PageStats {
switch name {
case "total":
return &stats.Total
case "file":
return &stats.File
case "anon":
return &stats.Anon
case "unevictable":
return &stats.Unevictable
case "hierarchical_total":
return &stats.Hierarchical.Total
case "hierarchical_file":
return &stats.Hierarchical.File
case "hierarchical_anon":
return &stats.Hierarchical.Anon
case "hierarchical_unevictable":
return &stats.Hierarchical.Unevictable
}
return nil
}

View File

@ -1,31 +0,0 @@
package fs
import (
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
)
type NameGroup struct {
GroupName string
Join bool
}
func (s *NameGroup) Name() string {
return s.GroupName
}
func (s *NameGroup) Apply(path string, _ *configs.Resources, pid int) error {
if s.Join {
// Ignore errors if the named cgroup does not exist.
_ = apply(path, pid)
}
return nil
}
func (s *NameGroup) Set(_ string, _ *configs.Resources) error {
return nil
}
func (s *NameGroup) GetStats(path string, stats *cgroups.Stats) error {
return nil
}

View File

@ -1,32 +0,0 @@
package fs
import (
"strconv"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
)
type NetClsGroup struct{}
func (s *NetClsGroup) Name() string {
return "net_cls"
}
func (s *NetClsGroup) Apply(path string, _ *configs.Resources, pid int) error {
return apply(path, pid)
}
func (s *NetClsGroup) Set(path string, r *configs.Resources) error {
if r.NetClsClassid != 0 {
if err := cgroups.WriteFile(path, "net_cls.classid", strconv.FormatUint(uint64(r.NetClsClassid), 10)); err != nil {
return err
}
}
return nil
}
func (s *NetClsGroup) GetStats(path string, stats *cgroups.Stats) error {
return nil
}

View File

@ -1,30 +0,0 @@
package fs
import (
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
)
type NetPrioGroup struct{}
func (s *NetPrioGroup) Name() string {
return "net_prio"
}
func (s *NetPrioGroup) Apply(path string, _ *configs.Resources, pid int) error {
return apply(path, pid)
}
func (s *NetPrioGroup) Set(path string, r *configs.Resources) error {
for _, prioMap := range r.NetPrioIfpriomap {
if err := cgroups.WriteFile(path, "net_prio.ifpriomap", prioMap.CgroupString()); err != nil {
return err
}
}
return nil
}
func (s *NetPrioGroup) GetStats(path string, stats *cgroups.Stats) error {
return nil
}

View File

@ -1,186 +0,0 @@
package fs
import (
"errors"
"os"
"path/filepath"
"sync"
"golang.org/x/sys/unix"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/utils"
)
// The absolute path to the root of the cgroup hierarchies.
var (
cgroupRootLock sync.Mutex
cgroupRoot string
)
const defaultCgroupRoot = "/sys/fs/cgroup"
func initPaths(cg *configs.Cgroup) (map[string]string, error) {
root, err := rootPath()
if err != nil {
return nil, err
}
inner, err := innerPath(cg)
if err != nil {
return nil, err
}
paths := make(map[string]string)
for _, sys := range subsystems {
name := sys.Name()
path, err := subsysPath(root, inner, name)
if err != nil {
// The non-presence of the devices subsystem
// is considered fatal for security reasons.
if cgroups.IsNotFound(err) && (cg.SkipDevices || name != "devices") {
continue
}
return nil, err
}
paths[name] = path
}
return paths, nil
}
func tryDefaultCgroupRoot() string {
var st, pst unix.Stat_t
// (1) it should be a directory...
err := unix.Lstat(defaultCgroupRoot, &st)
if err != nil || st.Mode&unix.S_IFDIR == 0 {
return ""
}
// (2) ... and a mount point ...
err = unix.Lstat(filepath.Dir(defaultCgroupRoot), &pst)
if err != nil {
return ""
}
if st.Dev == pst.Dev {
// parent dir has the same dev -- not a mount point
return ""
}
// (3) ... of 'tmpfs' fs type.
var fst unix.Statfs_t
err = unix.Statfs(defaultCgroupRoot, &fst)
if err != nil || fst.Type != unix.TMPFS_MAGIC {
return ""
}
// (4) it should have at least 1 entry ...
dir, err := os.Open(defaultCgroupRoot)
if err != nil {
return ""
}
names, err := dir.Readdirnames(1)
if err != nil {
return ""
}
if len(names) < 1 {
return ""
}
// ... which is a cgroup mount point.
err = unix.Statfs(filepath.Join(defaultCgroupRoot, names[0]), &fst)
if err != nil || fst.Type != unix.CGROUP_SUPER_MAGIC {
return ""
}
return defaultCgroupRoot
}
// rootPath finds and returns path to the root of the cgroup hierarchies.
func rootPath() (string, error) {
cgroupRootLock.Lock()
defer cgroupRootLock.Unlock()
if cgroupRoot != "" {
return cgroupRoot, nil
}
// fast path
cgroupRoot = tryDefaultCgroupRoot()
if cgroupRoot != "" {
return cgroupRoot, nil
}
// slow path: parse mountinfo
mi, err := cgroups.GetCgroupMounts(false)
if err != nil {
return "", err
}
if len(mi) < 1 {
return "", errors.New("no cgroup mount found in mountinfo")
}
// Get the first cgroup mount (e.g. "/sys/fs/cgroup/memory"),
// use its parent directory.
root := filepath.Dir(mi[0].Mountpoint)
if _, err := os.Stat(root); err != nil {
return "", err
}
cgroupRoot = root
return cgroupRoot, nil
}
func innerPath(c *configs.Cgroup) (string, error) {
if (c.Name != "" || c.Parent != "") && c.Path != "" {
return "", errors.New("cgroup: either Path or Name and Parent should be used")
}
// XXX: Do not remove CleanPath. Path safety is important! -- cyphar
innerPath := utils.CleanPath(c.Path)
if innerPath == "" {
cgParent := utils.CleanPath(c.Parent)
cgName := utils.CleanPath(c.Name)
innerPath = filepath.Join(cgParent, cgName)
}
return innerPath, nil
}
func subsysPath(root, inner, subsystem string) (string, error) {
// If the cgroup name/path is absolute do not look relative to the cgroup of the init process.
if filepath.IsAbs(inner) {
mnt, err := cgroups.FindCgroupMountpoint(root, subsystem)
// If we didn't mount the subsystem, there is no point we make the path.
if err != nil {
return "", err
}
// Sometimes subsystems can be mounted together as 'cpu,cpuacct'.
return filepath.Join(root, filepath.Base(mnt), inner), nil
}
// Use GetOwnCgroupPath instead of GetInitCgroupPath, because the creating
// process could in container and shared pid namespace with host, and
// /proc/1/cgroup could point to whole other world of cgroups.
parentPath, err := cgroups.GetOwnCgroupPath(subsystem)
if err != nil {
return "", err
}
return filepath.Join(parentPath, inner), nil
}
func apply(path string, pid int) error {
if path == "" {
return nil
}
if err := os.MkdirAll(path, 0o755); err != nil {
return err
}
return cgroups.WriteCgroupProc(path, pid)
}

View File

@ -1,24 +0,0 @@
package fs
import (
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
)
type PerfEventGroup struct{}
func (s *PerfEventGroup) Name() string {
return "perf_event"
}
func (s *PerfEventGroup) Apply(path string, _ *configs.Resources, pid int) error {
return apply(path, pid)
}
func (s *PerfEventGroup) Set(_ string, _ *configs.Resources) error {
return nil
}
func (s *PerfEventGroup) GetStats(path string, stats *cgroups.Stats) error {
return nil
}

View File

@ -1,62 +0,0 @@
package fs
import (
"math"
"strconv"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"github.com/opencontainers/runc/libcontainer/configs"
)
type PidsGroup struct{}
func (s *PidsGroup) Name() string {
return "pids"
}
func (s *PidsGroup) Apply(path string, _ *configs.Resources, pid int) error {
return apply(path, pid)
}
func (s *PidsGroup) Set(path string, r *configs.Resources) error {
if r.PidsLimit != 0 {
// "max" is the fallback value.
limit := "max"
if r.PidsLimit > 0 {
limit = strconv.FormatInt(r.PidsLimit, 10)
}
if err := cgroups.WriteFile(path, "pids.max", limit); err != nil {
return err
}
}
return nil
}
func (s *PidsGroup) GetStats(path string, stats *cgroups.Stats) error {
if !cgroups.PathExists(path) {
return nil
}
current, err := fscommon.GetCgroupParamUint(path, "pids.current")
if err != nil {
return err
}
max, err := fscommon.GetCgroupParamUint(path, "pids.max")
if err != nil {
return err
}
// If no limit is set, read from pids.max returns "max", which is
// converted to MaxUint64 by GetCgroupParamUint. Historically, we
// represent "no limit" for pids as 0, thus this conversion.
if max == math.MaxUint64 {
max = 0
}
stats.PidsStats.Current = current
stats.PidsStats.Limit = max
return nil
}

View File

@ -1,25 +0,0 @@
package fs
import (
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"github.com/opencontainers/runc/libcontainer/configs"
)
type RdmaGroup struct{}
func (s *RdmaGroup) Name() string {
return "rdma"
}
func (s *RdmaGroup) Apply(path string, _ *configs.Resources, pid int) error {
return apply(path, pid)
}
func (s *RdmaGroup) Set(path string, r *configs.Resources) error {
return fscommon.RdmaSet(path, r)
}
func (s *RdmaGroup) GetStats(path string, stats *cgroups.Stats) error {
return fscommon.RdmaGetStats(path, stats)
}

View File

@ -1,87 +0,0 @@
package fs2
import (
"bufio"
"os"
"strconv"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"github.com/opencontainers/runc/libcontainer/configs"
)
func isCpuSet(r *configs.Resources) bool {
return r.CpuWeight != 0 || r.CpuQuota != 0 || r.CpuPeriod != 0
}
func setCpu(dirPath string, r *configs.Resources) error {
if !isCpuSet(r) {
return nil
}
// NOTE: .CpuShares is not used here. Conversion is the caller's responsibility.
if r.CpuWeight != 0 {
if err := cgroups.WriteFile(dirPath, "cpu.weight", strconv.FormatUint(r.CpuWeight, 10)); err != nil {
return err
}
}
if r.CpuQuota != 0 || r.CpuPeriod != 0 {
str := "max"
if r.CpuQuota > 0 {
str = strconv.FormatInt(r.CpuQuota, 10)
}
period := r.CpuPeriod
if period == 0 {
// This default value is documented in
// https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html
period = 100000
}
str += " " + strconv.FormatUint(period, 10)
if err := cgroups.WriteFile(dirPath, "cpu.max", str); err != nil {
return err
}
}
return nil
}
func statCpu(dirPath string, stats *cgroups.Stats) error {
const file = "cpu.stat"
f, err := cgroups.OpenFile(dirPath, file, os.O_RDONLY)
if err != nil {
return err
}
defer f.Close()
sc := bufio.NewScanner(f)
for sc.Scan() {
t, v, err := fscommon.ParseKeyValue(sc.Text())
if err != nil {
return &parseError{Path: dirPath, File: file, Err: err}
}
switch t {
case "usage_usec":
stats.CpuStats.CpuUsage.TotalUsage = v * 1000
case "user_usec":
stats.CpuStats.CpuUsage.UsageInUsermode = v * 1000
case "system_usec":
stats.CpuStats.CpuUsage.UsageInKernelmode = v * 1000
case "nr_periods":
stats.CpuStats.ThrottlingData.Periods = v
case "nr_throttled":
stats.CpuStats.ThrottlingData.ThrottledPeriods = v
case "throttled_usec":
stats.CpuStats.ThrottlingData.ThrottledTime = v * 1000
}
}
if err := sc.Err(); err != nil {
return &parseError{Path: dirPath, File: file, Err: err}
}
return nil
}

View File

@ -1,28 +0,0 @@
package fs2
import (
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
)
func isCpusetSet(r *configs.Resources) bool {
return r.CpusetCpus != "" || r.CpusetMems != ""
}
func setCpuset(dirPath string, r *configs.Resources) error {
if !isCpusetSet(r) {
return nil
}
if r.CpusetCpus != "" {
if err := cgroups.WriteFile(dirPath, "cpuset.cpus", r.CpusetCpus); err != nil {
return err
}
}
if r.CpusetMems != "" {
if err := cgroups.WriteFile(dirPath, "cpuset.mems", r.CpusetMems); err != nil {
return err
}
}
return nil
}

View File

@ -1,152 +0,0 @@
package fs2
import (
"fmt"
"os"
"path/filepath"
"strings"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
)
func supportedControllers() (string, error) {
return cgroups.ReadFile(UnifiedMountpoint, "/cgroup.controllers")
}
// needAnyControllers returns whether we enable some supported controllers or not,
// based on (1) controllers available and (2) resources that are being set.
// We don't check "pseudo" controllers such as
// "freezer" and "devices".
func needAnyControllers(r *configs.Resources) (bool, error) {
if r == nil {
return false, nil
}
// list of all available controllers
content, err := supportedControllers()
if err != nil {
return false, err
}
avail := make(map[string]struct{})
for _, ctr := range strings.Fields(content) {
avail[ctr] = struct{}{}
}
// check whether the controller if available or not
have := func(controller string) bool {
_, ok := avail[controller]
return ok
}
if isPidsSet(r) && have("pids") {
return true, nil
}
if isMemorySet(r) && have("memory") {
return true, nil
}
if isIoSet(r) && have("io") {
return true, nil
}
if isCpuSet(r) && have("cpu") {
return true, nil
}
if isCpusetSet(r) && have("cpuset") {
return true, nil
}
if isHugeTlbSet(r) && have("hugetlb") {
return true, nil
}
return false, nil
}
// containsDomainController returns whether the current config contains domain controller or not.
// Refer to: http://man7.org/linux/man-pages/man7/cgroups.7.html
// As at Linux 4.19, the following controllers are threaded: cpu, perf_event, and pids.
func containsDomainController(r *configs.Resources) bool {
return isMemorySet(r) || isIoSet(r) || isCpuSet(r) || isHugeTlbSet(r)
}
// CreateCgroupPath creates cgroupv2 path, enabling all the supported controllers.
func CreateCgroupPath(path string, c *configs.Cgroup) (Err error) {
if !strings.HasPrefix(path, UnifiedMountpoint) {
return fmt.Errorf("invalid cgroup path %s", path)
}
content, err := supportedControllers()
if err != nil {
return err
}
const (
cgTypeFile = "cgroup.type"
cgStCtlFile = "cgroup.subtree_control"
)
ctrs := strings.Fields(content)
res := "+" + strings.Join(ctrs, " +")
elements := strings.Split(path, "/")
elements = elements[3:]
current := "/sys/fs"
for i, e := range elements {
current = filepath.Join(current, e)
if i > 0 {
if err := os.Mkdir(current, 0o755); err != nil {
if !os.IsExist(err) {
return err
}
} else {
// If the directory was created, be sure it is not left around on errors.
current := current
defer func() {
if Err != nil {
os.Remove(current)
}
}()
}
cgType, _ := cgroups.ReadFile(current, cgTypeFile)
cgType = strings.TrimSpace(cgType)
switch cgType {
// If the cgroup is in an invalid mode (usually this means there's an internal
// process in the cgroup tree, because we created a cgroup under an
// already-populated-by-other-processes cgroup), then we have to error out if
// the user requested controllers which are not thread-aware. However, if all
// the controllers requested are thread-aware we can simply put the cgroup into
// threaded mode.
case "domain invalid":
if containsDomainController(c.Resources) {
return fmt.Errorf("cannot enter cgroupv2 %q with domain controllers -- it is in an invalid state", current)
} else {
// Not entirely correct (in theory we'd always want to be a domain --
// since that means we're a properly delegated cgroup subtree) but in
// this case there's not much we can do and it's better than giving an
// error.
_ = cgroups.WriteFile(current, cgTypeFile, "threaded")
}
// If the cgroup is in (threaded) or (domain threaded) mode, we can only use thread-aware controllers
// (and you cannot usually take a cgroup out of threaded mode).
case "domain threaded":
fallthrough
case "threaded":
if containsDomainController(c.Resources) {
return fmt.Errorf("cannot enter cgroupv2 %q with domain controllers -- it is in %s mode", current, cgType)
}
}
}
// enable all supported controllers
if i < len(elements)-1 {
if err := cgroups.WriteFile(current, cgStCtlFile, res); err != nil {
// try write one by one
allCtrs := strings.Split(res, " ")
for _, ctr := range allCtrs {
_ = cgroups.WriteFile(current, cgStCtlFile, ctr)
}
}
// Some controllers might not be enabled when rootless or containerized,
// but we don't catch the error here. (Caught in setXXX() functions.)
}
}
return nil
}

View File

@ -1,99 +0,0 @@
/*
Copyright The containerd Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package fs2
import (
"bufio"
"errors"
"fmt"
"io"
"os"
"path/filepath"
"strings"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/utils"
)
const UnifiedMountpoint = "/sys/fs/cgroup"
func defaultDirPath(c *configs.Cgroup) (string, error) {
if (c.Name != "" || c.Parent != "") && c.Path != "" {
return "", fmt.Errorf("cgroup: either Path or Name and Parent should be used, got %+v", c)
}
return _defaultDirPath(UnifiedMountpoint, c.Path, c.Parent, c.Name)
}
func _defaultDirPath(root, cgPath, cgParent, cgName string) (string, error) {
if (cgName != "" || cgParent != "") && cgPath != "" {
return "", errors.New("cgroup: either Path or Name and Parent should be used")
}
// XXX: Do not remove CleanPath. Path safety is important! -- cyphar
innerPath := utils.CleanPath(cgPath)
if innerPath == "" {
cgParent := utils.CleanPath(cgParent)
cgName := utils.CleanPath(cgName)
innerPath = filepath.Join(cgParent, cgName)
}
if filepath.IsAbs(innerPath) {
return filepath.Join(root, innerPath), nil
}
ownCgroup, err := parseCgroupFile("/proc/self/cgroup")
if err != nil {
return "", err
}
// The current user scope most probably has tasks in it already,
// making it impossible to enable controllers for its sub-cgroup.
// A parent cgroup (with no tasks in it) is what we need.
ownCgroup = filepath.Dir(ownCgroup)
return filepath.Join(root, ownCgroup, innerPath), nil
}
// parseCgroupFile parses /proc/PID/cgroup file and return string
func parseCgroupFile(path string) (string, error) {
f, err := os.Open(path)
if err != nil {
return "", err
}
defer f.Close()
return parseCgroupFromReader(f)
}
func parseCgroupFromReader(r io.Reader) (string, error) {
s := bufio.NewScanner(r)
for s.Scan() {
var (
text = s.Text()
parts = strings.SplitN(text, ":", 3)
)
if len(parts) < 3 {
return "", fmt.Errorf("invalid cgroup entry: %q", text)
}
// text is like "0::/user.slice/user-1001.slice/session-1.scope"
if parts[0] == "0" && parts[1] == "" {
return parts[2], nil
}
}
if err := s.Err(); err != nil {
return "", err
}
return "", errors.New("cgroup path not found")
}

View File

@ -1,75 +0,0 @@
package fs2
import (
"fmt"
"golang.org/x/sys/unix"
"github.com/opencontainers/runc/libcontainer/cgroups/ebpf"
"github.com/opencontainers/runc/libcontainer/cgroups/ebpf/devicefilter"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/devices"
"github.com/opencontainers/runc/libcontainer/userns"
)
func isRWM(perms devices.Permissions) bool {
var r, w, m bool
for _, perm := range perms {
switch perm {
case 'r':
r = true
case 'w':
w = true
case 'm':
m = true
}
}
return r && w && m
}
// This is similar to the logic applied in crun for handling errors from bpf(2)
// <https://github.com/containers/crun/blob/0.17/src/libcrun/cgroup.c#L2438-L2470>.
func canSkipEBPFError(r *configs.Resources) bool {
// If we're running in a user namespace we can ignore eBPF rules because we
// usually cannot use bpf(2), as well as rootless containers usually don't
// have the necessary privileges to mknod(2) device inodes or access
// host-level instances (though ideally we would be blocking device access
// for rootless containers anyway).
if userns.RunningInUserNS() {
return true
}
// We cannot ignore an eBPF load error if any rule if is a block rule or it
// doesn't permit all access modes.
//
// NOTE: This will sometimes trigger in cases where access modes are split
// between different rules but to handle this correctly would require
// using ".../libcontainer/cgroup/devices".Emulator.
for _, dev := range r.Devices {
if !dev.Allow || !isRWM(dev.Permissions) {
return false
}
}
return true
}
func setDevices(dirPath string, r *configs.Resources) error {
if r.SkipDevices {
return nil
}
insts, license, err := devicefilter.DeviceFilter(r.Devices)
if err != nil {
return err
}
dirFD, err := unix.Open(dirPath, unix.O_DIRECTORY|unix.O_RDONLY, 0o600)
if err != nil {
return fmt.Errorf("cannot get dir FD for %s", dirPath)
}
defer unix.Close(dirFD)
if _, err := ebpf.LoadAttachCgroupDeviceFilter(insts, license, dirFD); err != nil {
if !canSkipEBPFError(r) {
return err
}
}
return nil
}

View File

@ -1,127 +0,0 @@
package fs2
import (
"bufio"
"errors"
"fmt"
"os"
"strings"
"time"
"golang.org/x/sys/unix"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
)
func setFreezer(dirPath string, state configs.FreezerState) error {
var stateStr string
switch state {
case configs.Undefined:
return nil
case configs.Frozen:
stateStr = "1"
case configs.Thawed:
stateStr = "0"
default:
return fmt.Errorf("invalid freezer state %q requested", state)
}
fd, err := cgroups.OpenFile(dirPath, "cgroup.freeze", unix.O_RDWR)
if err != nil {
// We can ignore this request as long as the user didn't ask us to
// freeze the container (since without the freezer cgroup, that's a
// no-op).
if state != configs.Frozen {
return nil
}
return fmt.Errorf("freezer not supported: %w", err)
}
defer fd.Close()
if _, err := fd.WriteString(stateStr); err != nil {
return err
}
// Confirm that the cgroup did actually change states.
if actualState, err := readFreezer(dirPath, fd); err != nil {
return err
} else if actualState != state {
return fmt.Errorf(`expected "cgroup.freeze" to be in state %q but was in %q`, state, actualState)
}
return nil
}
func getFreezer(dirPath string) (configs.FreezerState, error) {
fd, err := cgroups.OpenFile(dirPath, "cgroup.freeze", unix.O_RDONLY)
if err != nil {
// If the kernel is too old, then we just treat the freezer as being in
// an "undefined" state.
if os.IsNotExist(err) || errors.Is(err, unix.ENODEV) {
err = nil
}
return configs.Undefined, err
}
defer fd.Close()
return readFreezer(dirPath, fd)
}
func readFreezer(dirPath string, fd *os.File) (configs.FreezerState, error) {
if _, err := fd.Seek(0, 0); err != nil {
return configs.Undefined, err
}
state := make([]byte, 2)
if _, err := fd.Read(state); err != nil {
return configs.Undefined, err
}
switch string(state) {
case "0\n":
return configs.Thawed, nil
case "1\n":
return waitFrozen(dirPath)
default:
return configs.Undefined, fmt.Errorf(`unknown "cgroup.freeze" state: %q`, state)
}
}
// waitFrozen polls cgroup.events until it sees "frozen 1" in it.
func waitFrozen(dirPath string) (configs.FreezerState, error) {
fd, err := cgroups.OpenFile(dirPath, "cgroup.events", unix.O_RDONLY)
if err != nil {
return configs.Undefined, err
}
defer fd.Close()
// XXX: Simple wait/read/retry is used here. An implementation
// based on poll(2) or inotify(7) is possible, but it makes the code
// much more complicated. Maybe address this later.
const (
// Perform maxIter with waitTime in between iterations.
waitTime = 10 * time.Millisecond
maxIter = 1000
)
scanner := bufio.NewScanner(fd)
for i := 0; scanner.Scan(); {
if i == maxIter {
return configs.Undefined, fmt.Errorf("timeout of %s reached waiting for the cgroup to freeze", waitTime*maxIter)
}
line := scanner.Text()
val := strings.TrimPrefix(line, "frozen ")
if val != line { // got prefix
if val[0] == '1' {
return configs.Frozen, nil
}
i++
// wait, then re-read
time.Sleep(waitTime)
_, err := fd.Seek(0, 0)
if err != nil {
return configs.Undefined, err
}
}
}
// Should only reach here either on read error,
// or if the file does not contain "frozen " line.
return configs.Undefined, scanner.Err()
}

View File

@ -1,259 +0,0 @@
package fs2
import (
"errors"
"fmt"
"os"
"strings"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"github.com/opencontainers/runc/libcontainer/configs"
)
type parseError = fscommon.ParseError
type manager struct {
config *configs.Cgroup
// dirPath is like "/sys/fs/cgroup/user.slice/user-1001.slice/session-1.scope"
dirPath string
// controllers is content of "cgroup.controllers" file.
// excludes pseudo-controllers ("devices" and "freezer").
controllers map[string]struct{}
}
// NewManager creates a manager for cgroup v2 unified hierarchy.
// dirPath is like "/sys/fs/cgroup/user.slice/user-1001.slice/session-1.scope".
// If dirPath is empty, it is automatically set using config.
func NewManager(config *configs.Cgroup, dirPath string) (cgroups.Manager, error) {
if dirPath == "" {
var err error
dirPath, err = defaultDirPath(config)
if err != nil {
return nil, err
}
}
m := &manager{
config: config,
dirPath: dirPath,
}
return m, nil
}
func (m *manager) getControllers() error {
if m.controllers != nil {
return nil
}
data, err := cgroups.ReadFile(m.dirPath, "cgroup.controllers")
if err != nil {
if m.config.Rootless && m.config.Path == "" {
return nil
}
return err
}
fields := strings.Fields(data)
m.controllers = make(map[string]struct{}, len(fields))
for _, c := range fields {
m.controllers[c] = struct{}{}
}
return nil
}
func (m *manager) Apply(pid int) error {
if err := CreateCgroupPath(m.dirPath, m.config); err != nil {
// Related tests:
// - "runc create (no limits + no cgrouppath + no permission) succeeds"
// - "runc create (rootless + no limits + cgrouppath + no permission) fails with permission error"
// - "runc create (rootless + limits + no cgrouppath + no permission) fails with informative error"
if m.config.Rootless {
if m.config.Path == "" {
if blNeed, nErr := needAnyControllers(m.config.Resources); nErr == nil && !blNeed {
return nil
}
return fmt.Errorf("rootless needs no limits + no cgrouppath when no permission is granted for cgroups: %w", err)
}
}
return err
}
if err := cgroups.WriteCgroupProc(m.dirPath, pid); err != nil {
return err
}
return nil
}
func (m *manager) GetPids() ([]int, error) {
return cgroups.GetPids(m.dirPath)
}
func (m *manager) GetAllPids() ([]int, error) {
return cgroups.GetAllPids(m.dirPath)
}
func (m *manager) GetStats() (*cgroups.Stats, error) {
var errs []error
st := cgroups.NewStats()
// pids (since kernel 4.5)
if err := statPids(m.dirPath, st); err != nil {
errs = append(errs, err)
}
// memory (since kernel 4.5)
if err := statMemory(m.dirPath, st); err != nil && !os.IsNotExist(err) {
errs = append(errs, err)
}
// io (since kernel 4.5)
if err := statIo(m.dirPath, st); err != nil && !os.IsNotExist(err) {
errs = append(errs, err)
}
// cpu (since kernel 4.15)
// Note cpu.stat is available even if the controller is not enabled.
if err := statCpu(m.dirPath, st); err != nil && !os.IsNotExist(err) {
errs = append(errs, err)
}
// hugetlb (since kernel 5.6)
if err := statHugeTlb(m.dirPath, st); err != nil && !os.IsNotExist(err) {
errs = append(errs, err)
}
// rdma (since kernel 4.11)
if err := fscommon.RdmaGetStats(m.dirPath, st); err != nil && !os.IsNotExist(err) {
errs = append(errs, err)
}
if len(errs) > 0 && !m.config.Rootless {
return st, fmt.Errorf("error while statting cgroup v2: %+v", errs)
}
return st, nil
}
func (m *manager) Freeze(state configs.FreezerState) error {
if m.config.Resources == nil {
return errors.New("cannot toggle freezer: cgroups not configured for container")
}
if err := setFreezer(m.dirPath, state); err != nil {
return err
}
m.config.Resources.Freezer = state
return nil
}
func (m *manager) Destroy() error {
return cgroups.RemovePath(m.dirPath)
}
func (m *manager) Path(_ string) string {
return m.dirPath
}
func (m *manager) Set(r *configs.Resources) error {
if r == nil {
return nil
}
if err := m.getControllers(); err != nil {
return err
}
// pids (since kernel 4.5)
if err := setPids(m.dirPath, r); err != nil {
return err
}
// memory (since kernel 4.5)
if err := setMemory(m.dirPath, r); err != nil {
return err
}
// io (since kernel 4.5)
if err := setIo(m.dirPath, r); err != nil {
return err
}
// cpu (since kernel 4.15)
if err := setCpu(m.dirPath, r); err != nil {
return err
}
// devices (since kernel 4.15, pseudo-controller)
//
// When rootless is true, errors from the device subsystem are ignored because it is really not expected to work.
// However, errors from other subsystems are not ignored.
// see @test "runc create (rootless + limits + no cgrouppath + no permission) fails with informative error"
if err := setDevices(m.dirPath, r); err != nil && !m.config.Rootless {
return err
}
// cpuset (since kernel 5.0)
if err := setCpuset(m.dirPath, r); err != nil {
return err
}
// hugetlb (since kernel 5.6)
if err := setHugeTlb(m.dirPath, r); err != nil {
return err
}
// rdma (since kernel 4.11)
if err := fscommon.RdmaSet(m.dirPath, r); err != nil {
return err
}
// freezer (since kernel 5.2, pseudo-controller)
if err := setFreezer(m.dirPath, r.Freezer); err != nil {
return err
}
if err := m.setUnified(r.Unified); err != nil {
return err
}
m.config.Resources = r
return nil
}
func (m *manager) setUnified(res map[string]string) error {
for k, v := range res {
if strings.Contains(k, "/") {
return fmt.Errorf("unified resource %q must be a file name (no slashes)", k)
}
if err := cgroups.WriteFile(m.dirPath, k, v); err != nil {
// Check for both EPERM and ENOENT since O_CREAT is used by WriteFile.
if errors.Is(err, os.ErrPermission) || errors.Is(err, os.ErrNotExist) {
// Check if a controller is available,
// to give more specific error if not.
sk := strings.SplitN(k, ".", 2)
if len(sk) != 2 {
return fmt.Errorf("unified resource %q must be in the form CONTROLLER.PARAMETER", k)
}
c := sk[0]
if _, ok := m.controllers[c]; !ok && c != "cgroup" {
return fmt.Errorf("unified resource %q can't be set: controller %q not available", k, c)
}
}
return fmt.Errorf("unable to set unified resource %q: %w", k, err)
}
}
return nil
}
func (m *manager) GetPaths() map[string]string {
paths := make(map[string]string, 1)
paths[""] = m.dirPath
return paths
}
func (m *manager) GetCgroups() (*configs.Cgroup, error) {
return m.config, nil
}
func (m *manager) GetFreezerState() (configs.FreezerState, error) {
return getFreezer(m.dirPath)
}
func (m *manager) Exists() bool {
return cgroups.PathExists(m.dirPath)
}
func OOMKillCount(path string) (uint64, error) {
return fscommon.GetValueByKey(path, "memory.events", "oom_kill")
}
func (m *manager) OOMKillCount() (uint64, error) {
c, err := OOMKillCount(m.dirPath)
if err != nil && m.config.Rootless && os.IsNotExist(err) {
err = nil
}
return c, err
}

View File

@ -1,48 +0,0 @@
package fs2
import (
"strconv"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"github.com/opencontainers/runc/libcontainer/configs"
)
func isHugeTlbSet(r *configs.Resources) bool {
return len(r.HugetlbLimit) > 0
}
func setHugeTlb(dirPath string, r *configs.Resources) error {
if !isHugeTlbSet(r) {
return nil
}
for _, hugetlb := range r.HugetlbLimit {
if err := cgroups.WriteFile(dirPath, "hugetlb."+hugetlb.Pagesize+".max", strconv.FormatUint(hugetlb.Limit, 10)); err != nil {
return err
}
}
return nil
}
func statHugeTlb(dirPath string, stats *cgroups.Stats) error {
hugetlbStats := cgroups.HugetlbStats{}
for _, pagesize := range cgroups.HugePageSizes() {
value, err := fscommon.GetCgroupParamUint(dirPath, "hugetlb."+pagesize+".current")
if err != nil {
return err
}
hugetlbStats.Usage = value
fileName := "hugetlb." + pagesize + ".events"
value, err = fscommon.GetValueByKey(dirPath, fileName, "max")
if err != nil {
return err
}
hugetlbStats.Failcnt = value
stats.HugetlbStats[pagesize] = hugetlbStats
}
return nil
}

View File

@ -1,193 +0,0 @@
package fs2
import (
"bufio"
"bytes"
"fmt"
"os"
"strconv"
"strings"
"github.com/sirupsen/logrus"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
)
func isIoSet(r *configs.Resources) bool {
return r.BlkioWeight != 0 ||
len(r.BlkioWeightDevice) > 0 ||
len(r.BlkioThrottleReadBpsDevice) > 0 ||
len(r.BlkioThrottleWriteBpsDevice) > 0 ||
len(r.BlkioThrottleReadIOPSDevice) > 0 ||
len(r.BlkioThrottleWriteIOPSDevice) > 0
}
// bfqDeviceWeightSupported checks for per-device BFQ weight support (added
// in kernel v5.4, commit 795fe54c2a8) by reading from "io.bfq.weight".
func bfqDeviceWeightSupported(bfq *os.File) bool {
if bfq == nil {
return false
}
_, _ = bfq.Seek(0, 0)
buf := make([]byte, 32)
_, _ = bfq.Read(buf)
// If only a single number (default weight) if read back, we have older kernel.
_, err := strconv.ParseInt(string(bytes.TrimSpace(buf)), 10, 64)
return err != nil
}
func setIo(dirPath string, r *configs.Resources) error {
if !isIoSet(r) {
return nil
}
// If BFQ IO scheduler is available, use it.
var bfq *os.File
if r.BlkioWeight != 0 || len(r.BlkioWeightDevice) > 0 {
var err error
bfq, err = cgroups.OpenFile(dirPath, "io.bfq.weight", os.O_RDWR)
if err == nil {
defer bfq.Close()
} else if !os.IsNotExist(err) {
return err
}
}
if r.BlkioWeight != 0 {
if bfq != nil { // Use BFQ.
if _, err := bfq.WriteString(strconv.FormatUint(uint64(r.BlkioWeight), 10)); err != nil {
return err
}
} else {
// Fallback to io.weight with a conversion scheme.
v := cgroups.ConvertBlkIOToIOWeightValue(r.BlkioWeight)
if err := cgroups.WriteFile(dirPath, "io.weight", strconv.FormatUint(v, 10)); err != nil {
return err
}
}
}
if bfqDeviceWeightSupported(bfq) {
for _, wd := range r.BlkioWeightDevice {
if _, err := bfq.WriteString(wd.WeightString() + "\n"); err != nil {
return fmt.Errorf("setting device weight %q: %w", wd.WeightString(), err)
}
}
}
for _, td := range r.BlkioThrottleReadBpsDevice {
if err := cgroups.WriteFile(dirPath, "io.max", td.StringName("rbps")); err != nil {
return err
}
}
for _, td := range r.BlkioThrottleWriteBpsDevice {
if err := cgroups.WriteFile(dirPath, "io.max", td.StringName("wbps")); err != nil {
return err
}
}
for _, td := range r.BlkioThrottleReadIOPSDevice {
if err := cgroups.WriteFile(dirPath, "io.max", td.StringName("riops")); err != nil {
return err
}
}
for _, td := range r.BlkioThrottleWriteIOPSDevice {
if err := cgroups.WriteFile(dirPath, "io.max", td.StringName("wiops")); err != nil {
return err
}
}
return nil
}
func readCgroup2MapFile(dirPath string, name string) (map[string][]string, error) {
ret := map[string][]string{}
f, err := cgroups.OpenFile(dirPath, name, os.O_RDONLY)
if err != nil {
return nil, err
}
defer f.Close()
scanner := bufio.NewScanner(f)
for scanner.Scan() {
line := scanner.Text()
parts := strings.Fields(line)
if len(parts) < 2 {
continue
}
ret[parts[0]] = parts[1:]
}
if err := scanner.Err(); err != nil {
return nil, &parseError{Path: dirPath, File: name, Err: err}
}
return ret, nil
}
func statIo(dirPath string, stats *cgroups.Stats) error {
const file = "io.stat"
values, err := readCgroup2MapFile(dirPath, file)
if err != nil {
return err
}
// more details on the io.stat file format: https://www.kernel.org/doc/Documentation/cgroup-v2.txt
var parsedStats cgroups.BlkioStats
for k, v := range values {
d := strings.Split(k, ":")
if len(d) != 2 {
continue
}
major, err := strconv.ParseUint(d[0], 10, 64)
if err != nil {
return &parseError{Path: dirPath, File: file, Err: err}
}
minor, err := strconv.ParseUint(d[1], 10, 64)
if err != nil {
return &parseError{Path: dirPath, File: file, Err: err}
}
for _, item := range v {
d := strings.Split(item, "=")
if len(d) != 2 {
continue
}
op := d[0]
// Map to the cgroupv1 naming and layout (in separate tables).
var targetTable *[]cgroups.BlkioStatEntry
switch op {
// Equivalent to cgroupv1's blkio.io_service_bytes.
case "rbytes":
op = "Read"
targetTable = &parsedStats.IoServiceBytesRecursive
case "wbytes":
op = "Write"
targetTable = &parsedStats.IoServiceBytesRecursive
// Equivalent to cgroupv1's blkio.io_serviced.
case "rios":
op = "Read"
targetTable = &parsedStats.IoServicedRecursive
case "wios":
op = "Write"
targetTable = &parsedStats.IoServicedRecursive
default:
// Skip over entries we cannot map to cgroupv1 stats for now.
// In the future we should expand the stats struct to include
// them.
logrus.Debugf("cgroupv2 io stats: skipping over unmappable %s entry", item)
continue
}
value, err := strconv.ParseUint(d[1], 10, 64)
if err != nil {
return &parseError{Path: dirPath, File: file, Err: err}
}
entry := cgroups.BlkioStatEntry{
Op: op,
Major: major,
Minor: minor,
Value: value,
}
*targetTable = append(*targetTable, entry)
}
}
stats.BlkioStats = parsedStats
return nil
}

View File

@ -1,220 +0,0 @@
package fs2
import (
"bufio"
"errors"
"math"
"os"
"strconv"
"strings"
"golang.org/x/sys/unix"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"github.com/opencontainers/runc/libcontainer/configs"
)
// numToStr converts an int64 value to a string for writing to a
// cgroupv2 files with .min, .max, .low, or .high suffix.
// The value of -1 is converted to "max" for cgroupv1 compatibility
// (which used to write -1 to remove the limit).
func numToStr(value int64) (ret string) {
switch {
case value == 0:
ret = ""
case value == -1:
ret = "max"
default:
ret = strconv.FormatInt(value, 10)
}
return ret
}
func isMemorySet(r *configs.Resources) bool {
return r.MemoryReservation != 0 || r.Memory != 0 || r.MemorySwap != 0
}
func setMemory(dirPath string, r *configs.Resources) error {
if !isMemorySet(r) {
return nil
}
swap, err := cgroups.ConvertMemorySwapToCgroupV2Value(r.MemorySwap, r.Memory)
if err != nil {
return err
}
swapStr := numToStr(swap)
if swapStr == "" && swap == 0 && r.MemorySwap > 0 {
// memory and memorySwap set to the same value -- disable swap
swapStr = "0"
}
// never write empty string to `memory.swap.max`, it means set to 0.
if swapStr != "" {
if err := cgroups.WriteFile(dirPath, "memory.swap.max", swapStr); err != nil {
return err
}
}
if val := numToStr(r.Memory); val != "" {
if err := cgroups.WriteFile(dirPath, "memory.max", val); err != nil {
return err
}
}
// cgroup.Resources.KernelMemory is ignored
if val := numToStr(r.MemoryReservation); val != "" {
if err := cgroups.WriteFile(dirPath, "memory.low", val); err != nil {
return err
}
}
return nil
}
func statMemory(dirPath string, stats *cgroups.Stats) error {
const file = "memory.stat"
statsFile, err := cgroups.OpenFile(dirPath, file, os.O_RDONLY)
if err != nil {
return err
}
defer statsFile.Close()
sc := bufio.NewScanner(statsFile)
for sc.Scan() {
t, v, err := fscommon.ParseKeyValue(sc.Text())
if err != nil {
return &parseError{Path: dirPath, File: file, Err: err}
}
stats.MemoryStats.Stats[t] = v
}
if err := sc.Err(); err != nil {
return &parseError{Path: dirPath, File: file, Err: err}
}
stats.MemoryStats.Cache = stats.MemoryStats.Stats["file"]
// Unlike cgroup v1 which has memory.use_hierarchy binary knob,
// cgroup v2 is always hierarchical.
stats.MemoryStats.UseHierarchy = true
memoryUsage, err := getMemoryDataV2(dirPath, "")
if err != nil {
if errors.Is(err, unix.ENOENT) && dirPath == UnifiedMountpoint {
// The root cgroup does not have memory.{current,max}
// so emulate those using data from /proc/meminfo and
// /sys/fs/cgroup/memory.stat
return rootStatsFromMeminfo(stats)
}
return err
}
stats.MemoryStats.Usage = memoryUsage
swapUsage, err := getMemoryDataV2(dirPath, "swap")
if err != nil {
return err
}
// As cgroup v1 reports SwapUsage values as mem+swap combined,
// while in cgroup v2 swap values do not include memory,
// report combined mem+swap for v1 compatibility.
swapUsage.Usage += memoryUsage.Usage
if swapUsage.Limit != math.MaxUint64 {
swapUsage.Limit += memoryUsage.Limit
}
stats.MemoryStats.SwapUsage = swapUsage
return nil
}
func getMemoryDataV2(path, name string) (cgroups.MemoryData, error) {
memoryData := cgroups.MemoryData{}
moduleName := "memory"
if name != "" {
moduleName = "memory." + name
}
usage := moduleName + ".current"
limit := moduleName + ".max"
value, err := fscommon.GetCgroupParamUint(path, usage)
if err != nil {
if name != "" && os.IsNotExist(err) {
// Ignore EEXIST as there's no swap accounting
// if kernel CONFIG_MEMCG_SWAP is not set or
// swapaccount=0 kernel boot parameter is given.
return cgroups.MemoryData{}, nil
}
return cgroups.MemoryData{}, err
}
memoryData.Usage = value
value, err = fscommon.GetCgroupParamUint(path, limit)
if err != nil {
return cgroups.MemoryData{}, err
}
memoryData.Limit = value
return memoryData, nil
}
func rootStatsFromMeminfo(stats *cgroups.Stats) error {
const file = "/proc/meminfo"
f, err := os.Open(file)
if err != nil {
return err
}
defer f.Close()
// Fields we are interested in.
var (
swap_free uint64
swap_total uint64
)
mem := map[string]*uint64{
"SwapFree": &swap_free,
"SwapTotal": &swap_total,
}
found := 0
sc := bufio.NewScanner(f)
for sc.Scan() {
parts := strings.SplitN(sc.Text(), ":", 3)
if len(parts) != 2 {
// Should not happen.
continue
}
k := parts[0]
p, ok := mem[k]
if !ok {
// Unknown field -- not interested.
continue
}
vStr := strings.TrimSpace(strings.TrimSuffix(parts[1], " kB"))
*p, err = strconv.ParseUint(vStr, 10, 64)
if err != nil {
return &parseError{File: file, Err: errors.New("bad value for " + k)}
}
found++
if found == len(mem) {
// Got everything we need -- skip the rest.
break
}
}
if err := sc.Err(); err != nil {
return &parseError{Path: "", File: file, Err: err}
}
// cgroup v1 `usage_in_bytes` reports memory usage as the sum of
// - rss (NR_ANON_MAPPED)
// - cache (NR_FILE_PAGES)
// cgroup v1 reports SwapUsage values as mem+swap combined
// cgroup v2 reports rss and cache as anon and file.
// sum `anon` + `file` to report the same value as `usage_in_bytes` in v1.
// sum swap usage as combined mem+swap usage for consistency as well.
stats.MemoryStats.Usage.Usage = stats.MemoryStats.Stats["anon"] + stats.MemoryStats.Stats["file"]
stats.MemoryStats.Usage.Limit = math.MaxUint64
stats.MemoryStats.SwapUsage.Usage = (swap_total - swap_free) * 1024
stats.MemoryStats.SwapUsage.Limit = math.MaxUint64
stats.MemoryStats.SwapUsage.Usage += stats.MemoryStats.Usage.Usage
return nil
}

View File

@ -1,72 +0,0 @@
package fs2
import (
"errors"
"math"
"os"
"strings"
"golang.org/x/sys/unix"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"github.com/opencontainers/runc/libcontainer/configs"
)
func isPidsSet(r *configs.Resources) bool {
return r.PidsLimit != 0
}
func setPids(dirPath string, r *configs.Resources) error {
if !isPidsSet(r) {
return nil
}
if val := numToStr(r.PidsLimit); val != "" {
if err := cgroups.WriteFile(dirPath, "pids.max", val); err != nil {
return err
}
}
return nil
}
func statPidsFromCgroupProcs(dirPath string, stats *cgroups.Stats) error {
// if the controller is not enabled, let's read PIDS from cgroups.procs
// (or threads if cgroup.threads is enabled)
contents, err := cgroups.ReadFile(dirPath, "cgroup.procs")
if errors.Is(err, unix.ENOTSUP) {
contents, err = cgroups.ReadFile(dirPath, "cgroup.threads")
}
if err != nil {
return err
}
pids := strings.Count(contents, "\n")
stats.PidsStats.Current = uint64(pids)
stats.PidsStats.Limit = 0
return nil
}
func statPids(dirPath string, stats *cgroups.Stats) error {
current, err := fscommon.GetCgroupParamUint(dirPath, "pids.current")
if err != nil {
if os.IsNotExist(err) {
return statPidsFromCgroupProcs(dirPath, stats)
}
return err
}
max, err := fscommon.GetCgroupParamUint(dirPath, "pids.max")
if err != nil {
return err
}
// If no limit is set, read from pids.max returns "max", which is
// converted to MaxUint64 by GetCgroupParamUint. Historically, we
// represent "no limit" for pids as 0, thus this conversion.
if max == math.MaxUint64 {
max = 0
}
stats.PidsStats.Current = current
stats.PidsStats.Limit = max
return nil
}

View File

@ -1,121 +0,0 @@
package fscommon
import (
"bufio"
"errors"
"math"
"os"
"strconv"
"strings"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
"golang.org/x/sys/unix"
)
// parseRdmaKV parses raw string to RdmaEntry.
func parseRdmaKV(raw string, entry *cgroups.RdmaEntry) error {
var value uint32
parts := strings.SplitN(raw, "=", 3)
if len(parts) != 2 {
return errors.New("Unable to parse RDMA entry")
}
k, v := parts[0], parts[1]
if v == "max" {
value = math.MaxUint32
} else {
val64, err := strconv.ParseUint(v, 10, 32)
if err != nil {
return err
}
value = uint32(val64)
}
if k == "hca_handle" {
entry.HcaHandles = value
} else if k == "hca_object" {
entry.HcaObjects = value
}
return nil
}
// readRdmaEntries reads and converts array of rawstrings to RdmaEntries from file.
// example entry: mlx4_0 hca_handle=2 hca_object=2000
func readRdmaEntries(dir, file string) ([]cgroups.RdmaEntry, error) {
rdmaEntries := make([]cgroups.RdmaEntry, 0)
fd, err := cgroups.OpenFile(dir, file, unix.O_RDONLY)
if err != nil {
return nil, err
}
defer fd.Close() //nolint:errorlint
scanner := bufio.NewScanner(fd)
for scanner.Scan() {
parts := strings.SplitN(scanner.Text(), " ", 4)
if len(parts) == 3 {
entry := new(cgroups.RdmaEntry)
entry.Device = parts[0]
err = parseRdmaKV(parts[1], entry)
if err != nil {
continue
}
err = parseRdmaKV(parts[2], entry)
if err != nil {
continue
}
rdmaEntries = append(rdmaEntries, *entry)
}
}
return rdmaEntries, scanner.Err()
}
// RdmaGetStats returns rdma stats such as totalLimit and current entries.
func RdmaGetStats(path string, stats *cgroups.Stats) error {
currentEntries, err := readRdmaEntries(path, "rdma.current")
if err != nil {
if errors.Is(err, os.ErrNotExist) {
err = nil
}
return err
}
maxEntries, err := readRdmaEntries(path, "rdma.max")
if err != nil {
return err
}
// If device got removed between reading two files, ignore returning stats.
if len(currentEntries) != len(maxEntries) {
return nil
}
stats.RdmaStats = cgroups.RdmaStats{
RdmaLimit: maxEntries,
RdmaCurrent: currentEntries,
}
return nil
}
func createCmdString(device string, limits configs.LinuxRdma) string {
cmdString := device
if limits.HcaHandles != nil {
cmdString += " hca_handle=" + strconv.FormatUint(uint64(*limits.HcaHandles), 10)
}
if limits.HcaObjects != nil {
cmdString += " hca_object=" + strconv.FormatUint(uint64(*limits.HcaObjects), 10)
}
return cmdString
}
// RdmaSet sets RDMA resources.
func RdmaSet(path string, r *configs.Resources) error {
for device, limits := range r.Rdma {
if err := cgroups.WriteFile(path, "rdma.max", createCmdString(device, limits)); err != nil {
return err
}
}
return nil
}

View File

@ -1,145 +0,0 @@
package fscommon
import (
"errors"
"fmt"
"math"
"path"
"strconv"
"strings"
"github.com/opencontainers/runc/libcontainer/cgroups"
)
var (
// Deprecated: use cgroups.OpenFile instead.
OpenFile = cgroups.OpenFile
// Deprecated: use cgroups.ReadFile instead.
ReadFile = cgroups.ReadFile
// Deprecated: use cgroups.WriteFile instead.
WriteFile = cgroups.WriteFile
)
// ParseError records a parse error details, including the file path.
type ParseError struct {
Path string
File string
Err error
}
func (e *ParseError) Error() string {
return "unable to parse " + path.Join(e.Path, e.File) + ": " + e.Err.Error()
}
func (e *ParseError) Unwrap() error { return e.Err }
// ParseUint converts a string to an uint64 integer.
// Negative values are returned at zero as, due to kernel bugs,
// some of the memory cgroup stats can be negative.
func ParseUint(s string, base, bitSize int) (uint64, error) {
value, err := strconv.ParseUint(s, base, bitSize)
if err != nil {
intValue, intErr := strconv.ParseInt(s, base, bitSize)
// 1. Handle negative values greater than MinInt64 (and)
// 2. Handle negative values lesser than MinInt64
if intErr == nil && intValue < 0 {
return 0, nil
} else if errors.Is(intErr, strconv.ErrRange) && intValue < 0 {
return 0, nil
}
return value, err
}
return value, nil
}
// ParseKeyValue parses a space-separated "name value" kind of cgroup
// parameter and returns its key as a string, and its value as uint64
// (ParseUint is used to convert the value). For example,
// "io_service_bytes 1234" will be returned as "io_service_bytes", 1234.
func ParseKeyValue(t string) (string, uint64, error) {
parts := strings.SplitN(t, " ", 3)
if len(parts) != 2 {
return "", 0, fmt.Errorf("line %q is not in key value format", t)
}
value, err := ParseUint(parts[1], 10, 64)
if err != nil {
return "", 0, err
}
return parts[0], value, nil
}
// GetValueByKey reads a key-value pairs from the specified cgroup file,
// and returns a value of the specified key. ParseUint is used for value
// conversion.
func GetValueByKey(path, file, key string) (uint64, error) {
content, err := cgroups.ReadFile(path, file)
if err != nil {
return 0, err
}
lines := strings.Split(content, "\n")
for _, line := range lines {
arr := strings.Split(line, " ")
if len(arr) == 2 && arr[0] == key {
val, err := ParseUint(arr[1], 10, 64)
if err != nil {
err = &ParseError{Path: path, File: file, Err: err}
}
return val, err
}
}
return 0, nil
}
// GetCgroupParamUint reads a single uint64 value from the specified cgroup file.
// If the value read is "max", the math.MaxUint64 is returned.
func GetCgroupParamUint(path, file string) (uint64, error) {
contents, err := GetCgroupParamString(path, file)
if err != nil {
return 0, err
}
contents = strings.TrimSpace(contents)
if contents == "max" {
return math.MaxUint64, nil
}
res, err := ParseUint(contents, 10, 64)
if err != nil {
return res, &ParseError{Path: path, File: file, Err: err}
}
return res, nil
}
// GetCgroupParamInt reads a single int64 value from specified cgroup file.
// If the value read is "max", the math.MaxInt64 is returned.
func GetCgroupParamInt(path, file string) (int64, error) {
contents, err := cgroups.ReadFile(path, file)
if err != nil {
return 0, err
}
contents = strings.TrimSpace(contents)
if contents == "max" {
return math.MaxInt64, nil
}
res, err := strconv.ParseInt(contents, 10, 64)
if err != nil {
return res, &ParseError{Path: path, File: file, Err: err}
}
return res, nil
}
// GetCgroupParamString reads a string from the specified cgroup file.
func GetCgroupParamString(path, file string) (string, error) {
contents, err := cgroups.ReadFile(path, file)
if err != nil {
return "", err
}
return strings.TrimSpace(contents), nil
}

View File

@ -1,27 +0,0 @@
package cgroups
import (
"io/fs"
"path/filepath"
)
// GetAllPids returns all pids from the cgroup identified by path, and all its
// sub-cgroups.
func GetAllPids(path string) ([]int, error) {
var pids []int
err := filepath.WalkDir(path, func(p string, d fs.DirEntry, iErr error) error {
if iErr != nil {
return iErr
}
if !d.IsDir() {
return nil
}
cPids, err := readProcsFile(p)
if err != nil {
return err
}
pids = append(pids, cPids...)
return nil
})
return pids, err
}

View File

@ -1,173 +0,0 @@
package cgroups
type ThrottlingData struct {
// Number of periods with throttling active
Periods uint64 `json:"periods,omitempty"`
// Number of periods when the container hit its throttling limit.
ThrottledPeriods uint64 `json:"throttled_periods,omitempty"`
// Aggregate time the container was throttled for in nanoseconds.
ThrottledTime uint64 `json:"throttled_time,omitempty"`
}
// CpuUsage denotes the usage of a CPU.
// All CPU stats are aggregate since container inception.
type CpuUsage struct {
// Total CPU time consumed.
// Units: nanoseconds.
TotalUsage uint64 `json:"total_usage,omitempty"`
// Total CPU time consumed per core.
// Units: nanoseconds.
PercpuUsage []uint64 `json:"percpu_usage,omitempty"`
// CPU time consumed per core in kernel mode
// Units: nanoseconds.
PercpuUsageInKernelmode []uint64 `json:"percpu_usage_in_kernelmode"`
// CPU time consumed per core in user mode
// Units: nanoseconds.
PercpuUsageInUsermode []uint64 `json:"percpu_usage_in_usermode"`
// Time spent by tasks of the cgroup in kernel mode.
// Units: nanoseconds.
UsageInKernelmode uint64 `json:"usage_in_kernelmode"`
// Time spent by tasks of the cgroup in user mode.
// Units: nanoseconds.
UsageInUsermode uint64 `json:"usage_in_usermode"`
}
type CpuStats struct {
CpuUsage CpuUsage `json:"cpu_usage,omitempty"`
ThrottlingData ThrottlingData `json:"throttling_data,omitempty"`
}
type CPUSetStats struct {
// List of the physical numbers of the CPUs on which processes
// in that cpuset are allowed to execute
CPUs []uint16 `json:"cpus,omitempty"`
// cpu_exclusive flag
CPUExclusive uint64 `json:"cpu_exclusive"`
// List of memory nodes on which processes in that cpuset
// are allowed to allocate memory
Mems []uint16 `json:"mems,omitempty"`
// mem_hardwall flag
MemHardwall uint64 `json:"mem_hardwall"`
// mem_exclusive flag
MemExclusive uint64 `json:"mem_exclusive"`
// memory_migrate flag
MemoryMigrate uint64 `json:"memory_migrate"`
// memory_spread page flag
MemorySpreadPage uint64 `json:"memory_spread_page"`
// memory_spread slab flag
MemorySpreadSlab uint64 `json:"memory_spread_slab"`
// memory_pressure
MemoryPressure uint64 `json:"memory_pressure"`
// sched_load balance flag
SchedLoadBalance uint64 `json:"sched_load_balance"`
// sched_relax_domain_level
SchedRelaxDomainLevel int64 `json:"sched_relax_domain_level"`
}
type MemoryData struct {
Usage uint64 `json:"usage,omitempty"`
MaxUsage uint64 `json:"max_usage,omitempty"`
Failcnt uint64 `json:"failcnt"`
Limit uint64 `json:"limit"`
}
type MemoryStats struct {
// memory used for cache
Cache uint64 `json:"cache,omitempty"`
// usage of memory
Usage MemoryData `json:"usage,omitempty"`
// usage of memory + swap
SwapUsage MemoryData `json:"swap_usage,omitempty"`
// usage of kernel memory
KernelUsage MemoryData `json:"kernel_usage,omitempty"`
// usage of kernel TCP memory
KernelTCPUsage MemoryData `json:"kernel_tcp_usage,omitempty"`
// usage of memory pages by NUMA node
// see chapter 5.6 of memory controller documentation
PageUsageByNUMA PageUsageByNUMA `json:"page_usage_by_numa,omitempty"`
// if true, memory usage is accounted for throughout a hierarchy of cgroups.
UseHierarchy bool `json:"use_hierarchy"`
Stats map[string]uint64 `json:"stats,omitempty"`
}
type PageUsageByNUMA struct {
// Embedding is used as types can't be recursive.
PageUsageByNUMAInner
Hierarchical PageUsageByNUMAInner `json:"hierarchical,omitempty"`
}
type PageUsageByNUMAInner struct {
Total PageStats `json:"total,omitempty"`
File PageStats `json:"file,omitempty"`
Anon PageStats `json:"anon,omitempty"`
Unevictable PageStats `json:"unevictable,omitempty"`
}
type PageStats struct {
Total uint64 `json:"total,omitempty"`
Nodes map[uint8]uint64 `json:"nodes,omitempty"`
}
type PidsStats struct {
// number of pids in the cgroup
Current uint64 `json:"current,omitempty"`
// active pids hard limit
Limit uint64 `json:"limit,omitempty"`
}
type BlkioStatEntry struct {
Major uint64 `json:"major,omitempty"`
Minor uint64 `json:"minor,omitempty"`
Op string `json:"op,omitempty"`
Value uint64 `json:"value,omitempty"`
}
type BlkioStats struct {
// number of bytes transferred to and from the block device
IoServiceBytesRecursive []BlkioStatEntry `json:"io_service_bytes_recursive,omitempty"`
IoServicedRecursive []BlkioStatEntry `json:"io_serviced_recursive,omitempty"`
IoQueuedRecursive []BlkioStatEntry `json:"io_queue_recursive,omitempty"`
IoServiceTimeRecursive []BlkioStatEntry `json:"io_service_time_recursive,omitempty"`
IoWaitTimeRecursive []BlkioStatEntry `json:"io_wait_time_recursive,omitempty"`
IoMergedRecursive []BlkioStatEntry `json:"io_merged_recursive,omitempty"`
IoTimeRecursive []BlkioStatEntry `json:"io_time_recursive,omitempty"`
SectorsRecursive []BlkioStatEntry `json:"sectors_recursive,omitempty"`
}
type HugetlbStats struct {
// current res_counter usage for hugetlb
Usage uint64 `json:"usage,omitempty"`
// maximum usage ever recorded.
MaxUsage uint64 `json:"max_usage,omitempty"`
// number of times hugetlb usage allocation failure.
Failcnt uint64 `json:"failcnt"`
}
type RdmaEntry struct {
Device string `json:"device,omitempty"`
HcaHandles uint32 `json:"hca_handles,omitempty"`
HcaObjects uint32 `json:"hca_objects,omitempty"`
}
type RdmaStats struct {
RdmaLimit []RdmaEntry `json:"rdma_limit,omitempty"`
RdmaCurrent []RdmaEntry `json:"rdma_current,omitempty"`
}
type Stats struct {
CpuStats CpuStats `json:"cpu_stats,omitempty"`
CPUSetStats CPUSetStats `json:"cpuset_stats,omitempty"`
MemoryStats MemoryStats `json:"memory_stats,omitempty"`
PidsStats PidsStats `json:"pids_stats,omitempty"`
BlkioStats BlkioStats `json:"blkio_stats,omitempty"`
// the map is in the format "size of hugepage: stats of the hugepage"
HugetlbStats map[string]HugetlbStats `json:"hugetlb_stats,omitempty"`
RdmaStats RdmaStats `json:"rdma_stats,omitempty"`
}
func NewStats() *Stats {
memoryStats := MemoryStats{Stats: make(map[string]uint64)}
hugetlbStats := make(map[string]HugetlbStats)
return &Stats{MemoryStats: memoryStats, HugetlbStats: hugetlbStats}
}

View File

@ -1,564 +0,0 @@
package systemd
import (
"bufio"
"context"
"errors"
"fmt"
"math"
"os"
"regexp"
"strconv"
"strings"
"sync"
"time"
systemdDbus "github.com/coreos/go-systemd/v22/dbus"
dbus "github.com/godbus/dbus/v5"
"github.com/sirupsen/logrus"
cgroupdevices "github.com/opencontainers/runc/libcontainer/cgroups/devices"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/devices"
)
const (
// Default kernel value for cpu quota period is 100000 us (100 ms), same for v1 and v2.
// v1: https://www.kernel.org/doc/html/latest/scheduler/sched-bwc.html and
// v2: https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html
defCPUQuotaPeriod = uint64(100000)
)
var (
versionOnce sync.Once
version int
isRunningSystemdOnce sync.Once
isRunningSystemd bool
)
// NOTE: This function comes from package github.com/coreos/go-systemd/util
// It was borrowed here to avoid a dependency on cgo.
//
// IsRunningSystemd checks whether the host was booted with systemd as its init
// system. This functions similarly to systemd's `sd_booted(3)`: internally, it
// checks whether /run/systemd/system/ exists and is a directory.
// http://www.freedesktop.org/software/systemd/man/sd_booted.html
func IsRunningSystemd() bool {
isRunningSystemdOnce.Do(func() {
fi, err := os.Lstat("/run/systemd/system")
isRunningSystemd = err == nil && fi.IsDir()
})
return isRunningSystemd
}
// systemd represents slice hierarchy using `-`, so we need to follow suit when
// generating the path of slice. Essentially, test-a-b.slice becomes
// /test.slice/test-a.slice/test-a-b.slice.
func ExpandSlice(slice string) (string, error) {
suffix := ".slice"
// Name has to end with ".slice", but can't be just ".slice".
if len(slice) < len(suffix) || !strings.HasSuffix(slice, suffix) {
return "", fmt.Errorf("invalid slice name: %s", slice)
}
// Path-separators are not allowed.
if strings.Contains(slice, "/") {
return "", fmt.Errorf("invalid slice name: %s", slice)
}
var path, prefix string
sliceName := strings.TrimSuffix(slice, suffix)
// if input was -.slice, we should just return root now
if sliceName == "-" {
return "/", nil
}
for _, component := range strings.Split(sliceName, "-") {
// test--a.slice isn't permitted, nor is -test.slice.
if component == "" {
return "", fmt.Errorf("invalid slice name: %s", slice)
}
// Append the component to the path and to the prefix.
path += "/" + prefix + component + suffix
prefix += component + "-"
}
return path, nil
}
func groupPrefix(ruleType devices.Type) (string, error) {
switch ruleType {
case devices.BlockDevice:
return "block-", nil
case devices.CharDevice:
return "char-", nil
default:
return "", fmt.Errorf("device type %v has no group prefix", ruleType)
}
}
// findDeviceGroup tries to find the device group name (as listed in
// /proc/devices) with the type prefixed as required for DeviceAllow, for a
// given (type, major) combination. If more than one device group exists, an
// arbitrary one is chosen.
func findDeviceGroup(ruleType devices.Type, ruleMajor int64) (string, error) {
fh, err := os.Open("/proc/devices")
if err != nil {
return "", err
}
defer fh.Close()
prefix, err := groupPrefix(ruleType)
if err != nil {
return "", err
}
scanner := bufio.NewScanner(fh)
var currentType devices.Type
for scanner.Scan() {
// We need to strip spaces because the first number is column-aligned.
line := strings.TrimSpace(scanner.Text())
// Handle the "header" lines.
switch line {
case "Block devices:":
currentType = devices.BlockDevice
continue
case "Character devices:":
currentType = devices.CharDevice
continue
case "":
continue
}
// Skip lines unrelated to our type.
if currentType != ruleType {
continue
}
// Parse out the (major, name).
var (
currMajor int64
currName string
)
if n, err := fmt.Sscanf(line, "%d %s", &currMajor, &currName); err != nil || n != 2 {
if err == nil {
err = errors.New("wrong number of fields")
}
return "", fmt.Errorf("scan /proc/devices line %q: %w", line, err)
}
if currMajor == ruleMajor {
return prefix + currName, nil
}
}
if err := scanner.Err(); err != nil {
return "", fmt.Errorf("reading /proc/devices: %w", err)
}
// Couldn't find the device group.
return "", nil
}
// DeviceAllow is the dbus type "a(ss)" which means we need a struct
// to represent it in Go.
type deviceAllowEntry struct {
Path string
Perms string
}
func allowAllDevices() []systemdDbus.Property {
// Setting mode to auto and removing all DeviceAllow rules
// results in allowing access to all devices.
return []systemdDbus.Property{
newProp("DevicePolicy", "auto"),
newProp("DeviceAllow", []deviceAllowEntry{}),
}
}
// generateDeviceProperties takes the configured device rules and generates a
// corresponding set of systemd properties to configure the devices correctly.
func generateDeviceProperties(r *configs.Resources, sdVer int) ([]systemdDbus.Property, error) {
if r.SkipDevices {
return nil, nil
}
properties := []systemdDbus.Property{
// Always run in the strictest white-list mode.
newProp("DevicePolicy", "strict"),
// Empty the DeviceAllow array before filling it.
newProp("DeviceAllow", []deviceAllowEntry{}),
}
// Figure out the set of rules.
configEmu := &cgroupdevices.Emulator{}
for _, rule := range r.Devices {
if err := configEmu.Apply(*rule); err != nil {
return nil, fmt.Errorf("unable to apply rule for systemd: %w", err)
}
}
// systemd doesn't support blacklists. So we log a warning, and tell
// systemd to act as a deny-all whitelist. This ruleset will be replaced
// with our normal fallback code. This may result in spurious errors, but
// the only other option is to error out here.
if configEmu.IsBlacklist() {
// However, if we're dealing with an allow-all rule then we can do it.
if configEmu.IsAllowAll() {
return allowAllDevices(), nil
}
logrus.Warn("systemd doesn't support blacklist device rules -- applying temporary deny-all rule")
return properties, nil
}
// Now generate the set of rules we actually need to apply. Unlike the
// normal devices cgroup, in "strict" mode systemd defaults to a deny-all
// whitelist which is the default for devices.Emulator.
finalRules, err := configEmu.Rules()
if err != nil {
return nil, fmt.Errorf("unable to get simplified rules for systemd: %w", err)
}
var deviceAllowList []deviceAllowEntry
for _, rule := range finalRules {
if !rule.Allow {
// Should never happen.
return nil, fmt.Errorf("[internal error] cannot add deny rule to systemd DeviceAllow list: %v", *rule)
}
switch rule.Type {
case devices.BlockDevice, devices.CharDevice:
default:
// Should never happen.
return nil, fmt.Errorf("invalid device type for DeviceAllow: %v", rule.Type)
}
entry := deviceAllowEntry{
Perms: string(rule.Permissions),
}
// systemd has a fairly odd (though understandable) syntax here, and
// because of the OCI configuration format we have to do quite a bit of
// trickery to convert things:
//
// * Concrete rules with non-wildcard major/minor numbers have to use
// /dev/{block,char}/MAJOR:minor paths. Before v240, systemd uses
// stat(2) on such paths to look up device properties, meaning we
// cannot add whitelist rules for devices that don't exist. Since v240,
// device properties are parsed from the path string.
//
// However, path globbing is not support for path-based rules so we
// need to handle wildcards in some other manner.
//
// * Wildcard-minor rules have to specify a "device group name" (the
// second column in /proc/devices).
//
// * Wildcard (major and minor) rules can just specify a glob with the
// type ("char-*" or "block-*").
//
// The only type of rule we can't handle is wildcard-major rules, and
// so we'll give a warning in that case (note that the fallback code
// will insert any rules systemd couldn't handle). What amazing fun.
if rule.Major == devices.Wildcard {
// "_ *:n _" rules aren't supported by systemd.
if rule.Minor != devices.Wildcard {
logrus.Warnf("systemd doesn't support '*:n' device rules -- temporarily ignoring rule: %v", *rule)
continue
}
// "_ *:* _" rules just wildcard everything.
prefix, err := groupPrefix(rule.Type)
if err != nil {
return nil, err
}
entry.Path = prefix + "*"
} else if rule.Minor == devices.Wildcard {
// "_ n:* _" rules require a device group from /proc/devices.
group, err := findDeviceGroup(rule.Type, rule.Major)
if err != nil {
return nil, fmt.Errorf("unable to find device '%v/%d': %w", rule.Type, rule.Major, err)
}
if group == "" {
// Couldn't find a group.
logrus.Warnf("could not find device group for '%v/%d' in /proc/devices -- temporarily ignoring rule: %v", rule.Type, rule.Major, *rule)
continue
}
entry.Path = group
} else {
// "_ n:m _" rules are just a path in /dev/{block,char}/.
switch rule.Type {
case devices.BlockDevice:
entry.Path = fmt.Sprintf("/dev/block/%d:%d", rule.Major, rule.Minor)
case devices.CharDevice:
entry.Path = fmt.Sprintf("/dev/char/%d:%d", rule.Major, rule.Minor)
}
if sdVer < 240 {
// Old systemd versions use stat(2) on path to find out device major:minor
// numbers and type. If the path doesn't exist, it will not add the rule,
// emitting a warning instead.
// Since all of this logic is best-effort anyway (we manually set these
// rules separately to systemd) we can safely skip entries that don't
// have a corresponding path.
if _, err := os.Stat(entry.Path); err != nil {
continue
}
}
}
deviceAllowList = append(deviceAllowList, entry)
}
properties = append(properties, newProp("DeviceAllow", deviceAllowList))
return properties, nil
}
func newProp(name string, units interface{}) systemdDbus.Property {
return systemdDbus.Property{
Name: name,
Value: dbus.MakeVariant(units),
}
}
func getUnitName(c *configs.Cgroup) string {
// by default, we create a scope unless the user explicitly asks for a slice.
if !strings.HasSuffix(c.Name, ".slice") {
return c.ScopePrefix + "-" + c.Name + ".scope"
}
return c.Name
}
// This code should be in sync with getUnitName.
func getUnitType(unitName string) string {
if strings.HasSuffix(unitName, ".slice") {
return "Slice"
}
return "Scope"
}
// isDbusError returns true if the error is a specific dbus error.
func isDbusError(err error, name string) bool {
if err != nil {
var derr dbus.Error
if errors.As(err, &derr) {
return strings.Contains(derr.Name, name)
}
}
return false
}
// isUnitExists returns true if the error is that a systemd unit already exists.
func isUnitExists(err error) bool {
return isDbusError(err, "org.freedesktop.systemd1.UnitExists")
}
func startUnit(cm *dbusConnManager, unitName string, properties []systemdDbus.Property, ignoreExist bool) error {
statusChan := make(chan string, 1)
retry := true
retry:
err := cm.retryOnDisconnect(func(c *systemdDbus.Conn) error {
_, err := c.StartTransientUnitContext(context.TODO(), unitName, "replace", properties, statusChan)
return err
})
if err != nil {
if !isUnitExists(err) {
return err
}
if ignoreExist {
// TODO: remove this hack.
// This is kubelet making sure a slice exists (see
// https://github.com/opencontainers/runc/pull/1124).
return nil
}
if retry {
// In case a unit with the same name exists, this may
// be a leftover failed unit. Reset it, so systemd can
// remove it, and retry once.
err = resetFailedUnit(cm, unitName)
if err != nil {
logrus.Warnf("unable to reset failed unit: %v", err)
}
retry = false
goto retry
}
return err
}
timeout := time.NewTimer(30 * time.Second)
defer timeout.Stop()
select {
case s := <-statusChan:
close(statusChan)
// Please refer to https://pkg.go.dev/github.com/coreos/go-systemd/v22/dbus#Conn.StartUnit
if s != "done" {
_ = resetFailedUnit(cm, unitName)
return fmt.Errorf("error creating systemd unit `%s`: got `%s`", unitName, s)
}
case <-timeout.C:
_ = resetFailedUnit(cm, unitName)
return errors.New("Timeout waiting for systemd to create " + unitName)
}
return nil
}
func stopUnit(cm *dbusConnManager, unitName string) error {
statusChan := make(chan string, 1)
err := cm.retryOnDisconnect(func(c *systemdDbus.Conn) error {
_, err := c.StopUnitContext(context.TODO(), unitName, "replace", statusChan)
return err
})
if err == nil {
timeout := time.NewTimer(30 * time.Second)
defer timeout.Stop()
select {
case s := <-statusChan:
close(statusChan)
// Please refer to https://godoc.org/github.com/coreos/go-systemd/v22/dbus#Conn.StartUnit
if s != "done" {
logrus.Warnf("error removing unit `%s`: got `%s`. Continuing...", unitName, s)
}
case <-timeout.C:
return errors.New("Timed out while waiting for systemd to remove " + unitName)
}
}
// In case of a failed unit, let systemd remove it.
_ = resetFailedUnit(cm, unitName)
return nil
}
func resetFailedUnit(cm *dbusConnManager, name string) error {
return cm.retryOnDisconnect(func(c *systemdDbus.Conn) error {
return c.ResetFailedUnitContext(context.TODO(), name)
})
}
func getUnitTypeProperty(cm *dbusConnManager, unitName string, unitType string, propertyName string) (*systemdDbus.Property, error) {
var prop *systemdDbus.Property
err := cm.retryOnDisconnect(func(c *systemdDbus.Conn) (Err error) {
prop, Err = c.GetUnitTypePropertyContext(context.TODO(), unitName, unitType, propertyName)
return Err
})
return prop, err
}
func setUnitProperties(cm *dbusConnManager, name string, properties ...systemdDbus.Property) error {
return cm.retryOnDisconnect(func(c *systemdDbus.Conn) error {
return c.SetUnitPropertiesContext(context.TODO(), name, true, properties...)
})
}
func getManagerProperty(cm *dbusConnManager, name string) (string, error) {
str := ""
err := cm.retryOnDisconnect(func(c *systemdDbus.Conn) error {
var err error
str, err = c.GetManagerProperty(name)
return err
})
if err != nil {
return "", err
}
return strconv.Unquote(str)
}
func systemdVersion(cm *dbusConnManager) int {
versionOnce.Do(func() {
version = -1
verStr, err := getManagerProperty(cm, "Version")
if err == nil {
version, err = systemdVersionAtoi(verStr)
}
if err != nil {
logrus.WithError(err).Error("unable to get systemd version")
}
})
return version
}
func systemdVersionAtoi(verStr string) (int, error) {
// verStr should be of the form:
// "v245.4-1.fc32", "245", "v245-1.fc32", "245-1.fc32" (without quotes).
// The result for all of the above should be 245.
// Thus, we unconditionally remove the "v" prefix
// and then match on the first integer we can grab.
re := regexp.MustCompile(`v?([0-9]+)`)
matches := re.FindStringSubmatch(verStr)
if len(matches) < 2 {
return 0, fmt.Errorf("can't parse version %s: incorrect number of matches %v", verStr, matches)
}
ver, err := strconv.Atoi(matches[1])
if err != nil {
return -1, fmt.Errorf("can't parse version: %w", err)
}
return ver, nil
}
func addCpuQuota(cm *dbusConnManager, properties *[]systemdDbus.Property, quota int64, period uint64) {
if period != 0 {
// systemd only supports CPUQuotaPeriodUSec since v242
sdVer := systemdVersion(cm)
if sdVer >= 242 {
*properties = append(*properties,
newProp("CPUQuotaPeriodUSec", period))
} else {
logrus.Debugf("systemd v%d is too old to support CPUQuotaPeriodSec "+
" (setting will still be applied to cgroupfs)", sdVer)
}
}
if quota != 0 || period != 0 {
// corresponds to USEC_INFINITY in systemd
cpuQuotaPerSecUSec := uint64(math.MaxUint64)
if quota > 0 {
if period == 0 {
// assume the default
period = defCPUQuotaPeriod
}
// systemd converts CPUQuotaPerSecUSec (microseconds per CPU second) to CPUQuota
// (integer percentage of CPU) internally. This means that if a fractional percent of
// CPU is indicated by Resources.CpuQuota, we need to round up to the nearest
// 10ms (1% of a second) such that child cgroups can set the cpu.cfs_quota_us they expect.
cpuQuotaPerSecUSec = uint64(quota*1000000) / period
if cpuQuotaPerSecUSec%10000 != 0 {
cpuQuotaPerSecUSec = ((cpuQuotaPerSecUSec / 10000) + 1) * 10000
}
}
*properties = append(*properties,
newProp("CPUQuotaPerSecUSec", cpuQuotaPerSecUSec))
}
}
func addCpuset(cm *dbusConnManager, props *[]systemdDbus.Property, cpus, mems string) error {
if cpus == "" && mems == "" {
return nil
}
// systemd only supports AllowedCPUs/AllowedMemoryNodes since v244
sdVer := systemdVersion(cm)
if sdVer < 244 {
logrus.Debugf("systemd v%d is too old to support AllowedCPUs/AllowedMemoryNodes"+
" (settings will still be applied to cgroupfs)", sdVer)
return nil
}
if cpus != "" {
bits, err := RangeToBits(cpus)
if err != nil {
return fmt.Errorf("resources.CPU.Cpus=%q conversion error: %w",
cpus, err)
}
*props = append(*props,
newProp("AllowedCPUs", bits))
}
if mems != "" {
bits, err := RangeToBits(mems)
if err != nil {
return fmt.Errorf("resources.CPU.Mems=%q conversion error: %w",
mems, err)
}
*props = append(*props,
newProp("AllowedMemoryNodes", bits))
}
return nil
}

View File

@ -1,60 +0,0 @@
package systemd
import (
"errors"
"math/big"
"strconv"
"strings"
)
// RangeToBits converts a text representation of a CPU mask (as written to
// or read from cgroups' cpuset.* files, e.g. "1,3-5") to a slice of bytes
// with the corresponding bits set (as consumed by systemd over dbus as
// AllowedCPUs/AllowedMemoryNodes unit property value).
func RangeToBits(str string) ([]byte, error) {
bits := new(big.Int)
for _, r := range strings.Split(str, ",") {
// allow extra spaces around
r = strings.TrimSpace(r)
// allow empty elements (extra commas)
if r == "" {
continue
}
ranges := strings.SplitN(r, "-", 2)
if len(ranges) > 1 {
start, err := strconv.ParseUint(ranges[0], 10, 32)
if err != nil {
return nil, err
}
end, err := strconv.ParseUint(ranges[1], 10, 32)
if err != nil {
return nil, err
}
if start > end {
return nil, errors.New("invalid range: " + r)
}
for i := start; i <= end; i++ {
bits.SetBit(bits, int(i), 1)
}
} else {
val, err := strconv.ParseUint(ranges[0], 10, 32)
if err != nil {
return nil, err
}
bits.SetBit(bits, int(val), 1)
}
}
ret := bits.Bytes()
if len(ret) == 0 {
// do not allow empty values
return nil, errors.New("empty value")
}
// fit cpuset parsing order in systemd
for l, r := 0, len(ret)-1; l < r; l, r = l+1, r-1 {
ret[l], ret[r] = ret[r], ret[l]
}
return ret, nil
}

View File

@ -1,102 +0,0 @@
package systemd
import (
"context"
"errors"
"fmt"
"sync"
systemdDbus "github.com/coreos/go-systemd/v22/dbus"
dbus "github.com/godbus/dbus/v5"
)
var (
dbusC *systemdDbus.Conn
dbusMu sync.RWMutex
dbusInited bool
dbusRootless bool
)
type dbusConnManager struct{}
// newDbusConnManager initializes systemd dbus connection manager.
func newDbusConnManager(rootless bool) *dbusConnManager {
dbusMu.Lock()
defer dbusMu.Unlock()
if dbusInited && rootless != dbusRootless {
panic("can't have both root and rootless dbus")
}
dbusInited = true
dbusRootless = rootless
return &dbusConnManager{}
}
// getConnection lazily initializes and returns systemd dbus connection.
func (d *dbusConnManager) getConnection() (*systemdDbus.Conn, error) {
// In the case where dbusC != nil
// Use the read lock the first time to ensure
// that Conn can be acquired at the same time.
dbusMu.RLock()
if conn := dbusC; conn != nil {
dbusMu.RUnlock()
return conn, nil
}
dbusMu.RUnlock()
// In the case where dbusC == nil
// Use write lock to ensure that only one
// will be created
dbusMu.Lock()
defer dbusMu.Unlock()
if conn := dbusC; conn != nil {
return conn, nil
}
conn, err := d.newConnection()
if err != nil {
// When dbus-user-session is not installed, we can't detect whether we should try to connect to user dbus or system dbus, so d.dbusRootless is set to false.
// This may fail with a cryptic error "read unix @->/run/systemd/private: read: connection reset by peer: unknown."
// https://github.com/moby/moby/issues/42793
return nil, fmt.Errorf("failed to connect to dbus (hint: for rootless containers, maybe you need to install dbus-user-session package, see https://github.com/opencontainers/runc/blob/master/docs/cgroup-v2.md): %w", err)
}
dbusC = conn
return conn, nil
}
func (d *dbusConnManager) newConnection() (*systemdDbus.Conn, error) {
if dbusRootless {
return newUserSystemdDbus()
}
return systemdDbus.NewWithContext(context.TODO())
}
// resetConnection resets the connection to its initial state
// (so it can be reconnected if necessary).
func (d *dbusConnManager) resetConnection(conn *systemdDbus.Conn) {
dbusMu.Lock()
defer dbusMu.Unlock()
if dbusC != nil && dbusC == conn {
dbusC.Close()
dbusC = nil
}
}
// retryOnDisconnect calls op, and if the error it returns is about closed dbus
// connection, the connection is re-established and the op is retried. This helps
// with the situation when dbus is restarted and we have a stale connection.
func (d *dbusConnManager) retryOnDisconnect(op func(*systemdDbus.Conn) error) error {
for {
conn, err := d.getConnection()
if err != nil {
return err
}
err = op(conn)
if err == nil {
return nil
}
if !errors.Is(err, dbus.ErrClosed) {
return err
}
d.resetConnection(conn)
}
}

View File

@ -1,106 +0,0 @@
package systemd
import (
"bufio"
"bytes"
"errors"
"fmt"
"os"
"os/exec"
"path/filepath"
"strconv"
"strings"
systemdDbus "github.com/coreos/go-systemd/v22/dbus"
dbus "github.com/godbus/dbus/v5"
"github.com/opencontainers/runc/libcontainer/userns"
)
// newUserSystemdDbus creates a connection for systemd user-instance.
func newUserSystemdDbus() (*systemdDbus.Conn, error) {
addr, err := DetectUserDbusSessionBusAddress()
if err != nil {
return nil, err
}
uid, err := DetectUID()
if err != nil {
return nil, err
}
return systemdDbus.NewConnection(func() (*dbus.Conn, error) {
conn, err := dbus.Dial(addr)
if err != nil {
return nil, fmt.Errorf("error while dialing %q: %w", addr, err)
}
methods := []dbus.Auth{dbus.AuthExternal(strconv.Itoa(uid))}
err = conn.Auth(methods)
if err != nil {
conn.Close()
return nil, fmt.Errorf("error while authenticating connection (address=%q, UID=%d): %w", addr, uid, err)
}
if err = conn.Hello(); err != nil {
conn.Close()
return nil, fmt.Errorf("error while sending Hello message (address=%q, UID=%d): %w", addr, uid, err)
}
return conn, nil
})
}
// DetectUID detects UID from the OwnerUID field of `busctl --user status`
// if running in userNS. The value corresponds to sd_bus_creds_get_owner_uid(3) .
//
// Otherwise returns os.Getuid() .
func DetectUID() (int, error) {
if !userns.RunningInUserNS() {
return os.Getuid(), nil
}
b, err := exec.Command("busctl", "--user", "--no-pager", "status").CombinedOutput()
if err != nil {
return -1, fmt.Errorf("could not execute `busctl --user --no-pager status` (output: %q): %w", string(b), err)
}
scanner := bufio.NewScanner(bytes.NewReader(b))
for scanner.Scan() {
s := strings.TrimSpace(scanner.Text())
if strings.HasPrefix(s, "OwnerUID=") {
uidStr := strings.TrimPrefix(s, "OwnerUID=")
i, err := strconv.Atoi(uidStr)
if err != nil {
return -1, fmt.Errorf("could not detect the OwnerUID: %w", err)
}
return i, nil
}
}
if err := scanner.Err(); err != nil {
return -1, err
}
return -1, errors.New("could not detect the OwnerUID")
}
// DetectUserDbusSessionBusAddress returns $DBUS_SESSION_BUS_ADDRESS if set.
// Otherwise returns "unix:path=$XDG_RUNTIME_DIR/bus" if $XDG_RUNTIME_DIR/bus exists.
// Otherwise parses the value from `systemctl --user show-environment` .
func DetectUserDbusSessionBusAddress() (string, error) {
if env := os.Getenv("DBUS_SESSION_BUS_ADDRESS"); env != "" {
return env, nil
}
if xdr := os.Getenv("XDG_RUNTIME_DIR"); xdr != "" {
busPath := filepath.Join(xdr, "bus")
if _, err := os.Stat(busPath); err == nil {
busAddress := "unix:path=" + busPath
return busAddress, nil
}
}
b, err := exec.Command("systemctl", "--user", "--no-pager", "show-environment").CombinedOutput()
if err != nil {
return "", fmt.Errorf("could not execute `systemctl --user --no-pager show-environment` (output=%q): %w", string(b), err)
}
scanner := bufio.NewScanner(bytes.NewReader(b))
for scanner.Scan() {
s := strings.TrimSpace(scanner.Text())
if strings.HasPrefix(s, "DBUS_SESSION_BUS_ADDRESS=") {
return strings.TrimPrefix(s, "DBUS_SESSION_BUS_ADDRESS="), nil
}
}
return "", errors.New("could not detect DBUS_SESSION_BUS_ADDRESS from `systemctl --user --no-pager show-environment`. Make sure you have installed the dbus-user-session or dbus-daemon package and then run: `systemctl --user start dbus`")
}

View File

@ -1,480 +0,0 @@
package systemd
import (
"errors"
"os"
"path/filepath"
"reflect"
"strings"
"sync"
systemdDbus "github.com/coreos/go-systemd/v22/dbus"
"github.com/godbus/dbus/v5"
"github.com/sirupsen/logrus"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fs"
"github.com/opencontainers/runc/libcontainer/configs"
)
type legacyManager struct {
mu sync.Mutex
cgroups *configs.Cgroup
paths map[string]string
dbus *dbusConnManager
}
func NewLegacyManager(cg *configs.Cgroup, paths map[string]string) (cgroups.Manager, error) {
if cg.Rootless {
return nil, errors.New("cannot use rootless systemd cgroups manager on cgroup v1")
}
if cg.Resources != nil && cg.Resources.Unified != nil {
return nil, cgroups.ErrV1NoUnified
}
if paths == nil {
var err error
paths, err = initPaths(cg)
if err != nil {
return nil, err
}
}
return &legacyManager{
cgroups: cg,
paths: paths,
dbus: newDbusConnManager(false),
}, nil
}
type subsystem interface {
// Name returns the name of the subsystem.
Name() string
// Returns the stats, as 'stats', corresponding to the cgroup under 'path'.
GetStats(path string, stats *cgroups.Stats) error
// Set sets cgroup resource limits.
Set(path string, r *configs.Resources) error
}
var errSubsystemDoesNotExist = errors.New("cgroup: subsystem does not exist")
var legacySubsystems = []subsystem{
&fs.CpusetGroup{},
&fs.DevicesGroup{},
&fs.MemoryGroup{},
&fs.CpuGroup{},
&fs.CpuacctGroup{},
&fs.PidsGroup{},
&fs.BlkioGroup{},
&fs.HugetlbGroup{},
&fs.PerfEventGroup{},
&fs.FreezerGroup{},
&fs.NetPrioGroup{},
&fs.NetClsGroup{},
&fs.NameGroup{GroupName: "name=systemd"},
&fs.RdmaGroup{},
&fs.NameGroup{GroupName: "misc"},
}
func genV1ResourcesProperties(r *configs.Resources, cm *dbusConnManager) ([]systemdDbus.Property, error) {
var properties []systemdDbus.Property
deviceProperties, err := generateDeviceProperties(r, systemdVersion(cm))
if err != nil {
return nil, err
}
properties = append(properties, deviceProperties...)
if r.Memory != 0 {
properties = append(properties,
newProp("MemoryLimit", uint64(r.Memory)))
}
if r.CpuShares != 0 {
properties = append(properties,
newProp("CPUShares", r.CpuShares))
}
addCpuQuota(cm, &properties, r.CpuQuota, r.CpuPeriod)
if r.BlkioWeight != 0 {
properties = append(properties,
newProp("BlockIOWeight", uint64(r.BlkioWeight)))
}
if r.PidsLimit > 0 || r.PidsLimit == -1 {
properties = append(properties,
newProp("TasksMax", uint64(r.PidsLimit)))
}
err = addCpuset(cm, &properties, r.CpusetCpus, r.CpusetMems)
if err != nil {
return nil, err
}
return properties, nil
}
// initPaths figures out and returns paths to cgroups.
func initPaths(c *configs.Cgroup) (map[string]string, error) {
slice := "system.slice"
if c.Parent != "" {
var err error
slice, err = ExpandSlice(c.Parent)
if err != nil {
return nil, err
}
}
unit := getUnitName(c)
paths := make(map[string]string)
for _, s := range legacySubsystems {
subsystemPath, err := getSubsystemPath(slice, unit, s.Name())
if err != nil {
// Even if it's `not found` error, we'll return err
// because devices cgroup is hard requirement for
// container security.
if s.Name() == "devices" {
return nil, err
}
// Don't fail if a cgroup hierarchy was not found, just skip this subsystem
if cgroups.IsNotFound(err) {
continue
}
return nil, err
}
paths[s.Name()] = subsystemPath
}
// If systemd is using cgroups-hybrid mode then add the slice path of
// this container to the paths so the following process executed with
// "runc exec" joins that cgroup as well.
if cgroups.IsCgroup2HybridMode() {
// "" means cgroup-hybrid path
cgroupsHybridPath, err := getSubsystemPath(slice, unit, "")
if err != nil && cgroups.IsNotFound(err) {
return nil, err
}
paths[""] = cgroupsHybridPath
}
return paths, nil
}
func (m *legacyManager) Apply(pid int) error {
var (
c = m.cgroups
unitName = getUnitName(c)
slice = "system.slice"
properties []systemdDbus.Property
)
m.mu.Lock()
defer m.mu.Unlock()
if c.Parent != "" {
slice = c.Parent
}
properties = append(properties, systemdDbus.PropDescription("libcontainer container "+c.Name))
if strings.HasSuffix(unitName, ".slice") {
// If we create a slice, the parent is defined via a Wants=.
properties = append(properties, systemdDbus.PropWants(slice))
} else {
// Otherwise it's a scope, which we put into a Slice=.
properties = append(properties, systemdDbus.PropSlice(slice))
// Assume scopes always support delegation (supported since systemd v218).
properties = append(properties, newProp("Delegate", true))
}
// only add pid if its valid, -1 is used w/ general slice creation.
if pid != -1 {
properties = append(properties, newProp("PIDs", []uint32{uint32(pid)}))
}
// Always enable accounting, this gets us the same behaviour as the fs implementation,
// plus the kernel has some problems with joining the memory cgroup at a later time.
properties = append(properties,
newProp("MemoryAccounting", true),
newProp("CPUAccounting", true),
newProp("BlockIOAccounting", true),
newProp("TasksAccounting", true),
)
// Assume DefaultDependencies= will always work (the check for it was previously broken.)
properties = append(properties,
newProp("DefaultDependencies", false))
properties = append(properties, c.SystemdProps...)
if err := startUnit(m.dbus, unitName, properties, pid == -1); err != nil {
return err
}
if err := m.joinCgroups(pid); err != nil {
return err
}
return nil
}
func (m *legacyManager) Destroy() error {
m.mu.Lock()
defer m.mu.Unlock()
stopErr := stopUnit(m.dbus, getUnitName(m.cgroups))
// Both on success and on error, cleanup all the cgroups
// we are aware of, as some of them were created directly
// by Apply() and are not managed by systemd.
if err := cgroups.RemovePaths(m.paths); err != nil && stopErr == nil {
return err
}
return stopErr
}
func (m *legacyManager) Path(subsys string) string {
m.mu.Lock()
defer m.mu.Unlock()
return m.paths[subsys]
}
func (m *legacyManager) joinCgroups(pid int) error {
for _, sys := range legacySubsystems {
name := sys.Name()
switch name {
case "name=systemd":
// let systemd handle this
case "cpuset":
if path, ok := m.paths[name]; ok {
s := &fs.CpusetGroup{}
if err := s.ApplyDir(path, m.cgroups.Resources, pid); err != nil {
return err
}
}
default:
if path, ok := m.paths[name]; ok {
if err := os.MkdirAll(path, 0o755); err != nil {
return err
}
if err := cgroups.WriteCgroupProc(path, pid); err != nil {
return err
}
}
}
}
return nil
}
func getSubsystemPath(slice, unit, subsystem string) (string, error) {
mountpoint, err := cgroups.FindCgroupMountpoint("", subsystem)
if err != nil {
return "", err
}
return filepath.Join(mountpoint, slice, unit), nil
}
func (m *legacyManager) Freeze(state configs.FreezerState) error {
err := m.doFreeze(state)
if err == nil {
m.cgroups.Resources.Freezer = state
}
return err
}
// doFreeze is the same as Freeze but without
// changing the m.cgroups.Resources.Frozen field.
func (m *legacyManager) doFreeze(state configs.FreezerState) error {
path, ok := m.paths["freezer"]
if !ok {
return errSubsystemDoesNotExist
}
freezer := &fs.FreezerGroup{}
resources := &configs.Resources{Freezer: state}
return freezer.Set(path, resources)
}
func (m *legacyManager) GetPids() ([]int, error) {
path, ok := m.paths["devices"]
if !ok {
return nil, errSubsystemDoesNotExist
}
return cgroups.GetPids(path)
}
func (m *legacyManager) GetAllPids() ([]int, error) {
path, ok := m.paths["devices"]
if !ok {
return nil, errSubsystemDoesNotExist
}
return cgroups.GetAllPids(path)
}
func (m *legacyManager) GetStats() (*cgroups.Stats, error) {
m.mu.Lock()
defer m.mu.Unlock()
stats := cgroups.NewStats()
for _, sys := range legacySubsystems {
path := m.paths[sys.Name()]
if path == "" {
continue
}
if err := sys.GetStats(path, stats); err != nil {
return nil, err
}
}
return stats, nil
}
// freezeBeforeSet answers whether there is a need to freeze the cgroup before
// applying its systemd unit properties, and thaw after, while avoiding
// unnecessary freezer state changes.
//
// The reason why we have to freeze is that systemd's application of device
// rules is done disruptively, resulting in spurious errors to common devices
// (unlike our fs driver, they will happily write deny-all rules to running
// containers). So we have to freeze the container to avoid the container get
// an occasional "permission denied" error.
func (m *legacyManager) freezeBeforeSet(unitName string, r *configs.Resources) (needsFreeze, needsThaw bool, err error) {
// Special case for SkipDevices, as used by Kubernetes to create pod
// cgroups with allow-all device policy).
if r.SkipDevices {
if r.SkipFreezeOnSet {
// Both needsFreeze and needsThaw are false.
return
}
// No need to freeze if SkipDevices is set, and either
// (1) systemd unit does not (yet) exist, or
// (2) it has DevicePolicy=auto and empty DeviceAllow list.
//
// Interestingly, (1) and (2) are the same here because
// a non-existent unit returns default properties,
// and settings in (2) are the defaults.
//
// Do not return errors from getUnitTypeProperty, as they alone
// should not prevent Set from working.
unitType := getUnitType(unitName)
devPolicy, e := getUnitTypeProperty(m.dbus, unitName, unitType, "DevicePolicy")
if e == nil && devPolicy.Value == dbus.MakeVariant("auto") {
devAllow, e := getUnitTypeProperty(m.dbus, unitName, unitType, "DeviceAllow")
if e == nil {
if rv := reflect.ValueOf(devAllow.Value.Value()); rv.Kind() == reflect.Slice && rv.Len() == 0 {
needsFreeze = false
needsThaw = false
return
}
}
}
}
needsFreeze = true
needsThaw = true
// Check the current freezer state.
freezerState, err := m.GetFreezerState()
if err != nil {
return
}
if freezerState == configs.Frozen {
// Already frozen, and should stay frozen.
needsFreeze = false
needsThaw = false
}
if r.Freezer == configs.Frozen {
// Will be frozen anyway -- no need to thaw.
needsThaw = false
}
return
}
func (m *legacyManager) Set(r *configs.Resources) error {
if r == nil {
return nil
}
if r.Unified != nil {
return cgroups.ErrV1NoUnified
}
properties, err := genV1ResourcesProperties(r, m.dbus)
if err != nil {
return err
}
unitName := getUnitName(m.cgroups)
needsFreeze, needsThaw, err := m.freezeBeforeSet(unitName, r)
if err != nil {
return err
}
if needsFreeze {
if err := m.doFreeze(configs.Frozen); err != nil {
// If freezer cgroup isn't supported, we just warn about it.
logrus.Infof("freeze container before SetUnitProperties failed: %v", err)
// skip update the cgroup while frozen failed. #3803
if !errors.Is(err, errSubsystemDoesNotExist) {
if needsThaw {
if thawErr := m.doFreeze(configs.Thawed); thawErr != nil {
logrus.Infof("thaw container after doFreeze failed: %v", thawErr)
}
}
return err
}
}
}
setErr := setUnitProperties(m.dbus, unitName, properties...)
if needsThaw {
if err := m.doFreeze(configs.Thawed); err != nil {
logrus.Infof("thaw container after SetUnitProperties failed: %v", err)
}
}
if setErr != nil {
return setErr
}
for _, sys := range legacySubsystems {
// Get the subsystem path, but don't error out for not found cgroups.
path, ok := m.paths[sys.Name()]
if !ok {
continue
}
if err := sys.Set(path, r); err != nil {
return err
}
}
return nil
}
func (m *legacyManager) GetPaths() map[string]string {
m.mu.Lock()
defer m.mu.Unlock()
return m.paths
}
func (m *legacyManager) GetCgroups() (*configs.Cgroup, error) {
return m.cgroups, nil
}
func (m *legacyManager) GetFreezerState() (configs.FreezerState, error) {
path, ok := m.paths["freezer"]
if !ok {
return configs.Undefined, nil
}
freezer := &fs.FreezerGroup{}
return freezer.GetState(path)
}
func (m *legacyManager) Exists() bool {
return cgroups.PathExists(m.Path("devices"))
}
func (m *legacyManager) OOMKillCount() (uint64, error) {
return fs.OOMKillCount(m.Path("memory"))
}

View File

@ -1,472 +0,0 @@
package systemd
import (
"bufio"
"errors"
"fmt"
"math"
"os"
"path/filepath"
"strconv"
"strings"
"sync"
systemdDbus "github.com/coreos/go-systemd/v22/dbus"
securejoin "github.com/cyphar/filepath-securejoin"
"github.com/sirupsen/logrus"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fs2"
"github.com/opencontainers/runc/libcontainer/configs"
)
type unifiedManager struct {
mu sync.Mutex
cgroups *configs.Cgroup
// path is like "/sys/fs/cgroup/user.slice/user-1001.slice/session-1.scope"
path string
dbus *dbusConnManager
fsMgr cgroups.Manager
}
func NewUnifiedManager(config *configs.Cgroup, path string) (cgroups.Manager, error) {
m := &unifiedManager{
cgroups: config,
path: path,
dbus: newDbusConnManager(config.Rootless),
}
if err := m.initPath(); err != nil {
return nil, err
}
fsMgr, err := fs2.NewManager(config, m.path)
if err != nil {
return nil, err
}
m.fsMgr = fsMgr
return m, nil
}
// unifiedResToSystemdProps tries to convert from Cgroup.Resources.Unified
// key/value map (where key is cgroupfs file name) to systemd unit properties.
// This is on a best-effort basis, so the properties that are not known
// (to this function and/or systemd) are ignored (but logged with "debug"
// log level).
//
// For the list of keys, see https://www.kernel.org/doc/Documentation/cgroup-v2.txt
//
// For the list of systemd unit properties, see systemd.resource-control(5).
func unifiedResToSystemdProps(cm *dbusConnManager, res map[string]string) (props []systemdDbus.Property, _ error) {
var err error
for k, v := range res {
if strings.Contains(k, "/") {
return nil, fmt.Errorf("unified resource %q must be a file name (no slashes)", k)
}
sk := strings.SplitN(k, ".", 2)
if len(sk) != 2 {
return nil, fmt.Errorf("unified resource %q must be in the form CONTROLLER.PARAMETER", k)
}
// Kernel is quite forgiving to extra whitespace
// around the value, and so should we.
v = strings.TrimSpace(v)
// Please keep cases in alphabetical order.
switch k {
case "cpu.max":
// value: quota [period]
quota := int64(0) // 0 means "unlimited" for addCpuQuota, if period is set
period := defCPUQuotaPeriod
sv := strings.Fields(v)
if len(sv) < 1 || len(sv) > 2 {
return nil, fmt.Errorf("unified resource %q value invalid: %q", k, v)
}
// quota
if sv[0] != "max" {
quota, err = strconv.ParseInt(sv[0], 10, 64)
if err != nil {
return nil, fmt.Errorf("unified resource %q period value conversion error: %w", k, err)
}
}
// period
if len(sv) == 2 {
period, err = strconv.ParseUint(sv[1], 10, 64)
if err != nil {
return nil, fmt.Errorf("unified resource %q quota value conversion error: %w", k, err)
}
}
addCpuQuota(cm, &props, quota, period)
case "cpu.weight":
num, err := strconv.ParseUint(v, 10, 64)
if err != nil {
return nil, fmt.Errorf("unified resource %q value conversion error: %w", k, err)
}
props = append(props,
newProp("CPUWeight", num))
case "cpuset.cpus", "cpuset.mems":
bits, err := RangeToBits(v)
if err != nil {
return nil, fmt.Errorf("unified resource %q=%q conversion error: %w", k, v, err)
}
m := map[string]string{
"cpuset.cpus": "AllowedCPUs",
"cpuset.mems": "AllowedMemoryNodes",
}
// systemd only supports these properties since v244
sdVer := systemdVersion(cm)
if sdVer >= 244 {
props = append(props,
newProp(m[k], bits))
} else {
logrus.Debugf("systemd v%d is too old to support %s"+
" (setting will still be applied to cgroupfs)",
sdVer, m[k])
}
case "memory.high", "memory.low", "memory.min", "memory.max", "memory.swap.max":
num := uint64(math.MaxUint64)
if v != "max" {
num, err = strconv.ParseUint(v, 10, 64)
if err != nil {
return nil, fmt.Errorf("unified resource %q value conversion error: %w", k, err)
}
}
m := map[string]string{
"memory.high": "MemoryHigh",
"memory.low": "MemoryLow",
"memory.min": "MemoryMin",
"memory.max": "MemoryMax",
"memory.swap.max": "MemorySwapMax",
}
props = append(props,
newProp(m[k], num))
case "pids.max":
num := uint64(math.MaxUint64)
if v != "max" {
var err error
num, err = strconv.ParseUint(v, 10, 64)
if err != nil {
return nil, fmt.Errorf("unified resource %q value conversion error: %w", k, err)
}
}
props = append(props,
newProp("TasksMax", num))
case "memory.oom.group":
// Setting this to 1 is roughly equivalent to OOMPolicy=kill
// (as per systemd.service(5) and
// https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html),
// but it's not clear what to do if it is unset or set
// to 0 in runc update, as there are two other possible
// values for OOMPolicy (continue/stop).
fallthrough
default:
// Ignore the unknown resource here -- will still be
// applied in Set which calls fs2.Set.
logrus.Debugf("don't know how to convert unified resource %q=%q to systemd unit property; skipping (will still be applied to cgroupfs)", k, v)
}
}
return props, nil
}
func genV2ResourcesProperties(r *configs.Resources, cm *dbusConnManager) ([]systemdDbus.Property, error) {
var properties []systemdDbus.Property
// NOTE: This is of questionable correctness because we insert our own
// devices eBPF program later. Two programs with identical rules
// aren't the end of the world, but it is a bit concerning. However
// it's unclear if systemd removes all eBPF programs attached when
// doing SetUnitProperties...
deviceProperties, err := generateDeviceProperties(r, systemdVersion(cm))
if err != nil {
return nil, err
}
properties = append(properties, deviceProperties...)
if r.Memory != 0 {
properties = append(properties,
newProp("MemoryMax", uint64(r.Memory)))
}
if r.MemoryReservation != 0 {
properties = append(properties,
newProp("MemoryLow", uint64(r.MemoryReservation)))
}
swap, err := cgroups.ConvertMemorySwapToCgroupV2Value(r.MemorySwap, r.Memory)
if err != nil {
return nil, err
}
if swap != 0 {
properties = append(properties,
newProp("MemorySwapMax", uint64(swap)))
}
if r.CpuWeight != 0 {
properties = append(properties,
newProp("CPUWeight", r.CpuWeight))
}
addCpuQuota(cm, &properties, r.CpuQuota, r.CpuPeriod)
if r.PidsLimit > 0 || r.PidsLimit == -1 {
properties = append(properties,
newProp("TasksMax", uint64(r.PidsLimit)))
}
err = addCpuset(cm, &properties, r.CpusetCpus, r.CpusetMems)
if err != nil {
return nil, err
}
// ignore r.KernelMemory
// convert Resources.Unified map to systemd properties
if r.Unified != nil {
unifiedProps, err := unifiedResToSystemdProps(cm, r.Unified)
if err != nil {
return nil, err
}
properties = append(properties, unifiedProps...)
}
return properties, nil
}
func (m *unifiedManager) Apply(pid int) error {
var (
c = m.cgroups
unitName = getUnitName(c)
properties []systemdDbus.Property
)
slice := "system.slice"
if m.cgroups.Rootless {
slice = "user.slice"
}
if c.Parent != "" {
slice = c.Parent
}
properties = append(properties, systemdDbus.PropDescription("libcontainer container "+c.Name))
if strings.HasSuffix(unitName, ".slice") {
// If we create a slice, the parent is defined via a Wants=.
properties = append(properties, systemdDbus.PropWants(slice))
} else {
// Otherwise it's a scope, which we put into a Slice=.
properties = append(properties, systemdDbus.PropSlice(slice))
// Assume scopes always support delegation (supported since systemd v218).
properties = append(properties, newProp("Delegate", true))
}
// only add pid if its valid, -1 is used w/ general slice creation.
if pid != -1 {
properties = append(properties, newProp("PIDs", []uint32{uint32(pid)}))
}
// Always enable accounting, this gets us the same behaviour as the fs implementation,
// plus the kernel has some problems with joining the memory cgroup at a later time.
properties = append(properties,
newProp("MemoryAccounting", true),
newProp("CPUAccounting", true),
newProp("IOAccounting", true),
newProp("TasksAccounting", true),
)
// Assume DefaultDependencies= will always work (the check for it was previously broken.)
properties = append(properties,
newProp("DefaultDependencies", false))
properties = append(properties, c.SystemdProps...)
if err := startUnit(m.dbus, unitName, properties, pid == -1); err != nil {
return fmt.Errorf("unable to start unit %q (properties %+v): %w", unitName, properties, err)
}
if err := fs2.CreateCgroupPath(m.path, m.cgroups); err != nil {
return err
}
if c.OwnerUID != nil {
// The directory itself must be chowned.
err := os.Chown(m.path, *c.OwnerUID, -1)
if err != nil {
return err
}
filesToChown, err := cgroupFilesToChown()
if err != nil {
return err
}
for _, v := range filesToChown {
err := os.Chown(m.path+"/"+v, *c.OwnerUID, -1)
// Some files might not be present.
if err != nil && !errors.Is(err, os.ErrNotExist) {
return err
}
}
}
return nil
}
// The kernel exposes a list of files that should be chowned to the delegate
// uid in /sys/kernel/cgroup/delegate. If the file is not present
// (Linux < 4.15), use the initial values mentioned in cgroups(7).
func cgroupFilesToChown() ([]string, error) {
const cgroupDelegateFile = "/sys/kernel/cgroup/delegate"
f, err := os.Open(cgroupDelegateFile)
if err != nil {
return []string{"cgroup.procs", "cgroup.subtree_control", "cgroup.threads"}, nil
}
defer f.Close()
filesToChown := []string{}
scanner := bufio.NewScanner(f)
for scanner.Scan() {
filesToChown = append(filesToChown, scanner.Text())
}
if err := scanner.Err(); err != nil {
return nil, fmt.Errorf("error reading %s: %w", cgroupDelegateFile, err)
}
return filesToChown, nil
}
func (m *unifiedManager) Destroy() error {
m.mu.Lock()
defer m.mu.Unlock()
unitName := getUnitName(m.cgroups)
if err := stopUnit(m.dbus, unitName); err != nil {
return err
}
// systemd 239 do not remove sub-cgroups.
err := m.fsMgr.Destroy()
// fsMgr.Destroy has handled ErrNotExist
if err != nil {
return err
}
return nil
}
func (m *unifiedManager) Path(_ string) string {
return m.path
}
// getSliceFull value is used in initPath.
// The value is incompatible with systemdDbus.PropSlice.
func (m *unifiedManager) getSliceFull() (string, error) {
c := m.cgroups
slice := "system.slice"
if c.Rootless {
slice = "user.slice"
}
if c.Parent != "" {
var err error
slice, err = ExpandSlice(c.Parent)
if err != nil {
return "", err
}
}
if c.Rootless {
// managerCG is typically "/user.slice/user-${uid}.slice/user@${uid}.service".
managerCG, err := getManagerProperty(m.dbus, "ControlGroup")
if err != nil {
return "", err
}
slice = filepath.Join(managerCG, slice)
}
// an example of the final slice in rootless: "/user.slice/user-1001.slice/user@1001.service/user.slice"
// NOTE: systemdDbus.PropSlice requires the "/user.slice/user-1001.slice/user@1001.service/" prefix NOT to be specified.
return slice, nil
}
func (m *unifiedManager) initPath() error {
if m.path != "" {
return nil
}
sliceFull, err := m.getSliceFull()
if err != nil {
return err
}
c := m.cgroups
path := filepath.Join(sliceFull, getUnitName(c))
path, err = securejoin.SecureJoin(fs2.UnifiedMountpoint, path)
if err != nil {
return err
}
// an example of the final path in rootless:
// "/sys/fs/cgroup/user.slice/user-1001.slice/user@1001.service/user.slice/libpod-132ff0d72245e6f13a3bbc6cdc5376886897b60ac59eaa8dea1df7ab959cbf1c.scope"
m.path = path
return nil
}
func (m *unifiedManager) Freeze(state configs.FreezerState) error {
return m.fsMgr.Freeze(state)
}
func (m *unifiedManager) GetPids() ([]int, error) {
return cgroups.GetPids(m.path)
}
func (m *unifiedManager) GetAllPids() ([]int, error) {
return cgroups.GetAllPids(m.path)
}
func (m *unifiedManager) GetStats() (*cgroups.Stats, error) {
return m.fsMgr.GetStats()
}
func (m *unifiedManager) Set(r *configs.Resources) error {
if r == nil {
return nil
}
properties, err := genV2ResourcesProperties(r, m.dbus)
if err != nil {
return err
}
if err := setUnitProperties(m.dbus, getUnitName(m.cgroups), properties...); err != nil {
return fmt.Errorf("unable to set unit properties: %w", err)
}
return m.fsMgr.Set(r)
}
func (m *unifiedManager) GetPaths() map[string]string {
paths := make(map[string]string, 1)
paths[""] = m.path
return paths
}
func (m *unifiedManager) GetCgroups() (*configs.Cgroup, error) {
return m.cgroups, nil
}
func (m *unifiedManager) GetFreezerState() (configs.FreezerState, error) {
return m.fsMgr.GetFreezerState()
}
func (m *unifiedManager) Exists() bool {
return cgroups.PathExists(m.path)
}
func (m *unifiedManager) OOMKillCount() (uint64, error) {
return m.fsMgr.OOMKillCount()
}

View File

@ -1,469 +0,0 @@
package cgroups
import (
"bufio"
"errors"
"fmt"
"io"
"os"
"path/filepath"
"strconv"
"strings"
"sync"
"time"
"github.com/opencontainers/runc/libcontainer/userns"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
)
const (
CgroupProcesses = "cgroup.procs"
unifiedMountpoint = "/sys/fs/cgroup"
hybridMountpoint = "/sys/fs/cgroup/unified"
)
var (
isUnifiedOnce sync.Once
isUnified bool
isHybridOnce sync.Once
isHybrid bool
)
// IsCgroup2UnifiedMode returns whether we are running in cgroup v2 unified mode.
func IsCgroup2UnifiedMode() bool {
isUnifiedOnce.Do(func() {
var st unix.Statfs_t
err := unix.Statfs(unifiedMountpoint, &st)
if err != nil {
if os.IsNotExist(err) && userns.RunningInUserNS() {
// ignore the "not found" error if running in userns
logrus.WithError(err).Debugf("%s missing, assuming cgroup v1", unifiedMountpoint)
isUnified = false
return
}
panic(fmt.Sprintf("cannot statfs cgroup root: %s", err))
}
isUnified = st.Type == unix.CGROUP2_SUPER_MAGIC
})
return isUnified
}
// IsCgroup2HybridMode returns whether we are running in cgroup v2 hybrid mode.
func IsCgroup2HybridMode() bool {
isHybridOnce.Do(func() {
var st unix.Statfs_t
err := unix.Statfs(hybridMountpoint, &st)
if err != nil {
isHybrid = false
if !os.IsNotExist(err) {
// Report unexpected errors.
logrus.WithError(err).Debugf("statfs(%q) failed", hybridMountpoint)
}
return
}
isHybrid = st.Type == unix.CGROUP2_SUPER_MAGIC
})
return isHybrid
}
type Mount struct {
Mountpoint string
Root string
Subsystems []string
}
// GetCgroupMounts returns the mounts for the cgroup subsystems.
// all indicates whether to return just the first instance or all the mounts.
// This function should not be used from cgroupv2 code, as in this case
// all the controllers are available under the constant unifiedMountpoint.
func GetCgroupMounts(all bool) ([]Mount, error) {
if IsCgroup2UnifiedMode() {
// TODO: remove cgroupv2 case once all external users are converted
availableControllers, err := GetAllSubsystems()
if err != nil {
return nil, err
}
m := Mount{
Mountpoint: unifiedMountpoint,
Root: unifiedMountpoint,
Subsystems: availableControllers,
}
return []Mount{m}, nil
}
return getCgroupMountsV1(all)
}
// GetAllSubsystems returns all the cgroup subsystems supported by the kernel
func GetAllSubsystems() ([]string, error) {
// /proc/cgroups is meaningless for v2
// https://github.com/torvalds/linux/blob/v5.3/Documentation/admin-guide/cgroup-v2.rst#deprecated-v1-core-features
if IsCgroup2UnifiedMode() {
// "pseudo" controllers do not appear in /sys/fs/cgroup/cgroup.controllers.
// - devices: implemented in kernel 4.15
// - freezer: implemented in kernel 5.2
// We assume these are always available, as it is hard to detect availability.
pseudo := []string{"devices", "freezer"}
data, err := ReadFile("/sys/fs/cgroup", "cgroup.controllers")
if err != nil {
return nil, err
}
subsystems := append(pseudo, strings.Fields(data)...)
return subsystems, nil
}
f, err := os.Open("/proc/cgroups")
if err != nil {
return nil, err
}
defer f.Close()
subsystems := []string{}
s := bufio.NewScanner(f)
for s.Scan() {
text := s.Text()
if text[0] != '#' {
parts := strings.Fields(text)
if len(parts) >= 4 && parts[3] != "0" {
subsystems = append(subsystems, parts[0])
}
}
}
if err := s.Err(); err != nil {
return nil, err
}
return subsystems, nil
}
func readProcsFile(dir string) ([]int, error) {
f, err := OpenFile(dir, CgroupProcesses, os.O_RDONLY)
if err != nil {
return nil, err
}
defer f.Close()
var (
s = bufio.NewScanner(f)
out = []int{}
)
for s.Scan() {
if t := s.Text(); t != "" {
pid, err := strconv.Atoi(t)
if err != nil {
return nil, err
}
out = append(out, pid)
}
}
return out, s.Err()
}
// ParseCgroupFile parses the given cgroup file, typically /proc/self/cgroup
// or /proc/<pid>/cgroup, into a map of subsystems to cgroup paths, e.g.
//
// "cpu": "/user.slice/user-1000.slice"
// "pids": "/user.slice/user-1000.slice"
//
// etc.
//
// Note that for cgroup v2 unified hierarchy, there are no per-controller
// cgroup paths, so the resulting map will have a single element where the key
// is empty string ("") and the value is the cgroup path the <pid> is in.
func ParseCgroupFile(path string) (map[string]string, error) {
f, err := os.Open(path)
if err != nil {
return nil, err
}
defer f.Close()
return parseCgroupFromReader(f)
}
// helper function for ParseCgroupFile to make testing easier
func parseCgroupFromReader(r io.Reader) (map[string]string, error) {
s := bufio.NewScanner(r)
cgroups := make(map[string]string)
for s.Scan() {
text := s.Text()
// from cgroups(7):
// /proc/[pid]/cgroup
// ...
// For each cgroup hierarchy ... there is one entry
// containing three colon-separated fields of the form:
// hierarchy-ID:subsystem-list:cgroup-path
parts := strings.SplitN(text, ":", 3)
if len(parts) < 3 {
return nil, fmt.Errorf("invalid cgroup entry: must contain at least two colons: %v", text)
}
for _, subs := range strings.Split(parts[1], ",") {
cgroups[subs] = parts[2]
}
}
if err := s.Err(); err != nil {
return nil, err
}
return cgroups, nil
}
func PathExists(path string) bool {
if _, err := os.Stat(path); err != nil {
return false
}
return true
}
func EnterPid(cgroupPaths map[string]string, pid int) error {
for _, path := range cgroupPaths {
if PathExists(path) {
if err := WriteCgroupProc(path, pid); err != nil {
return err
}
}
}
return nil
}
func rmdir(path string) error {
err := unix.Rmdir(path)
if err == nil || err == unix.ENOENT { //nolint:errorlint // unix errors are bare
return nil
}
return &os.PathError{Op: "rmdir", Path: path, Err: err}
}
// RemovePath aims to remove cgroup path. It does so recursively,
// by removing any subdirectories (sub-cgroups) first.
func RemovePath(path string) error {
// try the fast path first
if err := rmdir(path); err == nil {
return nil
}
infos, err := os.ReadDir(path)
if err != nil {
if os.IsNotExist(err) {
err = nil
}
return err
}
for _, info := range infos {
if info.IsDir() {
// We should remove subcgroups dir first
if err = RemovePath(filepath.Join(path, info.Name())); err != nil {
break
}
}
}
if err == nil {
err = rmdir(path)
}
return err
}
// RemovePaths iterates over the provided paths removing them.
// We trying to remove all paths five times with increasing delay between tries.
// If after all there are not removed cgroups - appropriate error will be
// returned.
func RemovePaths(paths map[string]string) (err error) {
const retries = 5
delay := 10 * time.Millisecond
for i := 0; i < retries; i++ {
if i != 0 {
time.Sleep(delay)
delay *= 2
}
for s, p := range paths {
if err := RemovePath(p); err != nil {
// do not log intermediate iterations
switch i {
case 0:
logrus.WithError(err).Warnf("Failed to remove cgroup (will retry)")
case retries - 1:
logrus.WithError(err).Error("Failed to remove cgroup")
}
}
_, err := os.Stat(p)
// We need this strange way of checking cgroups existence because
// RemoveAll almost always returns error, even on already removed
// cgroups
if os.IsNotExist(err) {
delete(paths, s)
}
}
if len(paths) == 0 {
//nolint:ineffassign,staticcheck // done to help garbage collecting: opencontainers/runc#2506
paths = make(map[string]string)
return nil
}
}
return fmt.Errorf("Failed to remove paths: %v", paths)
}
var (
hugePageSizes []string
initHPSOnce sync.Once
)
func HugePageSizes() []string {
initHPSOnce.Do(func() {
dir, err := os.OpenFile("/sys/kernel/mm/hugepages", unix.O_DIRECTORY|unix.O_RDONLY, 0)
if err != nil {
return
}
files, err := dir.Readdirnames(0)
dir.Close()
if err != nil {
return
}
hugePageSizes, err = getHugePageSizeFromFilenames(files)
if err != nil {
logrus.Warn("HugePageSizes: ", err)
}
})
return hugePageSizes
}
func getHugePageSizeFromFilenames(fileNames []string) ([]string, error) {
pageSizes := make([]string, 0, len(fileNames))
var warn error
for _, file := range fileNames {
// example: hugepages-1048576kB
val := strings.TrimPrefix(file, "hugepages-")
if len(val) == len(file) {
// Unexpected file name: no prefix found, ignore it.
continue
}
// The suffix is always "kB" (as of Linux 5.13). If we find
// something else, produce an error but keep going.
eLen := len(val) - 2
val = strings.TrimSuffix(val, "kB")
if len(val) != eLen {
// Highly unlikely.
if warn == nil {
warn = errors.New(file + `: invalid suffix (expected "kB")`)
}
continue
}
size, err := strconv.Atoi(val)
if err != nil {
// Highly unlikely.
if warn == nil {
warn = fmt.Errorf("%s: %w", file, err)
}
continue
}
// Model after https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/mm/hugetlb_cgroup.c?id=eff48ddeab782e35e58ccc8853f7386bbae9dec4#n574
// but in our case the size is in KB already.
if size >= (1 << 20) {
val = strconv.Itoa(size>>20) + "GB"
} else if size >= (1 << 10) {
val = strconv.Itoa(size>>10) + "MB"
} else {
val += "KB"
}
pageSizes = append(pageSizes, val)
}
return pageSizes, warn
}
// GetPids returns all pids, that were added to cgroup at path.
func GetPids(dir string) ([]int, error) {
return readProcsFile(dir)
}
// WriteCgroupProc writes the specified pid into the cgroup's cgroup.procs file
func WriteCgroupProc(dir string, pid int) error {
// Normally dir should not be empty, one case is that cgroup subsystem
// is not mounted, we will get empty dir, and we want it fail here.
if dir == "" {
return fmt.Errorf("no such directory for %s", CgroupProcesses)
}
// Dont attach any pid to the cgroup if -1 is specified as a pid
if pid == -1 {
return nil
}
file, err := OpenFile(dir, CgroupProcesses, os.O_WRONLY)
if err != nil {
return fmt.Errorf("failed to write %v: %w", pid, err)
}
defer file.Close()
for i := 0; i < 5; i++ {
_, err = file.WriteString(strconv.Itoa(pid))
if err == nil {
return nil
}
// EINVAL might mean that the task being added to cgroup.procs is in state
// TASK_NEW. We should attempt to do so again.
if errors.Is(err, unix.EINVAL) {
time.Sleep(30 * time.Millisecond)
continue
}
return fmt.Errorf("failed to write %v: %w", pid, err)
}
return err
}
// Since the OCI spec is designed for cgroup v1, in some cases
// there is need to convert from the cgroup v1 configuration to cgroup v2
// the formula for cpuShares is y = (1 + ((x - 2) * 9999) / 262142)
// convert from [2-262144] to [1-10000]
// 262144 comes from Linux kernel definition "#define MAX_SHARES (1UL << 18)"
func ConvertCPUSharesToCgroupV2Value(cpuShares uint64) uint64 {
if cpuShares == 0 {
return 0
}
return (1 + ((cpuShares-2)*9999)/262142)
}
// ConvertMemorySwapToCgroupV2Value converts MemorySwap value from OCI spec
// for use by cgroup v2 drivers. A conversion is needed since Resources.MemorySwap
// is defined as memory+swap combined, while in cgroup v2 swap is a separate value.
func ConvertMemorySwapToCgroupV2Value(memorySwap, memory int64) (int64, error) {
// for compatibility with cgroup1 controller, set swap to unlimited in
// case the memory is set to unlimited, and swap is not explicitly set,
// treating the request as "set both memory and swap to unlimited".
if memory == -1 && memorySwap == 0 {
return -1, nil
}
if memorySwap == -1 || memorySwap == 0 {
// -1 is "max", 0 is "unset", so treat as is
return memorySwap, nil
}
// sanity checks
if memory == 0 || memory == -1 {
return 0, errors.New("unable to set swap limit without memory limit")
}
if memory < 0 {
return 0, fmt.Errorf("invalid memory value: %d", memory)
}
if memorySwap < memory {
return 0, errors.New("memory+swap limit should be >= memory limit")
}
return memorySwap - memory, nil
}
// Since the OCI spec is designed for cgroup v1, in some cases
// there is need to convert from the cgroup v1 configuration to cgroup v2
// the formula for BlkIOWeight to IOWeight is y = (1 + (x - 10) * 9999 / 990)
// convert linearly from [10-1000] to [1-10000]
func ConvertBlkIOToIOWeightValue(blkIoWeight uint16) uint64 {
if blkIoWeight == 0 {
return 0
}
return 1 + (uint64(blkIoWeight)-10)*9999/990
}

View File

@ -1,290 +0,0 @@
package cgroups
import (
"errors"
"fmt"
"os"
"path/filepath"
"strings"
"sync"
"syscall"
securejoin "github.com/cyphar/filepath-securejoin"
"github.com/moby/sys/mountinfo"
"golang.org/x/sys/unix"
)
// Code in this source file are specific to cgroup v1,
// and must not be used from any cgroup v2 code.
const (
CgroupNamePrefix = "name="
defaultPrefix = "/sys/fs/cgroup"
)
var (
errUnified = errors.New("not implemented for cgroup v2 unified hierarchy")
ErrV1NoUnified = errors.New("invalid configuration: cannot use unified on cgroup v1")
readMountinfoOnce sync.Once
readMountinfoErr error
cgroupMountinfo []*mountinfo.Info
)
type NotFoundError struct {
Subsystem string
}
func (e *NotFoundError) Error() string {
return fmt.Sprintf("mountpoint for %s not found", e.Subsystem)
}
func NewNotFoundError(sub string) error {
return &NotFoundError{
Subsystem: sub,
}
}
func IsNotFound(err error) bool {
var nfErr *NotFoundError
return errors.As(err, &nfErr)
}
func tryDefaultPath(cgroupPath, subsystem string) string {
if !strings.HasPrefix(defaultPrefix, cgroupPath) {
return ""
}
// remove possible prefix
subsystem = strings.TrimPrefix(subsystem, CgroupNamePrefix)
// Make sure we're still under defaultPrefix, and resolve
// a possible symlink (like cpu -> cpu,cpuacct).
path, err := securejoin.SecureJoin(defaultPrefix, subsystem)
if err != nil {
return ""
}
// (1) path should be a directory.
st, err := os.Lstat(path)
if err != nil || !st.IsDir() {
return ""
}
// (2) path should be a mount point.
pst, err := os.Lstat(filepath.Dir(path))
if err != nil {
return ""
}
if st.Sys().(*syscall.Stat_t).Dev == pst.Sys().(*syscall.Stat_t).Dev {
// parent dir has the same dev -- path is not a mount point
return ""
}
// (3) path should have 'cgroup' fs type.
fst := unix.Statfs_t{}
err = unix.Statfs(path, &fst)
if err != nil || fst.Type != unix.CGROUP_SUPER_MAGIC {
return ""
}
return path
}
// readCgroupMountinfo returns a list of cgroup v1 mounts (i.e. the ones
// with fstype of "cgroup") for the current running process.
//
// The results are cached (to avoid re-reading mountinfo which is relatively
// expensive), so it is assumed that cgroup mounts are not being changed.
func readCgroupMountinfo() ([]*mountinfo.Info, error) {
readMountinfoOnce.Do(func() {
cgroupMountinfo, readMountinfoErr = mountinfo.GetMounts(
mountinfo.FSTypeFilter("cgroup"),
)
})
return cgroupMountinfo, readMountinfoErr
}
// https://www.kernel.org/doc/Documentation/cgroup-v1/cgroups.txt
func FindCgroupMountpoint(cgroupPath, subsystem string) (string, error) {
if IsCgroup2UnifiedMode() {
return "", errUnified
}
// If subsystem is empty, we look for the cgroupv2 hybrid path.
if len(subsystem) == 0 {
return hybridMountpoint, nil
}
// Avoid parsing mountinfo by trying the default path first, if possible.
if path := tryDefaultPath(cgroupPath, subsystem); path != "" {
return path, nil
}
mnt, _, err := FindCgroupMountpointAndRoot(cgroupPath, subsystem)
return mnt, err
}
func FindCgroupMountpointAndRoot(cgroupPath, subsystem string) (string, string, error) {
if IsCgroup2UnifiedMode() {
return "", "", errUnified
}
mi, err := readCgroupMountinfo()
if err != nil {
return "", "", err
}
return findCgroupMountpointAndRootFromMI(mi, cgroupPath, subsystem)
}
func findCgroupMountpointAndRootFromMI(mounts []*mountinfo.Info, cgroupPath, subsystem string) (string, string, error) {
for _, mi := range mounts {
if strings.HasPrefix(mi.Mountpoint, cgroupPath) {
for _, opt := range strings.Split(mi.VFSOptions, ",") {
if opt == subsystem {
return mi.Mountpoint, mi.Root, nil
}
}
}
}
return "", "", NewNotFoundError(subsystem)
}
func (m Mount) GetOwnCgroup(cgroups map[string]string) (string, error) {
if len(m.Subsystems) == 0 {
return "", errors.New("no subsystem for mount")
}
return getControllerPath(m.Subsystems[0], cgroups)
}
func getCgroupMountsHelper(ss map[string]bool, mounts []*mountinfo.Info, all bool) ([]Mount, error) {
res := make([]Mount, 0, len(ss))
numFound := 0
for _, mi := range mounts {
m := Mount{
Mountpoint: mi.Mountpoint,
Root: mi.Root,
}
for _, opt := range strings.Split(mi.VFSOptions, ",") {
seen, known := ss[opt]
if !known || (!all && seen) {
continue
}
ss[opt] = true
opt = strings.TrimPrefix(opt, CgroupNamePrefix)
m.Subsystems = append(m.Subsystems, opt)
numFound++
}
if len(m.Subsystems) > 0 || all {
res = append(res, m)
}
if !all && numFound >= len(ss) {
break
}
}
return res, nil
}
func getCgroupMountsV1(all bool) ([]Mount, error) {
mi, err := readCgroupMountinfo()
if err != nil {
return nil, err
}
allSubsystems, err := ParseCgroupFile("/proc/self/cgroup")
if err != nil {
return nil, err
}
allMap := make(map[string]bool)
for s := range allSubsystems {
allMap[s] = false
}
return getCgroupMountsHelper(allMap, mi, all)
}
// GetOwnCgroup returns the relative path to the cgroup docker is running in.
func GetOwnCgroup(subsystem string) (string, error) {
if IsCgroup2UnifiedMode() {
return "", errUnified
}
cgroups, err := ParseCgroupFile("/proc/self/cgroup")
if err != nil {
return "", err
}
return getControllerPath(subsystem, cgroups)
}
func GetOwnCgroupPath(subsystem string) (string, error) {
cgroup, err := GetOwnCgroup(subsystem)
if err != nil {
return "", err
}
// If subsystem is empty, we look for the cgroupv2 hybrid path.
if len(subsystem) == 0 {
return hybridMountpoint, nil
}
return getCgroupPathHelper(subsystem, cgroup)
}
func GetInitCgroup(subsystem string) (string, error) {
if IsCgroup2UnifiedMode() {
return "", errUnified
}
cgroups, err := ParseCgroupFile("/proc/1/cgroup")
if err != nil {
return "", err
}
return getControllerPath(subsystem, cgroups)
}
func GetInitCgroupPath(subsystem string) (string, error) {
cgroup, err := GetInitCgroup(subsystem)
if err != nil {
return "", err
}
return getCgroupPathHelper(subsystem, cgroup)
}
func getCgroupPathHelper(subsystem, cgroup string) (string, error) {
mnt, root, err := FindCgroupMountpointAndRoot("", subsystem)
if err != nil {
return "", err
}
// This is needed for nested containers, because in /proc/self/cgroup we
// see paths from host, which don't exist in container.
relCgroup, err := filepath.Rel(root, cgroup)
if err != nil {
return "", err
}
return filepath.Join(mnt, relCgroup), nil
}
func getControllerPath(subsystem string, cgroups map[string]string) (string, error) {
if IsCgroup2UnifiedMode() {
return "", errUnified
}
if p, ok := cgroups[subsystem]; ok {
return p, nil
}
if p, ok := cgroups[CgroupNamePrefix+subsystem]; ok {
return p, nil
}
return "", NewNotFoundError(subsystem)
}

View File

@ -1,66 +0,0 @@
package configs
import "fmt"
// blockIODevice holds major:minor format supported in blkio cgroup
type blockIODevice struct {
// Major is the device's major number
Major int64 `json:"major"`
// Minor is the device's minor number
Minor int64 `json:"minor"`
}
// WeightDevice struct holds a `major:minor weight`|`major:minor leaf_weight` pair
type WeightDevice struct {
blockIODevice
// Weight is the bandwidth rate for the device, range is from 10 to 1000
Weight uint16 `json:"weight"`
// LeafWeight is the bandwidth rate for the device while competing with the cgroup's child cgroups, range is from 10 to 1000, cfq scheduler only
LeafWeight uint16 `json:"leafWeight"`
}
// NewWeightDevice returns a configured WeightDevice pointer
func NewWeightDevice(major, minor int64, weight, leafWeight uint16) *WeightDevice {
wd := &WeightDevice{}
wd.Major = major
wd.Minor = minor
wd.Weight = weight
wd.LeafWeight = leafWeight
return wd
}
// WeightString formats the struct to be writable to the cgroup specific file
func (wd *WeightDevice) WeightString() string {
return fmt.Sprintf("%d:%d %d", wd.Major, wd.Minor, wd.Weight)
}
// LeafWeightString formats the struct to be writable to the cgroup specific file
func (wd *WeightDevice) LeafWeightString() string {
return fmt.Sprintf("%d:%d %d", wd.Major, wd.Minor, wd.LeafWeight)
}
// ThrottleDevice struct holds a `major:minor rate_per_second` pair
type ThrottleDevice struct {
blockIODevice
// Rate is the IO rate limit per cgroup per device
Rate uint64 `json:"rate"`
}
// NewThrottleDevice returns a configured ThrottleDevice pointer
func NewThrottleDevice(major, minor int64, rate uint64) *ThrottleDevice {
td := &ThrottleDevice{}
td.Major = major
td.Minor = minor
td.Rate = rate
return td
}
// String formats the struct to be writable to the cgroup specific file
func (td *ThrottleDevice) String() string {
return fmt.Sprintf("%d:%d %d", td.Major, td.Minor, td.Rate)
}
// StringName formats the struct to be writable to the cgroup specific file
func (td *ThrottleDevice) StringName(name string) string {
return fmt.Sprintf("%d:%d %s=%d", td.Major, td.Minor, name, td.Rate)
}

View File

@ -1,158 +0,0 @@
package configs
import (
systemdDbus "github.com/coreos/go-systemd/v22/dbus"
"github.com/opencontainers/runc/libcontainer/devices"
)
type FreezerState string
const (
Undefined FreezerState = ""
Frozen FreezerState = "FROZEN"
Thawed FreezerState = "THAWED"
)
// Cgroup holds properties of a cgroup on Linux.
type Cgroup struct {
// Name specifies the name of the cgroup
Name string `json:"name,omitempty"`
// Parent specifies the name of parent of cgroup or slice
Parent string `json:"parent,omitempty"`
// Path specifies the path to cgroups that are created and/or joined by the container.
// The path is assumed to be relative to the host system cgroup mountpoint.
Path string `json:"path"`
// ScopePrefix describes prefix for the scope name
ScopePrefix string `json:"scope_prefix"`
// Resources contains various cgroups settings to apply
*Resources
// Systemd tells if systemd should be used to manage cgroups.
Systemd bool
// SystemdProps are any additional properties for systemd,
// derived from org.systemd.property.xxx annotations.
// Ignored unless systemd is used for managing cgroups.
SystemdProps []systemdDbus.Property `json:"-"`
// Rootless tells if rootless cgroups should be used.
Rootless bool
// The host UID that should own the cgroup, or nil to accept
// the default ownership. This should only be set when the
// cgroupfs is to be mounted read/write.
// Not all cgroup manager implementations support changing
// the ownership.
OwnerUID *int `json:"owner_uid,omitempty"`
}
type Resources struct {
// Devices is the set of access rules for devices in the container.
Devices []*devices.Rule `json:"devices"`
// Memory limit (in bytes)
Memory int64 `json:"memory"`
// Memory reservation or soft_limit (in bytes)
MemoryReservation int64 `json:"memory_reservation"`
// Total memory usage (memory + swap); set `-1` to enable unlimited swap
MemorySwap int64 `json:"memory_swap"`
// CPU shares (relative weight vs. other containers)
CpuShares uint64 `json:"cpu_shares"`
// CPU hardcap limit (in usecs). Allowed cpu time in a given period.
CpuQuota int64 `json:"cpu_quota"`
// CPU period to be used for hardcapping (in usecs). 0 to use system default.
CpuPeriod uint64 `json:"cpu_period"`
// How many time CPU will use in realtime scheduling (in usecs).
CpuRtRuntime int64 `json:"cpu_rt_quota"`
// CPU period to be used for realtime scheduling (in usecs).
CpuRtPeriod uint64 `json:"cpu_rt_period"`
// CPU to use
CpusetCpus string `json:"cpuset_cpus"`
// MEM to use
CpusetMems string `json:"cpuset_mems"`
// Process limit; set <= `0' to disable limit.
PidsLimit int64 `json:"pids_limit"`
// Specifies per cgroup weight, range is from 10 to 1000.
BlkioWeight uint16 `json:"blkio_weight"`
// Specifies tasks' weight in the given cgroup while competing with the cgroup's child cgroups, range is from 10 to 1000, cfq scheduler only
BlkioLeafWeight uint16 `json:"blkio_leaf_weight"`
// Weight per cgroup per device, can override BlkioWeight.
BlkioWeightDevice []*WeightDevice `json:"blkio_weight_device"`
// IO read rate limit per cgroup per device, bytes per second.
BlkioThrottleReadBpsDevice []*ThrottleDevice `json:"blkio_throttle_read_bps_device"`
// IO write rate limit per cgroup per device, bytes per second.
BlkioThrottleWriteBpsDevice []*ThrottleDevice `json:"blkio_throttle_write_bps_device"`
// IO read rate limit per cgroup per device, IO per second.
BlkioThrottleReadIOPSDevice []*ThrottleDevice `json:"blkio_throttle_read_iops_device"`
// IO write rate limit per cgroup per device, IO per second.
BlkioThrottleWriteIOPSDevice []*ThrottleDevice `json:"blkio_throttle_write_iops_device"`
// set the freeze value for the process
Freezer FreezerState `json:"freezer"`
// Hugetlb limit (in bytes)
HugetlbLimit []*HugepageLimit `json:"hugetlb_limit"`
// Whether to disable OOM Killer
OomKillDisable bool `json:"oom_kill_disable"`
// Tuning swappiness behaviour per cgroup
MemorySwappiness *uint64 `json:"memory_swappiness"`
// Set priority of network traffic for container
NetPrioIfpriomap []*IfPrioMap `json:"net_prio_ifpriomap"`
// Set class identifier for container's network packets
NetClsClassid uint32 `json:"net_cls_classid_u"`
// Rdma resource restriction configuration
Rdma map[string]LinuxRdma `json:"rdma"`
// Used on cgroups v2:
// CpuWeight sets a proportional bandwidth limit.
CpuWeight uint64 `json:"cpu_weight"`
// Unified is cgroupv2-only key-value map.
Unified map[string]string `json:"unified"`
// SkipDevices allows to skip configuring device permissions.
// Used by e.g. kubelet while creating a parent cgroup (kubepods)
// common for many containers, and by runc update.
//
// NOTE it is impossible to start a container which has this flag set.
SkipDevices bool `json:"-"`
// SkipFreezeOnSet is a flag for cgroup manager to skip the cgroup
// freeze when setting resources. Only applicable to systemd legacy
// (i.e. cgroup v1) manager (which uses freeze by default to avoid
// spurious permission errors caused by systemd inability to update
// device rules in a non-disruptive manner).
//
// If not set, a few methods (such as looking into cgroup's
// devices.list and querying the systemd unit properties) are used
// during Set() to figure out whether the freeze is required. Those
// methods may be relatively slow, thus this flag.
SkipFreezeOnSet bool `json:"-"`
}

View File

@ -1,9 +0,0 @@
//go:build !linux
// +build !linux
package configs
// Cgroup holds properties of a cgroup on Linux
// TODO Windows: This can ultimately be entirely factored out on Windows as
// cgroups are a Unix-specific construct.
type Cgroup struct{}

View File

@ -1,414 +0,0 @@
package configs
import (
"bytes"
"encoding/json"
"fmt"
"os/exec"
"time"
"github.com/sirupsen/logrus"
"github.com/opencontainers/runc/libcontainer/devices"
"github.com/opencontainers/runtime-spec/specs-go"
)
type Rlimit struct {
Type int `json:"type"`
Hard uint64 `json:"hard"`
Soft uint64 `json:"soft"`
}
// IDMap represents UID/GID Mappings for User Namespaces.
type IDMap struct {
ContainerID int `json:"container_id"`
HostID int `json:"host_id"`
Size int `json:"size"`
}
// Seccomp represents syscall restrictions
// By default, only the native architecture of the kernel is allowed to be used
// for syscalls. Additional architectures can be added by specifying them in
// Architectures.
type Seccomp struct {
DefaultAction Action `json:"default_action"`
Architectures []string `json:"architectures"`
Syscalls []*Syscall `json:"syscalls"`
DefaultErrnoRet *uint `json:"default_errno_ret"`
ListenerPath string `json:"listener_path,omitempty"`
ListenerMetadata string `json:"listener_metadata,omitempty"`
}
// Action is taken upon rule match in Seccomp
type Action int
const (
Kill Action = iota + 1
Errno
Trap
Allow
Trace
Log
Notify
KillThread
KillProcess
)
// Operator is a comparison operator to be used when matching syscall arguments in Seccomp
type Operator int
const (
EqualTo Operator = iota + 1
NotEqualTo
GreaterThan
GreaterThanOrEqualTo
LessThan
LessThanOrEqualTo
MaskEqualTo
)
// Arg is a rule to match a specific syscall argument in Seccomp
type Arg struct {
Index uint `json:"index"`
Value uint64 `json:"value"`
ValueTwo uint64 `json:"value_two"`
Op Operator `json:"op"`
}
// Syscall is a rule to match a syscall in Seccomp
type Syscall struct {
Name string `json:"name"`
Action Action `json:"action"`
ErrnoRet *uint `json:"errnoRet"`
Args []*Arg `json:"args"`
}
// TODO Windows. Many of these fields should be factored out into those parts
// which are common across platforms, and those which are platform specific.
// Config defines configuration options for executing a process inside a contained environment.
type Config struct {
// NoPivotRoot will use MS_MOVE and a chroot to jail the process into the container's rootfs
// This is a common option when the container is running in ramdisk
NoPivotRoot bool `json:"no_pivot_root"`
// ParentDeathSignal specifies the signal that is sent to the container's process in the case
// that the parent process dies.
ParentDeathSignal int `json:"parent_death_signal"`
// Path to a directory containing the container's root filesystem.
Rootfs string `json:"rootfs"`
// Umask is the umask to use inside of the container.
Umask *uint32 `json:"umask"`
// Readonlyfs will remount the container's rootfs as readonly where only externally mounted
// bind mounts are writtable.
Readonlyfs bool `json:"readonlyfs"`
// Specifies the mount propagation flags to be applied to /.
RootPropagation int `json:"rootPropagation"`
// Mounts specify additional source and destination paths that will be mounted inside the container's
// rootfs and mount namespace if specified
Mounts []*Mount `json:"mounts"`
// The device nodes that should be automatically created within the container upon container start. Note, make sure that the node is marked as allowed in the cgroup as well!
Devices []*devices.Device `json:"devices"`
MountLabel string `json:"mount_label"`
// Hostname optionally sets the container's hostname if provided
Hostname string `json:"hostname"`
// Namespaces specifies the container's namespaces that it should setup when cloning the init process
// If a namespace is not provided that namespace is shared from the container's parent process
Namespaces Namespaces `json:"namespaces"`
// Capabilities specify the capabilities to keep when executing the process inside the container
// All capabilities not specified will be dropped from the processes capability mask
Capabilities *Capabilities `json:"capabilities"`
// Networks specifies the container's network setup to be created
Networks []*Network `json:"networks"`
// Routes can be specified to create entries in the route table as the container is started
Routes []*Route `json:"routes"`
// Cgroups specifies specific cgroup settings for the various subsystems that the container is
// placed into to limit the resources the container has available
Cgroups *Cgroup `json:"cgroups"`
// AppArmorProfile specifies the profile to apply to the process running in the container and is
// change at the time the process is execed
AppArmorProfile string `json:"apparmor_profile,omitempty"`
// ProcessLabel specifies the label to apply to the process running in the container. It is
// commonly used by selinux
ProcessLabel string `json:"process_label,omitempty"`
// Rlimits specifies the resource limits, such as max open files, to set in the container
// If Rlimits are not set, the container will inherit rlimits from the parent process
Rlimits []Rlimit `json:"rlimits,omitempty"`
// OomScoreAdj specifies the adjustment to be made by the kernel when calculating oom scores
// for a process. Valid values are between the range [-1000, '1000'], where processes with
// higher scores are preferred for being killed. If it is unset then we don't touch the current
// value.
// More information about kernel oom score calculation here: https://lwn.net/Articles/317814/
OomScoreAdj *int `json:"oom_score_adj,omitempty"`
// UidMappings is an array of User ID mappings for User Namespaces
UidMappings []IDMap `json:"uid_mappings"`
// GidMappings is an array of Group ID mappings for User Namespaces
GidMappings []IDMap `json:"gid_mappings"`
// MaskPaths specifies paths within the container's rootfs to mask over with a bind
// mount pointing to /dev/null as to prevent reads of the file.
MaskPaths []string `json:"mask_paths"`
// ReadonlyPaths specifies paths within the container's rootfs to remount as read-only
// so that these files prevent any writes.
ReadonlyPaths []string `json:"readonly_paths"`
// Sysctl is a map of properties and their values. It is the equivalent of using
// sysctl -w my.property.name value in Linux.
Sysctl map[string]string `json:"sysctl"`
// Seccomp allows actions to be taken whenever a syscall is made within the container.
// A number of rules are given, each having an action to be taken if a syscall matches it.
// A default action to be taken if no rules match is also given.
Seccomp *Seccomp `json:"seccomp"`
// NoNewPrivileges controls whether processes in the container can gain additional privileges.
NoNewPrivileges bool `json:"no_new_privileges,omitempty"`
// Hooks are a collection of actions to perform at various container lifecycle events.
// CommandHooks are serialized to JSON, but other hooks are not.
Hooks Hooks
// Version is the version of opencontainer specification that is supported.
Version string `json:"version"`
// Labels are user defined metadata that is stored in the config and populated on the state
Labels []string `json:"labels"`
// NoNewKeyring will not allocated a new session keyring for the container. It will use the
// callers keyring in this case.
NoNewKeyring bool `json:"no_new_keyring"`
// IntelRdt specifies settings for Intel RDT group that the container is placed into
// to limit the resources (e.g., L3 cache, memory bandwidth) the container has available
IntelRdt *IntelRdt `json:"intel_rdt,omitempty"`
// RootlessEUID is set when the runc was launched with non-zero EUID.
// Note that RootlessEUID is set to false when launched with EUID=0 in userns.
// When RootlessEUID is set, runc creates a new userns for the container.
// (config.json needs to contain userns settings)
RootlessEUID bool `json:"rootless_euid,omitempty"`
// RootlessCgroups is set when unlikely to have the full access to cgroups.
// When RootlessCgroups is set, cgroups errors are ignored.
RootlessCgroups bool `json:"rootless_cgroups,omitempty"`
}
type (
HookName string
HookList []Hook
Hooks map[HookName]HookList
)
const (
// Prestart commands are executed after the container namespaces are created,
// but before the user supplied command is executed from init.
// Note: This hook is now deprecated
// Prestart commands are called in the Runtime namespace.
Prestart HookName = "prestart"
// CreateRuntime commands MUST be called as part of the create operation after
// the runtime environment has been created but before the pivot_root has been executed.
// CreateRuntime is called immediately after the deprecated Prestart hook.
// CreateRuntime commands are called in the Runtime Namespace.
CreateRuntime HookName = "createRuntime"
// CreateContainer commands MUST be called as part of the create operation after
// the runtime environment has been created but before the pivot_root has been executed.
// CreateContainer commands are called in the Container namespace.
CreateContainer HookName = "createContainer"
// StartContainer commands MUST be called as part of the start operation and before
// the container process is started.
// StartContainer commands are called in the Container namespace.
StartContainer HookName = "startContainer"
// Poststart commands are executed after the container init process starts.
// Poststart commands are called in the Runtime Namespace.
Poststart HookName = "poststart"
// Poststop commands are executed after the container init process exits.
// Poststop commands are called in the Runtime Namespace.
Poststop HookName = "poststop"
)
// KnownHookNames returns the known hook names.
// Used by `runc features`.
func KnownHookNames() []string {
return []string{
string(Prestart), // deprecated
string(CreateRuntime),
string(CreateContainer),
string(StartContainer),
string(Poststart),
string(Poststop),
}
}
type Capabilities struct {
// Bounding is the set of capabilities checked by the kernel.
Bounding []string
// Effective is the set of capabilities checked by the kernel.
Effective []string
// Inheritable is the capabilities preserved across execve.
Inheritable []string
// Permitted is the limiting superset for effective capabilities.
Permitted []string
// Ambient is the ambient set of capabilities that are kept.
Ambient []string
}
func (hooks HookList) RunHooks(state *specs.State) error {
for i, h := range hooks {
if err := h.Run(state); err != nil {
return fmt.Errorf("error running hook #%d: %w", i, err)
}
}
return nil
}
func (hooks *Hooks) UnmarshalJSON(b []byte) error {
var state map[HookName][]CommandHook
if err := json.Unmarshal(b, &state); err != nil {
return err
}
*hooks = Hooks{}
for n, commandHooks := range state {
if len(commandHooks) == 0 {
continue
}
(*hooks)[n] = HookList{}
for _, h := range commandHooks {
(*hooks)[n] = append((*hooks)[n], h)
}
}
return nil
}
func (hooks *Hooks) MarshalJSON() ([]byte, error) {
serialize := func(hooks []Hook) (serializableHooks []CommandHook) {
for _, hook := range hooks {
switch chook := hook.(type) {
case CommandHook:
serializableHooks = append(serializableHooks, chook)
default:
logrus.Warnf("cannot serialize hook of type %T, skipping", hook)
}
}
return serializableHooks
}
return json.Marshal(map[string]interface{}{
"prestart": serialize((*hooks)[Prestart]),
"createRuntime": serialize((*hooks)[CreateRuntime]),
"createContainer": serialize((*hooks)[CreateContainer]),
"startContainer": serialize((*hooks)[StartContainer]),
"poststart": serialize((*hooks)[Poststart]),
"poststop": serialize((*hooks)[Poststop]),
})
}
type Hook interface {
// Run executes the hook with the provided state.
Run(*specs.State) error
}
// NewFunctionHook will call the provided function when the hook is run.
func NewFunctionHook(f func(*specs.State) error) FuncHook {
return FuncHook{
run: f,
}
}
type FuncHook struct {
run func(*specs.State) error
}
func (f FuncHook) Run(s *specs.State) error {
return f.run(s)
}
type Command struct {
Path string `json:"path"`
Args []string `json:"args"`
Env []string `json:"env"`
Dir string `json:"dir"`
Timeout *time.Duration `json:"timeout"`
}
// NewCommandHook will execute the provided command when the hook is run.
func NewCommandHook(cmd Command) CommandHook {
return CommandHook{
Command: cmd,
}
}
type CommandHook struct {
Command
}
func (c Command) Run(s *specs.State) error {
b, err := json.Marshal(s)
if err != nil {
return err
}
var stdout, stderr bytes.Buffer
cmd := exec.Cmd{
Path: c.Path,
Args: c.Args,
Env: c.Env,
Stdin: bytes.NewReader(b),
Stdout: &stdout,
Stderr: &stderr,
}
if err := cmd.Start(); err != nil {
return err
}
errC := make(chan error, 1)
go func() {
err := cmd.Wait()
if err != nil {
err = fmt.Errorf("error running hook: %w, stdout: %s, stderr: %s", err, stdout.String(), stderr.String())
}
errC <- err
}()
var timerCh <-chan time.Time
if c.Timeout != nil {
timer := time.NewTimer(*c.Timeout)
defer timer.Stop()
timerCh = timer.C
}
select {
case err := <-errC:
return err
case <-timerCh:
_ = cmd.Process.Kill()
<-errC
return fmt.Errorf("hook ran past specified timeout of %.1fs", c.Timeout.Seconds())
}
}

View File

@ -1,68 +0,0 @@
package configs
import "errors"
var (
errNoUIDMap = errors.New("User namespaces enabled, but no uid mappings found.")
errNoUserMap = errors.New("User namespaces enabled, but no user mapping found.")
errNoGIDMap = errors.New("User namespaces enabled, but no gid mappings found.")
errNoGroupMap = errors.New("User namespaces enabled, but no group mapping found.")
)
// HostUID gets the translated uid for the process on host which could be
// different when user namespaces are enabled.
func (c Config) HostUID(containerId int) (int, error) {
if c.Namespaces.Contains(NEWUSER) {
if c.UidMappings == nil {
return -1, errNoUIDMap
}
id, found := c.hostIDFromMapping(containerId, c.UidMappings)
if !found {
return -1, errNoUserMap
}
return id, nil
}
// Return unchanged id.
return containerId, nil
}
// HostRootUID gets the root uid for the process on host which could be non-zero
// when user namespaces are enabled.
func (c Config) HostRootUID() (int, error) {
return c.HostUID(0)
}
// HostGID gets the translated gid for the process on host which could be
// different when user namespaces are enabled.
func (c Config) HostGID(containerId int) (int, error) {
if c.Namespaces.Contains(NEWUSER) {
if c.GidMappings == nil {
return -1, errNoGIDMap
}
id, found := c.hostIDFromMapping(containerId, c.GidMappings)
if !found {
return -1, errNoGroupMap
}
return id, nil
}
// Return unchanged id.
return containerId, nil
}
// HostRootGID gets the root gid for the process on host which could be non-zero
// when user namespaces are enabled.
func (c Config) HostRootGID() (int, error) {
return c.HostGID(0)
}
// Utility function that gets a host ID for a container ID from user namespace map
// if that ID is present in the map.
func (c Config) hostIDFromMapping(containerID int, uMap []IDMap) (int, bool) {
for _, m := range uMap {
if (containerID >= m.ContainerID) && (containerID <= (m.ContainerID + m.Size - 1)) {
hostID := m.HostID + (containerID - m.ContainerID)
return hostID, true
}
}
return -1, false
}

View File

@ -1,10 +0,0 @@
//go:build gofuzz
// +build gofuzz
package configs
func FuzzUnmarshalJSON(data []byte) int {
hooks := Hooks{}
_ = hooks.UnmarshalJSON(data)
return 1
}

View File

@ -1,9 +0,0 @@
package configs
type HugepageLimit struct {
// which type of hugepage to limit.
Pagesize string `json:"page_size"`
// usage limit for hugepage.
Limit uint64 `json:"limit"`
}

View File

@ -1,16 +0,0 @@
package configs
type IntelRdt struct {
// The identity for RDT Class of Service
ClosID string `json:"closID,omitempty"`
// The schema for L3 cache id and capacity bitmask (CBM)
// Format: "L3:<cache_id0>=<cbm0>;<cache_id1>=<cbm1>;..."
L3CacheSchema string `json:"l3_cache_schema,omitempty"`
// The schema of memory bandwidth per L3 cache id
// Format: "MB:<cache_id0>=bandwidth0;<cache_id1>=bandwidth1;..."
// The unit of memory bandwidth is specified in "percentages" by
// default, and in "MBps" if MBA Software Controller is enabled.
MemBwSchema string `json:"memBwSchema,omitempty"`
}

View File

@ -1,14 +0,0 @@
package configs
import (
"fmt"
)
type IfPrioMap struct {
Interface string `json:"interface"`
Priority int64 `json:"priority"`
}
func (i *IfPrioMap) CgroupString() string {
return fmt.Sprintf("%s %d", i.Interface, i.Priority)
}

View File

@ -1,48 +0,0 @@
package configs
import "golang.org/x/sys/unix"
const (
// EXT_COPYUP is a directive to copy up the contents of a directory when
// a tmpfs is mounted over it.
EXT_COPYUP = 1 << iota //nolint:golint // ignore "don't use ALL_CAPS" warning
)
type Mount struct {
// Source path for the mount.
Source string `json:"source"`
// Destination path for the mount inside the container.
Destination string `json:"destination"`
// Device the mount is for.
Device string `json:"device"`
// Mount flags.
Flags int `json:"flags"`
// Propagation Flags
PropagationFlags []int `json:"propagation_flags"`
// Mount data applied to the mount.
Data string `json:"data"`
// Relabel source if set, "z" indicates shared, "Z" indicates unshared.
Relabel string `json:"relabel"`
// RecAttr represents mount properties to be applied recursively (AT_RECURSIVE), see mount_setattr(2).
RecAttr *unix.MountAttr `json:"rec_attr"`
// Extensions are additional flags that are specific to runc.
Extensions int `json:"extensions"`
// Optional Command to be run before Source is mounted.
PremountCmds []Command `json:"premount_cmds"`
// Optional Command to be run after Source is mounted.
PostmountCmds []Command `json:"postmount_cmds"`
}
func (m *Mount) IsBind() bool {
return m.Flags&unix.MS_BIND != 0
}

View File

@ -1,5 +0,0 @@
package configs
type NamespaceType string
type Namespaces []Namespace

View File

@ -1,126 +0,0 @@
package configs
import (
"fmt"
"os"
"sync"
)
const (
NEWNET NamespaceType = "NEWNET"
NEWPID NamespaceType = "NEWPID"
NEWNS NamespaceType = "NEWNS"
NEWUTS NamespaceType = "NEWUTS"
NEWIPC NamespaceType = "NEWIPC"
NEWUSER NamespaceType = "NEWUSER"
NEWCGROUP NamespaceType = "NEWCGROUP"
)
var (
nsLock sync.Mutex
supportedNamespaces = make(map[NamespaceType]bool)
)
// NsName converts the namespace type to its filename
func NsName(ns NamespaceType) string {
switch ns {
case NEWNET:
return "net"
case NEWNS:
return "mnt"
case NEWPID:
return "pid"
case NEWIPC:
return "ipc"
case NEWUSER:
return "user"
case NEWUTS:
return "uts"
case NEWCGROUP:
return "cgroup"
}
return ""
}
// IsNamespaceSupported returns whether a namespace is available or
// not
func IsNamespaceSupported(ns NamespaceType) bool {
nsLock.Lock()
defer nsLock.Unlock()
supported, ok := supportedNamespaces[ns]
if ok {
return supported
}
nsFile := NsName(ns)
// if the namespace type is unknown, just return false
if nsFile == "" {
return false
}
_, err := os.Stat("/proc/self/ns/" + nsFile)
// a namespace is supported if it exists and we have permissions to read it
supported = err == nil
supportedNamespaces[ns] = supported
return supported
}
func NamespaceTypes() []NamespaceType {
return []NamespaceType{
NEWUSER, // Keep user NS always first, don't move it.
NEWIPC,
NEWUTS,
NEWNET,
NEWPID,
NEWNS,
NEWCGROUP,
}
}
// Namespace defines configuration for each namespace. It specifies an
// alternate path that is able to be joined via setns.
type Namespace struct {
Type NamespaceType `json:"type"`
Path string `json:"path"`
}
func (n *Namespace) GetPath(pid int) string {
return fmt.Sprintf("/proc/%d/ns/%s", pid, NsName(n.Type))
}
func (n *Namespaces) Remove(t NamespaceType) bool {
i := n.index(t)
if i == -1 {
return false
}
*n = append((*n)[:i], (*n)[i+1:]...)
return true
}
func (n *Namespaces) Add(t NamespaceType, path string) {
i := n.index(t)
if i == -1 {
*n = append(*n, Namespace{Type: t, Path: path})
return
}
(*n)[i].Path = path
}
func (n *Namespaces) index(t NamespaceType) int {
for i, ns := range *n {
if ns.Type == t {
return i
}
}
return -1
}
func (n *Namespaces) Contains(t NamespaceType) bool {
return n.index(t) != -1
}
func (n *Namespaces) PathOf(t NamespaceType) string {
i := n.index(t)
if i == -1 {
return ""
}
return (*n)[i].Path
}

View File

@ -1,33 +0,0 @@
//go:build linux
// +build linux
package configs
import "golang.org/x/sys/unix"
func (n *Namespace) Syscall() int {
return namespaceInfo[n.Type]
}
var namespaceInfo = map[NamespaceType]int{
NEWNET: unix.CLONE_NEWNET,
NEWNS: unix.CLONE_NEWNS,
NEWUSER: unix.CLONE_NEWUSER,
NEWIPC: unix.CLONE_NEWIPC,
NEWUTS: unix.CLONE_NEWUTS,
NEWPID: unix.CLONE_NEWPID,
NEWCGROUP: unix.CLONE_NEWCGROUP,
}
// CloneFlags parses the container's Namespaces options to set the correct
// flags on clone, unshare. This function returns flags only for new namespaces.
func (n *Namespaces) CloneFlags() uintptr {
var flag int
for _, v := range *n {
if v.Path != "" {
continue
}
flag |= namespaceInfo[v.Type]
}
return uintptr(flag)
}

View File

@ -1,14 +0,0 @@
//go:build !linux && !windows
// +build !linux,!windows
package configs
func (n *Namespace) Syscall() int {
panic("No namespace syscall support")
}
// CloneFlags parses the container's Namespaces options to set the correct
// flags on clone, unshare. This function returns flags only for new namespaces.
func (n *Namespaces) CloneFlags() uintptr {
panic("No namespace syscall support")
}

View File

@ -1,8 +0,0 @@
//go:build !linux
// +build !linux
package configs
// Namespace defines configuration for each namespace. It specifies an
// alternate path that is able to be joined via setns.
type Namespace struct{}

View File

@ -1,75 +0,0 @@
package configs
// Network defines configuration for a container's networking stack
//
// The network configuration can be omitted from a container causing the
// container to be setup with the host's networking stack
type Network struct {
// Type sets the networks type, commonly veth and loopback
Type string `json:"type"`
// Name of the network interface
Name string `json:"name"`
// The bridge to use.
Bridge string `json:"bridge"`
// MacAddress contains the MAC address to set on the network interface
MacAddress string `json:"mac_address"`
// Address contains the IPv4 and mask to set on the network interface
Address string `json:"address"`
// Gateway sets the gateway address that is used as the default for the interface
Gateway string `json:"gateway"`
// IPv6Address contains the IPv6 and mask to set on the network interface
IPv6Address string `json:"ipv6_address"`
// IPv6Gateway sets the ipv6 gateway address that is used as the default for the interface
IPv6Gateway string `json:"ipv6_gateway"`
// Mtu sets the mtu value for the interface and will be mirrored on both the host and
// container's interfaces if a pair is created, specifically in the case of type veth
// Note: This does not apply to loopback interfaces.
Mtu int `json:"mtu"`
// TxQueueLen sets the tx_queuelen value for the interface and will be mirrored on both the host and
// container's interfaces if a pair is created, specifically in the case of type veth
// Note: This does not apply to loopback interfaces.
TxQueueLen int `json:"txqueuelen"`
// HostInterfaceName is a unique name of a veth pair that resides on in the host interface of the
// container.
HostInterfaceName string `json:"host_interface_name"`
// HairpinMode specifies if hairpin NAT should be enabled on the virtual interface
// bridge port in the case of type veth
// Note: This is unsupported on some systems.
// Note: This does not apply to loopback interfaces.
HairpinMode bool `json:"hairpin_mode"`
}
// Route defines a routing table entry.
//
// Routes can be specified to create entries in the routing table as the container
// is started.
//
// All of destination, source, and gateway should be either IPv4 or IPv6.
// One of the three options must be present, and omitted entries will use their
// IP family default for the route table. For IPv4 for example, setting the
// gateway to 1.2.3.4 and the interface to eth0 will set up a standard
// destination of 0.0.0.0(or *) when viewed in the route table.
type Route struct {
// Destination specifies the destination IP address and mask in the CIDR form.
Destination string `json:"destination"`
// Source specifies the source IP address and mask in the CIDR form.
Source string `json:"source"`
// Gateway specifies the gateway IP address.
Gateway string `json:"gateway"`
// InterfaceName specifies the device to set this route up for, for example eth0.
InterfaceName string `json:"interface_name"`
}

View File

@ -1,9 +0,0 @@
package configs
// LinuxRdma for Linux cgroup 'rdma' resource management (Linux 4.11)
type LinuxRdma struct {
// Maximum number of HCA handles that can be opened. Default is "no limit".
HcaHandles *uint32 `json:"hca_handles,omitempty"`
// Maximum number of HCA objects that can be created. Default is "no limit".
HcaObjects *uint32 `json:"hca_objects,omitempty"`
}

View File

@ -1,5 +0,0 @@
package userns
// RunningInUserNS detects whether we are currently running in a user namespace.
// Originally copied from github.com/lxc/lxd/shared/util.go
var RunningInUserNS = runningInUserNS

View File

@ -1,16 +0,0 @@
//go:build gofuzz
// +build gofuzz
package userns
import (
"strings"
"github.com/opencontainers/runc/libcontainer/user"
)
func FuzzUIDMap(data []byte) int {
uidmap, _ := user.ParseIDMap(strings.NewReader(string(data)))
_ = uidMapInUserNS(uidmap)
return 1
}

View File

@ -1,37 +0,0 @@
package userns
import (
"sync"
"github.com/opencontainers/runc/libcontainer/user"
)
var (
inUserNS bool
nsOnce sync.Once
)
// runningInUserNS detects whether we are currently running in a user namespace.
// Originally copied from github.com/lxc/lxd/shared/util.go
func runningInUserNS() bool {
nsOnce.Do(func() {
uidmap, err := user.CurrentProcessUIDMap()
if err != nil {
// This kernel-provided file only exists if user namespaces are supported
return
}
inUserNS = uidMapInUserNS(uidmap)
})
return inUserNS
}
func uidMapInUserNS(uidmap []user.IDMap) bool {
/*
* We assume we are in the initial user namespace if we have a full
* range - 4294967295 uids starting at uid 0.
*/
if len(uidmap) == 1 && uidmap[0].ID == 0 && uidmap[0].ParentID == 0 && uidmap[0].Count == 4294967295 {
return false
}
return true
}

View File

@ -1,18 +0,0 @@
//go:build !linux
// +build !linux
package userns
import "github.com/opencontainers/runc/libcontainer/user"
// runningInUserNS is a stub for non-Linux systems
// Always returns false
func runningInUserNS() bool {
return false
}
// uidMapInUserNS is a stub for non-Linux systems
// Always returns false
func uidMapInUserNS(uidmap []user.IDMap) bool {
return false
}

View File

@ -1,96 +0,0 @@
package utils
/*
* Copyright 2016, 2017 SUSE LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import (
"fmt"
"os"
"golang.org/x/sys/unix"
)
// MaxSendfdLen is the maximum length of the name of a file descriptor being
// sent using SendFd. The name of the file handle returned by RecvFd will never
// be larger than this value.
const MaxNameLen = 4096
// oobSpace is the size of the oob slice required to store a single FD. Note
// that unix.UnixRights appears to make the assumption that fd is always int32,
// so sizeof(fd) = 4.
var oobSpace = unix.CmsgSpace(4)
// RecvFd waits for a file descriptor to be sent over the given AF_UNIX
// socket. The file name of the remote file descriptor will be recreated
// locally (it is sent as non-auxiliary data in the same payload).
func RecvFd(socket *os.File) (*os.File, error) {
// For some reason, unix.Recvmsg uses the length rather than the capacity
// when passing the msg_controllen and other attributes to recvmsg. So we
// have to actually set the length.
name := make([]byte, MaxNameLen)
oob := make([]byte, oobSpace)
sockfd := socket.Fd()
n, oobn, _, _, err := unix.Recvmsg(int(sockfd), name, oob, 0)
if err != nil {
return nil, err
}
if n >= MaxNameLen || oobn != oobSpace {
return nil, fmt.Errorf("recvfd: incorrect number of bytes read (n=%d oobn=%d)", n, oobn)
}
// Truncate.
name = name[:n]
oob = oob[:oobn]
scms, err := unix.ParseSocketControlMessage(oob)
if err != nil {
return nil, err
}
if len(scms) != 1 {
return nil, fmt.Errorf("recvfd: number of SCMs is not 1: %d", len(scms))
}
scm := scms[0]
fds, err := unix.ParseUnixRights(&scm)
if err != nil {
return nil, err
}
if len(fds) != 1 {
return nil, fmt.Errorf("recvfd: number of fds is not 1: %d", len(fds))
}
fd := uintptr(fds[0])
return os.NewFile(fd, string(name)), nil
}
// SendFd sends a file descriptor over the given AF_UNIX socket. In
// addition, the file.Name() of the given file will also be sent as
// non-auxiliary data in the same payload (allowing to send contextual
// information for a file descriptor).
func SendFd(socket *os.File, name string, fd uintptr) error {
if len(name) >= MaxNameLen {
return fmt.Errorf("sendfd: filename too long: %s", name)
}
return SendFds(socket, []byte(name), int(fd))
}
// SendFds sends a list of files descriptor and msg over the given AF_UNIX socket.
func SendFds(socket *os.File, msg []byte, fds ...int) error {
oob := unix.UnixRights(fds...)
return unix.Sendmsg(int(socket.Fd()), msg, oob, nil, 0)
}

View File

@ -1,167 +0,0 @@
package utils
import (
"encoding/binary"
"encoding/json"
"fmt"
"io"
"os"
"path/filepath"
"strconv"
"strings"
"unsafe"
securejoin "github.com/cyphar/filepath-securejoin"
"golang.org/x/sys/unix"
)
const (
exitSignalOffset = 128
)
// NativeEndian is the native byte order of the host system.
var NativeEndian binary.ByteOrder
func init() {
// Copied from <golang.org/x/net/internal/socket/sys.go>.
i := uint32(1)
b := (*[4]byte)(unsafe.Pointer(&i))
if b[0] == 1 {
NativeEndian = binary.LittleEndian
} else {
NativeEndian = binary.BigEndian
}
}
// ExitStatus returns the correct exit status for a process based on if it
// was signaled or exited cleanly
func ExitStatus(status unix.WaitStatus) int {
if status.Signaled() {
return exitSignalOffset + int(status.Signal())
}
return status.ExitStatus()
}
// WriteJSON writes the provided struct v to w using standard json marshaling
func WriteJSON(w io.Writer, v interface{}) error {
data, err := json.Marshal(v)
if err != nil {
return err
}
_, err = w.Write(data)
return err
}
// CleanPath makes a path safe for use with filepath.Join. This is done by not
// only cleaning the path, but also (if the path is relative) adding a leading
// '/' and cleaning it (then removing the leading '/'). This ensures that a
// path resulting from prepending another path will always resolve to lexically
// be a subdirectory of the prefixed path. This is all done lexically, so paths
// that include symlinks won't be safe as a result of using CleanPath.
func CleanPath(path string) string {
// Deal with empty strings nicely.
if path == "" {
return ""
}
// Ensure that all paths are cleaned (especially problematic ones like
// "/../../../../../" which can cause lots of issues).
path = filepath.Clean(path)
// If the path isn't absolute, we need to do more processing to fix paths
// such as "../../../../<etc>/some/path". We also shouldn't convert absolute
// paths to relative ones.
if !filepath.IsAbs(path) {
path = filepath.Clean(string(os.PathSeparator) + path)
// This can't fail, as (by definition) all paths are relative to root.
path, _ = filepath.Rel(string(os.PathSeparator), path)
}
// Clean the path again for good measure.
return filepath.Clean(path)
}
// stripRoot returns the passed path, stripping the root path if it was
// (lexicially) inside it. Note that both passed paths will always be treated
// as absolute, and the returned path will also always be absolute. In
// addition, the paths are cleaned before stripping the root.
func stripRoot(root, path string) string {
// Make the paths clean and absolute.
root, path = CleanPath("/"+root), CleanPath("/"+path)
switch {
case path == root:
path = "/"
case root == "/":
// do nothing
case strings.HasPrefix(path, root+"/"):
path = strings.TrimPrefix(path, root+"/")
}
return CleanPath("/" + path)
}
// WithProcfd runs the passed closure with a procfd path (/proc/self/fd/...)
// corresponding to the unsafePath resolved within the root. Before passing the
// fd, this path is verified to have been inside the root -- so operating on it
// through the passed fdpath should be safe. Do not access this path through
// the original path strings, and do not attempt to use the pathname outside of
// the passed closure (the file handle will be freed once the closure returns).
func WithProcfd(root, unsafePath string, fn func(procfd string) error) error {
// Remove the root then forcefully resolve inside the root.
unsafePath = stripRoot(root, unsafePath)
path, err := securejoin.SecureJoin(root, unsafePath)
if err != nil {
return fmt.Errorf("resolving path inside rootfs failed: %w", err)
}
// Open the target path.
fh, err := os.OpenFile(path, unix.O_PATH|unix.O_CLOEXEC, 0)
if err != nil {
return fmt.Errorf("open o_path procfd: %w", err)
}
defer fh.Close()
// Double-check the path is the one we expected.
procfd := "/proc/self/fd/" + strconv.Itoa(int(fh.Fd()))
if realpath, err := os.Readlink(procfd); err != nil {
return fmt.Errorf("procfd verification failed: %w", err)
} else if realpath != path {
return fmt.Errorf("possibly malicious path detected -- refusing to operate on %s", realpath)
}
// Run the closure.
return fn(procfd)
}
// SearchLabels searches a list of key-value pairs for the provided key and
// returns the corresponding value. The pairs must be separated with '='.
func SearchLabels(labels []string, query string) string {
for _, l := range labels {
parts := strings.SplitN(l, "=", 2)
if len(parts) < 2 {
continue
}
if parts[0] == query {
return parts[1]
}
}
return ""
}
// Annotations returns the bundle path and user defined annotations from the
// libcontainer state. We need to remove the bundle because that is a label
// added by libcontainer.
func Annotations(labels []string) (bundle string, userAnnotations map[string]string) {
userAnnotations = make(map[string]string)
for _, l := range labels {
parts := strings.SplitN(l, "=", 2)
if len(parts) < 2 {
continue
}
if parts[0] == "bundle" {
bundle = parts[1]
} else {
userAnnotations[parts[0]] = parts[1]
}
}
return
}

View File

@ -1,69 +0,0 @@
//go:build !windows
// +build !windows
package utils
import (
"fmt"
"os"
"strconv"
"golang.org/x/sys/unix"
)
// EnsureProcHandle returns whether or not the given file handle is on procfs.
func EnsureProcHandle(fh *os.File) error {
var buf unix.Statfs_t
if err := unix.Fstatfs(int(fh.Fd()), &buf); err != nil {
return fmt.Errorf("ensure %s is on procfs: %w", fh.Name(), err)
}
if buf.Type != unix.PROC_SUPER_MAGIC {
return fmt.Errorf("%s is not on procfs", fh.Name())
}
return nil
}
// CloseExecFrom applies O_CLOEXEC to all file descriptors currently open for
// the process (except for those below the given fd value).
func CloseExecFrom(minFd int) error {
fdDir, err := os.Open("/proc/self/fd")
if err != nil {
return err
}
defer fdDir.Close()
if err := EnsureProcHandle(fdDir); err != nil {
return err
}
fdList, err := fdDir.Readdirnames(-1)
if err != nil {
return err
}
for _, fdStr := range fdList {
fd, err := strconv.Atoi(fdStr)
// Ignore non-numeric file names.
if err != nil {
continue
}
// Ignore descriptors lower than our specified minimum.
if fd < minFd {
continue
}
// Intentionally ignore errors from unix.CloseOnExec -- the cases where
// this might fail are basically file descriptors that have already
// been closed (including and especially the one that was created when
// os.ReadDir did the "opendir" syscall).
unix.CloseOnExec(fd)
}
return nil
}
// NewSockPair returns a new unix socket pair
func NewSockPair(name string) (parent *os.File, child *os.File, err error) {
fds, err := unix.Socketpair(unix.AF_LOCAL, unix.SOCK_STREAM|unix.SOCK_CLOEXEC, 0)
if err != nil {
return nil, nil, err
}
return os.NewFile(uintptr(fds[1]), name+"-p"), os.NewFile(uintptr(fds[0]), name+"-c"), nil
}

14
vendor/modules.txt vendored
View File

@ -190,9 +190,6 @@ github.com/coreos/go-systemd/v22/dbus
# github.com/cpuguy83/go-md2man/v2 v2.0.2
## explicit; go 1.11
github.com/cpuguy83/go-md2man/v2/md2man
# github.com/cyphar/filepath-securejoin v0.2.3
## explicit; go 1.13
github.com/cyphar/filepath-securejoin
# github.com/davecgh/go-spew v1.1.1
## explicit
github.com/davecgh/go-spew/spew
@ -342,19 +339,8 @@ github.com/opencontainers/image-spec/specs-go
github.com/opencontainers/image-spec/specs-go/v1
# github.com/opencontainers/runc v1.1.9
## explicit; go 1.17
github.com/opencontainers/runc/libcontainer/cgroups
github.com/opencontainers/runc/libcontainer/cgroups/devices
github.com/opencontainers/runc/libcontainer/cgroups/ebpf
github.com/opencontainers/runc/libcontainer/cgroups/ebpf/devicefilter
github.com/opencontainers/runc/libcontainer/cgroups/fs
github.com/opencontainers/runc/libcontainer/cgroups/fs2
github.com/opencontainers/runc/libcontainer/cgroups/fscommon
github.com/opencontainers/runc/libcontainer/cgroups/systemd
github.com/opencontainers/runc/libcontainer/configs
github.com/opencontainers/runc/libcontainer/devices
github.com/opencontainers/runc/libcontainer/user
github.com/opencontainers/runc/libcontainer/userns
github.com/opencontainers/runc/libcontainer/utils
# github.com/opencontainers/runtime-spec v1.1.0
## explicit
github.com/opencontainers/runtime-spec/specs-go