diff --git a/go.mod b/go.mod index e6fd044b2..1e1fef1c7 100644 --- a/go.mod +++ b/go.mod @@ -88,7 +88,6 @@ require ( github.com/containerd/typeurl v1.0.2 // indirect github.com/containers/ocicrypt v1.1.6 // indirect github.com/cpuguy83/go-md2man/v2 v2.0.2 // indirect - github.com/cyphar/filepath-securejoin v0.2.3 // indirect github.com/go-logr/logr v1.2.4 // indirect github.com/go-logr/stdr v1.2.2 // indirect github.com/godbus/dbus/v5 v5.1.0 // indirect diff --git a/go.sum b/go.sum index f9623d735..0102c0fc9 100644 --- a/go.sum +++ b/go.sum @@ -342,7 +342,6 @@ github.com/creack/pty v1.1.7/go.mod h1:lj5s0c3V2DBrqTV7llrYr5NG6My20zk30Fl46Y7Do github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= github.com/creack/pty v1.1.11/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= github.com/cyphar/filepath-securejoin v0.2.2/go.mod h1:FpkQEhXnPnOthhzymB7CGsFk2G9VLXONKD9G7QGMM+4= -github.com/cyphar/filepath-securejoin v0.2.3 h1:YX6ebbZCZP7VkM3scTTokDgBL2TY741X51MTk3ycuNI= github.com/cyphar/filepath-securejoin v0.2.3/go.mod h1:aPGpWjXOXUn2NCNjFvBE6aRxGGx79pTxQpKOJNYHHl4= github.com/d2g/dhcp4 v0.0.0-20170904100407-a1d1b6c41b1c/go.mod h1:Ct2BUK8SB0YC1SMSibvLzxjeJLnrYEVLULFNiHY9YfQ= github.com/d2g/dhcp4client v1.0.0/go.mod h1:j0hNfjhrt2SxUOw55nL0ATM/z4Yt3t2Kd1mW34z5W5s= @@ -885,7 +884,6 @@ github.com/sclevine/spec v1.2.0/go.mod h1:W4J29eT/Kzv7/b9IWLB055Z+qvVC9vt0Arko24 github.com/sean-/seed v0.0.0-20170313163322-e2103e2c3529/go.mod h1:DxrIzT+xaE7yg65j358z/aeFdxmN0P9QXhEzd20vsDc= github.com/seccomp/libseccomp-golang v0.9.1/go.mod h1:GbW5+tmTXfcxTToHLXlScSlAvWlF4P2Ca7zGrPiEpWo= github.com/seccomp/libseccomp-golang v0.9.2-0.20210429002308-3879420cc921/go.mod h1:JA8cRccbGaA1s33RQf7Y1+q9gHmZX1yB/z9WDN1C6fg= -github.com/seccomp/libseccomp-golang v0.9.2-0.20220502022130-f33da4d89646 h1:RpforrEYXWkmGwJHIGnLZ3tTWStkjVVstwzNGqxX2Ds= github.com/shurcooL/sanitized_anchor_name v1.0.0/go.mod h1:1NzhyTcUVG4SuEtjjoZeVRXNmyL/1OwPU0+IJeTBvfc= github.com/sirupsen/logrus v1.0.4-0.20170822132746-89742aefa4b2/go.mod h1:pMByvHTf9Beacp5x1UXfOR9xyW/9antXMhjMPG0dEzc= github.com/sirupsen/logrus v1.0.6/go.mod h1:pMByvHTf9Beacp5x1UXfOR9xyW/9antXMhjMPG0dEzc= diff --git a/pkg/cri/sbserver/runtime_config_linux.go b/pkg/cri/sbserver/runtime_config_linux.go index 0ee5345b9..502dfe90c 100644 --- a/pkg/cri/sbserver/runtime_config_linux.go +++ b/pkg/cri/sbserver/runtime_config_linux.go @@ -21,8 +21,8 @@ import ( "sort" "github.com/containerd/containerd/log" + "github.com/containerd/containerd/pkg/systemd" runcoptions "github.com/containerd/containerd/runtime/v2/runc/options" - "github.com/opencontainers/runc/libcontainer/cgroups/systemd" runtime "k8s.io/cri-api/pkg/apis/runtime/v1" ) diff --git a/pkg/cri/sbserver/runtime_config_linux_test.go b/pkg/cri/sbserver/runtime_config_linux_test.go index a7a201172..de53523b9 100644 --- a/pkg/cri/sbserver/runtime_config_linux_test.go +++ b/pkg/cri/sbserver/runtime_config_linux_test.go @@ -21,8 +21,8 @@ import ( "testing" criconfig "github.com/containerd/containerd/pkg/cri/config" + "github.com/containerd/containerd/pkg/systemd" "github.com/containerd/containerd/plugin" - "github.com/opencontainers/runc/libcontainer/cgroups/systemd" "github.com/stretchr/testify/assert" runtime "k8s.io/cri-api/pkg/apis/runtime/v1" ) diff --git a/pkg/cri/server/runtime_config_linux.go b/pkg/cri/server/runtime_config_linux.go index f83ca3ba7..e58ef5f58 100644 --- a/pkg/cri/server/runtime_config_linux.go +++ b/pkg/cri/server/runtime_config_linux.go @@ -21,8 +21,8 @@ import ( "sort" "github.com/containerd/containerd/log" + "github.com/containerd/containerd/pkg/systemd" runcoptions "github.com/containerd/containerd/runtime/v2/runc/options" - "github.com/opencontainers/runc/libcontainer/cgroups/systemd" runtime "k8s.io/cri-api/pkg/apis/runtime/v1" ) diff --git a/pkg/cri/server/runtime_config_linux_test.go b/pkg/cri/server/runtime_config_linux_test.go index 143e86744..035f88593 100644 --- a/pkg/cri/server/runtime_config_linux_test.go +++ b/pkg/cri/server/runtime_config_linux_test.go @@ -21,8 +21,8 @@ import ( "testing" criconfig "github.com/containerd/containerd/pkg/cri/config" + "github.com/containerd/containerd/pkg/systemd" "github.com/containerd/containerd/plugin" - "github.com/opencontainers/runc/libcontainer/cgroups/systemd" "github.com/stretchr/testify/assert" runtime "k8s.io/cri-api/pkg/apis/runtime/v1" ) diff --git a/pkg/systemd/util.go b/pkg/systemd/util.go new file mode 100644 index 000000000..1bec94baf --- /dev/null +++ b/pkg/systemd/util.go @@ -0,0 +1,58 @@ +//go:build linux + +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +// Copyright 2015 CoreOS, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package systemd + +import ( + "os" + "sync" +) + +var ( + runningSystemd bool + detectSystemd sync.Once +) + +// IsRunningSystemd checks whether the host was booted with systemd as its init +// system. This functions similarly to systemd's `sd_booted(3)`: internally, it +// checks whether /run/systemd/system/ exists and is a directory. +// https://github.com/coreos/go-systemd/blob/d843340ab4bd3815fda02e648f9b09ae2dc722a7/util/util.go#L68-L78 +func IsRunningSystemd() bool { + detectSystemd.Do(func() { + fi, err := os.Lstat("/run/systemd/system") + if err != nil { + return + } + runningSystemd = fi.IsDir() + }) + return runningSystemd +} diff --git a/vendor/github.com/cyphar/filepath-securejoin/.travis.yml b/vendor/github.com/cyphar/filepath-securejoin/.travis.yml deleted file mode 100644 index b94ff8cf9..000000000 --- a/vendor/github.com/cyphar/filepath-securejoin/.travis.yml +++ /dev/null @@ -1,21 +0,0 @@ -# Copyright (C) 2017 SUSE LLC. All rights reserved. -# Use of this source code is governed by a BSD-style -# license that can be found in the LICENSE file. - -language: go -go: - - 1.13.x - - 1.16.x - - tip -arch: - - AMD64 - - ppc64le -os: - - linux - - osx - -script: - - go test -cover -v ./... - -notifications: - email: false diff --git a/vendor/github.com/cyphar/filepath-securejoin/LICENSE b/vendor/github.com/cyphar/filepath-securejoin/LICENSE deleted file mode 100644 index bec842f29..000000000 --- a/vendor/github.com/cyphar/filepath-securejoin/LICENSE +++ /dev/null @@ -1,28 +0,0 @@ -Copyright (C) 2014-2015 Docker Inc & Go Authors. All rights reserved. -Copyright (C) 2017 SUSE LLC. All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: - - * Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above -copyright notice, this list of conditions and the following disclaimer -in the documentation and/or other materials provided with the -distribution. - * Neither the name of Google Inc. nor the names of its -contributors may be used to endorse or promote products derived from -this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/vendor/github.com/cyphar/filepath-securejoin/README.md b/vendor/github.com/cyphar/filepath-securejoin/README.md deleted file mode 100644 index 3624617c8..000000000 --- a/vendor/github.com/cyphar/filepath-securejoin/README.md +++ /dev/null @@ -1,79 +0,0 @@ -## `filepath-securejoin` ## - -[![Build Status](https://travis-ci.org/cyphar/filepath-securejoin.svg?branch=master)](https://travis-ci.org/cyphar/filepath-securejoin) - -An implementation of `SecureJoin`, a [candidate for inclusion in the Go -standard library][go#20126]. The purpose of this function is to be a "secure" -alternative to `filepath.Join`, and in particular it provides certain -guarantees that are not provided by `filepath.Join`. - -> **NOTE**: This code is *only* safe if you are not at risk of other processes -> modifying path components after you've used `SecureJoin`. If it is possible -> for a malicious process to modify path components of the resolved path, then -> you will be vulnerable to some fairly trivial TOCTOU race conditions. [There -> are some Linux kernel patches I'm working on which might allow for a better -> solution.][lwn-obeneath] -> -> In addition, with a slightly modified API it might be possible to use -> `O_PATH` and verify that the opened path is actually the resolved one -- but -> I have not done that yet. I might add it in the future as a helper function -> to help users verify the path (we can't just return `/proc/self/fd/` -> because that doesn't always work transparently for all users). - -This is the function prototype: - -```go -func SecureJoin(root, unsafePath string) (string, error) -``` - -This library **guarantees** the following: - -* If no error is set, the resulting string **must** be a child path of - `root` and will not contain any symlink path components (they will all be - expanded). - -* When expanding symlinks, all symlink path components **must** be resolved - relative to the provided root. In particular, this can be considered a - userspace implementation of how `chroot(2)` operates on file paths. Note that - these symlinks will **not** be expanded lexically (`filepath.Clean` is not - called on the input before processing). - -* Non-existent path components are unaffected by `SecureJoin` (similar to - `filepath.EvalSymlinks`'s semantics). - -* The returned path will always be `filepath.Clean`ed and thus not contain any - `..` components. - -A (trivial) implementation of this function on GNU/Linux systems could be done -with the following (note that this requires root privileges and is far more -opaque than the implementation in this library, and also requires that -`readlink` is inside the `root` path): - -```go -package securejoin - -import ( - "os/exec" - "path/filepath" -) - -func SecureJoin(root, unsafePath string) (string, error) { - unsafePath = string(filepath.Separator) + unsafePath - cmd := exec.Command("chroot", root, - "readlink", "--canonicalize-missing", "--no-newline", unsafePath) - output, err := cmd.CombinedOutput() - if err != nil { - return "", err - } - expanded := string(output) - return filepath.Join(root, expanded), nil -} -``` - -[lwn-obeneath]: https://lwn.net/Articles/767547/ -[go#20126]: https://github.com/golang/go/issues/20126 - -### License ### - -The license of this project is the same as Go, which is a BSD 3-clause license -available in the `LICENSE` file. diff --git a/vendor/github.com/cyphar/filepath-securejoin/VERSION b/vendor/github.com/cyphar/filepath-securejoin/VERSION deleted file mode 100644 index 717903969..000000000 --- a/vendor/github.com/cyphar/filepath-securejoin/VERSION +++ /dev/null @@ -1 +0,0 @@ -0.2.3 diff --git a/vendor/github.com/cyphar/filepath-securejoin/join.go b/vendor/github.com/cyphar/filepath-securejoin/join.go deleted file mode 100644 index 7dd08dbbd..000000000 --- a/vendor/github.com/cyphar/filepath-securejoin/join.go +++ /dev/null @@ -1,115 +0,0 @@ -// Copyright (C) 2014-2015 Docker Inc & Go Authors. All rights reserved. -// Copyright (C) 2017 SUSE LLC. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// Package securejoin is an implementation of the hopefully-soon-to-be-included -// SecureJoin helper that is meant to be part of the "path/filepath" package. -// The purpose of this project is to provide a PoC implementation to make the -// SecureJoin proposal (https://github.com/golang/go/issues/20126) more -// tangible. -package securejoin - -import ( - "bytes" - "errors" - "os" - "path/filepath" - "strings" - "syscall" -) - -// IsNotExist tells you if err is an error that implies that either the path -// accessed does not exist (or path components don't exist). This is -// effectively a more broad version of os.IsNotExist. -func IsNotExist(err error) bool { - // Check that it's not actually an ENOTDIR, which in some cases is a more - // convoluted case of ENOENT (usually involving weird paths). - return errors.Is(err, os.ErrNotExist) || errors.Is(err, syscall.ENOTDIR) || errors.Is(err, syscall.ENOENT) -} - -// SecureJoinVFS joins the two given path components (similar to Join) except -// that the returned path is guaranteed to be scoped inside the provided root -// path (when evaluated). Any symbolic links in the path are evaluated with the -// given root treated as the root of the filesystem, similar to a chroot. The -// filesystem state is evaluated through the given VFS interface (if nil, the -// standard os.* family of functions are used). -// -// Note that the guarantees provided by this function only apply if the path -// components in the returned string are not modified (in other words are not -// replaced with symlinks on the filesystem) after this function has returned. -// Such a symlink race is necessarily out-of-scope of SecureJoin. -func SecureJoinVFS(root, unsafePath string, vfs VFS) (string, error) { - // Use the os.* VFS implementation if none was specified. - if vfs == nil { - vfs = osVFS{} - } - - var path bytes.Buffer - n := 0 - for unsafePath != "" { - if n > 255 { - return "", &os.PathError{Op: "SecureJoin", Path: root + "/" + unsafePath, Err: syscall.ELOOP} - } - - // Next path component, p. - i := strings.IndexRune(unsafePath, filepath.Separator) - var p string - if i == -1 { - p, unsafePath = unsafePath, "" - } else { - p, unsafePath = unsafePath[:i], unsafePath[i+1:] - } - - // Create a cleaned path, using the lexical semantics of /../a, to - // create a "scoped" path component which can safely be joined to fullP - // for evaluation. At this point, path.String() doesn't contain any - // symlink components. - cleanP := filepath.Clean(string(filepath.Separator) + path.String() + p) - if cleanP == string(filepath.Separator) { - path.Reset() - continue - } - fullP := filepath.Clean(root + cleanP) - - // Figure out whether the path is a symlink. - fi, err := vfs.Lstat(fullP) - if err != nil && !IsNotExist(err) { - return "", err - } - // Treat non-existent path components the same as non-symlinks (we - // can't do any better here). - if IsNotExist(err) || fi.Mode()&os.ModeSymlink == 0 { - path.WriteString(p) - path.WriteRune(filepath.Separator) - continue - } - - // Only increment when we actually dereference a link. - n++ - - // It's a symlink, expand it by prepending it to the yet-unparsed path. - dest, err := vfs.Readlink(fullP) - if err != nil { - return "", err - } - // Absolute symlinks reset any work we've already done. - if filepath.IsAbs(dest) { - path.Reset() - } - unsafePath = dest + string(filepath.Separator) + unsafePath - } - - // We have to clean path.String() here because it may contain '..' - // components that are entirely lexical, but would be misleading otherwise. - // And finally do a final clean to ensure that root is also lexically - // clean. - fullP := filepath.Clean(string(filepath.Separator) + path.String()) - return filepath.Clean(root + fullP), nil -} - -// SecureJoin is a wrapper around SecureJoinVFS that just uses the os.* library -// of functions as the VFS. If in doubt, use this function over SecureJoinVFS. -func SecureJoin(root, unsafePath string) (string, error) { - return SecureJoinVFS(root, unsafePath, nil) -} diff --git a/vendor/github.com/cyphar/filepath-securejoin/vfs.go b/vendor/github.com/cyphar/filepath-securejoin/vfs.go deleted file mode 100644 index a82a5eae1..000000000 --- a/vendor/github.com/cyphar/filepath-securejoin/vfs.go +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright (C) 2017 SUSE LLC. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -package securejoin - -import "os" - -// In future this should be moved into a separate package, because now there -// are several projects (umoci and go-mtree) that are using this sort of -// interface. - -// VFS is the minimal interface necessary to use SecureJoinVFS. A nil VFS is -// equivalent to using the standard os.* family of functions. This is mainly -// used for the purposes of mock testing, but also can be used to otherwise use -// SecureJoin with VFS-like system. -type VFS interface { - // Lstat returns a FileInfo describing the named file. If the file is a - // symbolic link, the returned FileInfo describes the symbolic link. Lstat - // makes no attempt to follow the link. These semantics are identical to - // os.Lstat. - Lstat(name string) (os.FileInfo, error) - - // Readlink returns the destination of the named symbolic link. These - // semantics are identical to os.Readlink. - Readlink(name string) (string, error) -} - -// osVFS is the "nil" VFS, in that it just passes everything through to the os -// module. -type osVFS struct{} - -// Lstat returns a FileInfo describing the named file. If the file is a -// symbolic link, the returned FileInfo describes the symbolic link. Lstat -// makes no attempt to follow the link. These semantics are identical to -// os.Lstat. -func (o osVFS) Lstat(name string) (os.FileInfo, error) { return os.Lstat(name) } - -// Readlink returns the destination of the named symbolic link. These -// semantics are identical to os.Readlink. -func (o osVFS) Readlink(name string) (string, error) { return os.Readlink(name) } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/cgroups.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/cgroups.go deleted file mode 100644 index ba2b2266c..000000000 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/cgroups.go +++ /dev/null @@ -1,59 +0,0 @@ -package cgroups - -import ( - "github.com/opencontainers/runc/libcontainer/configs" -) - -type Manager interface { - // Apply creates a cgroup, if not yet created, and adds a process - // with the specified pid into that cgroup. A special value of -1 - // can be used to merely create a cgroup. - Apply(pid int) error - - // GetPids returns the PIDs of all processes inside the cgroup. - GetPids() ([]int, error) - - // GetAllPids returns the PIDs of all processes inside the cgroup - // any all its sub-cgroups. - GetAllPids() ([]int, error) - - // GetStats returns cgroups statistics. - GetStats() (*Stats, error) - - // Freeze sets the freezer cgroup to the specified state. - Freeze(state configs.FreezerState) error - - // Destroy removes cgroup. - Destroy() error - - // Path returns a cgroup path to the specified controller/subsystem. - // For cgroupv2, the argument is unused and can be empty. - Path(string) string - - // Set sets cgroup resources parameters/limits. If the argument is nil, - // the resources specified during Manager creation (or the previous call - // to Set) are used. - Set(r *configs.Resources) error - - // GetPaths returns cgroup path(s) to save in a state file in order to - // restore later. - // - // For cgroup v1, a key is cgroup subsystem name, and the value is the - // path to the cgroup for this subsystem. - // - // For cgroup v2 unified hierarchy, a key is "", and the value is the - // unified path. - GetPaths() map[string]string - - // GetCgroups returns the cgroup data as configured. - GetCgroups() (*configs.Cgroup, error) - - // GetFreezerState retrieves the current FreezerState of the cgroup. - GetFreezerState() (configs.FreezerState, error) - - // Exists returns whether the cgroup path exists or not. - Exists() bool - - // OOMKillCount reports OOM kill count for the cgroup. - OOMKillCount() (uint64, error) -} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/devices/devices_emulator.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/devices/devices_emulator.go deleted file mode 100644 index 6c61ee4c0..000000000 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/devices/devices_emulator.go +++ /dev/null @@ -1,386 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -/* - * Copyright (C) 2020 Aleksa Sarai - * Copyright (C) 2020 SUSE LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package devices - -import ( - "bufio" - "fmt" - "io" - "sort" - "strconv" - "strings" - - "github.com/opencontainers/runc/libcontainer/devices" -) - -// deviceMeta is a Rule without the Allow or Permissions fields, and no -// wildcard-type support. It's effectively the "match" portion of a metadata -// rule, for the purposes of our emulation. -type deviceMeta struct { - node devices.Type - major int64 - minor int64 -} - -// deviceRule is effectively the tuple (deviceMeta, Permissions). -type deviceRule struct { - meta deviceMeta - perms devices.Permissions -} - -// deviceRules is a mapping of device metadata rules to the associated -// permissions in the ruleset. -type deviceRules map[deviceMeta]devices.Permissions - -func (r deviceRules) orderedEntries() []deviceRule { - var rules []deviceRule - for meta, perms := range r { - rules = append(rules, deviceRule{meta: meta, perms: perms}) - } - sort.Slice(rules, func(i, j int) bool { - // Sort by (major, minor, type). - a, b := rules[i].meta, rules[j].meta - return a.major < b.major || - (a.major == b.major && a.minor < b.minor) || - (a.major == b.major && a.minor == b.minor && a.node < b.node) - }) - return rules -} - -type Emulator struct { - defaultAllow bool - rules deviceRules -} - -func (e *Emulator) IsBlacklist() bool { - return e.defaultAllow -} - -func (e *Emulator) IsAllowAll() bool { - return e.IsBlacklist() && len(e.rules) == 0 -} - -func parseLine(line string) (*deviceRule, error) { - // Input: node major:minor perms. - fields := strings.FieldsFunc(line, func(r rune) bool { - return r == ' ' || r == ':' - }) - if len(fields) != 4 { - return nil, fmt.Errorf("malformed devices.list rule %s", line) - } - - var ( - rule deviceRule - node = fields[0] - major = fields[1] - minor = fields[2] - perms = fields[3] - ) - - // Parse the node type. - switch node { - case "a": - // Super-special case -- "a" always means every device with every - // access mode. In fact, for devices.list this actually indicates that - // the cgroup is in black-list mode. - // TODO: Double-check that the entire file is "a *:* rwm". - return nil, nil - case "b": - rule.meta.node = devices.BlockDevice - case "c": - rule.meta.node = devices.CharDevice - default: - return nil, fmt.Errorf("unknown device type %q", node) - } - - // Parse the major number. - if major == "*" { - rule.meta.major = devices.Wildcard - } else { - val, err := strconv.ParseUint(major, 10, 32) - if err != nil { - return nil, fmt.Errorf("invalid major number: %w", err) - } - rule.meta.major = int64(val) - } - - // Parse the minor number. - if minor == "*" { - rule.meta.minor = devices.Wildcard - } else { - val, err := strconv.ParseUint(minor, 10, 32) - if err != nil { - return nil, fmt.Errorf("invalid minor number: %w", err) - } - rule.meta.minor = int64(val) - } - - // Parse the access permissions. - rule.perms = devices.Permissions(perms) - if !rule.perms.IsValid() || rule.perms.IsEmpty() { - return nil, fmt.Errorf("parse access mode: contained unknown modes or is empty: %q", perms) - } - return &rule, nil -} - -func (e *Emulator) addRule(rule deviceRule) error { //nolint:unparam - if e.rules == nil { - e.rules = make(map[deviceMeta]devices.Permissions) - } - - // Merge with any pre-existing permissions. - oldPerms := e.rules[rule.meta] - newPerms := rule.perms.Union(oldPerms) - e.rules[rule.meta] = newPerms - return nil -} - -func (e *Emulator) rmRule(rule deviceRule) error { - // Give an error if any of the permissions requested to be removed are - // present in a partially-matching wildcard rule, because such rules will - // be ignored by cgroupv1. - // - // This is a diversion from cgroupv1, but is necessary to avoid leading - // users into a false sense of security. cgroupv1 will silently(!) ignore - // requests to remove partial exceptions, but we really shouldn't do that. - // - // It may seem like we could just "split" wildcard rules which hit this - // issue, but unfortunately there are 2^32 possible major and minor - // numbers, which would exhaust kernel memory quickly if we did this. Not - // to mention it'd be really slow (the kernel side is implemented as a - // linked-list of exceptions). - for _, partialMeta := range []deviceMeta{ - {node: rule.meta.node, major: devices.Wildcard, minor: rule.meta.minor}, - {node: rule.meta.node, major: rule.meta.major, minor: devices.Wildcard}, - {node: rule.meta.node, major: devices.Wildcard, minor: devices.Wildcard}, - } { - // This wildcard rule is equivalent to the requested rule, so skip it. - if rule.meta == partialMeta { - continue - } - // Only give an error if the set of permissions overlap. - partialPerms := e.rules[partialMeta] - if !partialPerms.Intersection(rule.perms).IsEmpty() { - return fmt.Errorf("requested rule [%v %v] not supported by devices cgroupv1 (cannot punch hole in existing wildcard rule [%v %v])", rule.meta, rule.perms, partialMeta, partialPerms) - } - } - - // Subtract all of the permissions listed from the full match rule. If the - // rule didn't exist, all of this is a no-op. - newPerms := e.rules[rule.meta].Difference(rule.perms) - if newPerms.IsEmpty() { - delete(e.rules, rule.meta) - } else { - e.rules[rule.meta] = newPerms - } - // TODO: The actual cgroup code doesn't care if an exception didn't exist - // during removal, so not erroring out here is /accurate/ but quite - // worrying. Maybe we should do additional validation, but again we - // have to worry about backwards-compatibility. - return nil -} - -func (e *Emulator) allow(rule *deviceRule) error { - // This cgroup is configured as a black-list. Reset the entire emulator, - // and put is into black-list mode. - if rule == nil || rule.meta.node == devices.WildcardDevice { - *e = Emulator{ - defaultAllow: true, - rules: nil, - } - return nil - } - - var err error - if e.defaultAllow { - err = wrapErr(e.rmRule(*rule), "unable to remove 'deny' exception") - } else { - err = wrapErr(e.addRule(*rule), "unable to add 'allow' exception") - } - return err -} - -func (e *Emulator) deny(rule *deviceRule) error { - // This cgroup is configured as a white-list. Reset the entire emulator, - // and put is into white-list mode. - if rule == nil || rule.meta.node == devices.WildcardDevice { - *e = Emulator{ - defaultAllow: false, - rules: nil, - } - return nil - } - - var err error - if e.defaultAllow { - err = wrapErr(e.addRule(*rule), "unable to add 'deny' exception") - } else { - err = wrapErr(e.rmRule(*rule), "unable to remove 'allow' exception") - } - return err -} - -func (e *Emulator) Apply(rule devices.Rule) error { - if !rule.Type.CanCgroup() { - return fmt.Errorf("cannot add rule [%#v] with non-cgroup type %q", rule, rule.Type) - } - - innerRule := &deviceRule{ - meta: deviceMeta{ - node: rule.Type, - major: rule.Major, - minor: rule.Minor, - }, - perms: rule.Permissions, - } - if innerRule.meta.node == devices.WildcardDevice { - innerRule = nil - } - - if rule.Allow { - return e.allow(innerRule) - } - - return e.deny(innerRule) -} - -// EmulatorFromList takes a reader to a "devices.list"-like source, and returns -// a new Emulator that represents the state of the devices cgroup. Note that -// black-list devices cgroups cannot be fully reconstructed, due to limitations -// in the devices cgroup API. Instead, such cgroups are always treated as -// "allow all" cgroups. -func EmulatorFromList(list io.Reader) (*Emulator, error) { - // Normally cgroups are in black-list mode by default, but the way we - // figure out the current mode is whether or not devices.list has an - // allow-all rule. So we default to a white-list, and the existence of an - // "a *:* rwm" entry will tell us otherwise. - e := &Emulator{ - defaultAllow: false, - } - - // Parse the "devices.list". - s := bufio.NewScanner(list) - for s.Scan() { - line := s.Text() - deviceRule, err := parseLine(line) - if err != nil { - return nil, fmt.Errorf("error parsing line %q: %w", line, err) - } - // "devices.list" is an allow list. Note that this means that in - // black-list mode, we have no idea what rules are in play. As a - // result, we need to be very careful in Transition(). - if err := e.allow(deviceRule); err != nil { - return nil, fmt.Errorf("error adding devices.list rule: %w", err) - } - } - if err := s.Err(); err != nil { - return nil, fmt.Errorf("error reading devices.list lines: %w", err) - } - return e, nil -} - -// Transition calculates what is the minimally-disruptive set of rules need to -// be applied to a devices cgroup in order to transition to the given target. -// This means that any already-existing rules will not be applied, and -// disruptive rules (like denying all device access) will only be applied if -// necessary. -// -// This function is the sole reason for all of Emulator -- to allow us -// to figure out how to update a containers' cgroups without causing spurious -// device errors (if possible). -func (source *Emulator) Transition(target *Emulator) ([]*devices.Rule, error) { - var transitionRules []*devices.Rule - oldRules := source.rules - - // If the default policy doesn't match, we need to include a "disruptive" - // rule (either allow-all or deny-all) in order to switch the cgroup to the - // correct default policy. - // - // However, due to a limitation in "devices.list" we cannot be sure what - // deny rules are in place in a black-list cgroup. Thus if the source is a - // black-list we also have to include a disruptive rule. - if source.IsBlacklist() || source.defaultAllow != target.defaultAllow { - transitionRules = append(transitionRules, &devices.Rule{ - Type: 'a', - Major: -1, - Minor: -1, - Permissions: devices.Permissions("rwm"), - Allow: target.defaultAllow, - }) - // The old rules are only relevant if we aren't starting out with a - // disruptive rule. - oldRules = nil - } - - // NOTE: We traverse through the rules in a sorted order so we always write - // the same set of rules (this is to aid testing). - - // First, we create inverse rules for any old rules not in the new set. - // This includes partial-inverse rules for specific permissions. This is a - // no-op if we added a disruptive rule, since oldRules will be empty. - for _, rule := range oldRules.orderedEntries() { - meta, oldPerms := rule.meta, rule.perms - newPerms := target.rules[meta] - droppedPerms := oldPerms.Difference(newPerms) - if !droppedPerms.IsEmpty() { - transitionRules = append(transitionRules, &devices.Rule{ - Type: meta.node, - Major: meta.major, - Minor: meta.minor, - Permissions: droppedPerms, - Allow: target.defaultAllow, - }) - } - } - - // Add any additional rules which weren't in the old set. We happen to - // filter out rules which are present in both sets, though this isn't - // strictly necessary. - for _, rule := range target.rules.orderedEntries() { - meta, newPerms := rule.meta, rule.perms - oldPerms := oldRules[meta] - gainedPerms := newPerms.Difference(oldPerms) - if !gainedPerms.IsEmpty() { - transitionRules = append(transitionRules, &devices.Rule{ - Type: meta.node, - Major: meta.major, - Minor: meta.minor, - Permissions: gainedPerms, - Allow: !target.defaultAllow, - }) - } - } - return transitionRules, nil -} - -// Rules returns the minimum set of rules necessary to convert a *deny-all* -// cgroup to the emulated filter state (note that this is not the same as a -// default cgroupv1 cgroup -- which is allow-all). This is effectively just a -// wrapper around Transition() with the source emulator being an empty cgroup. -func (e *Emulator) Rules() ([]*devices.Rule, error) { - defaultCgroup := &Emulator{defaultAllow: false} - return defaultCgroup.Transition(e) -} - -func wrapErr(err error, text string) error { - if err == nil { - return nil - } - return fmt.Errorf(text+": %w", err) -} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/ebpf/devicefilter/devicefilter.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/ebpf/devicefilter/devicefilter.go deleted file mode 100644 index 4e69b35bc..000000000 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/ebpf/devicefilter/devicefilter.go +++ /dev/null @@ -1,208 +0,0 @@ -// Package devicefilter contains eBPF device filter program -// -// The implementation is based on https://github.com/containers/crun/blob/0.10.2/src/libcrun/ebpf.c -// -// Although ebpf.c is originally licensed under LGPL-3.0-or-later, the author (Giuseppe Scrivano) -// agreed to relicense the file in Apache License 2.0: https://github.com/opencontainers/runc/issues/2144#issuecomment-543116397 -package devicefilter - -import ( - "errors" - "fmt" - "math" - "strconv" - - "github.com/cilium/ebpf/asm" - devicesemulator "github.com/opencontainers/runc/libcontainer/cgroups/devices" - "github.com/opencontainers/runc/libcontainer/devices" - "golang.org/x/sys/unix" -) - -const ( - // license string format is same as kernel MODULE_LICENSE macro - license = "Apache" -) - -// DeviceFilter returns eBPF device filter program and its license string -func DeviceFilter(rules []*devices.Rule) (asm.Instructions, string, error) { - // Generate the minimum ruleset for the device rules we are given. While we - // don't care about minimum transitions in cgroupv2, using the emulator - // gives us a guarantee that the behaviour of devices filtering is the same - // as cgroupv1, including security hardenings to avoid misconfiguration - // (such as punching holes in wildcard rules). - emu := new(devicesemulator.Emulator) - for _, rule := range rules { - if err := emu.Apply(*rule); err != nil { - return nil, "", err - } - } - cleanRules, err := emu.Rules() - if err != nil { - return nil, "", err - } - - p := &program{ - defaultAllow: emu.IsBlacklist(), - } - p.init() - - for idx, rule := range cleanRules { - if rule.Type == devices.WildcardDevice { - // We can safely skip over wildcard entries because there should - // only be one (at most) at the very start to instruct cgroupv1 to - // go into allow-list mode. However we do double-check this here. - if idx != 0 || rule.Allow != emu.IsBlacklist() { - return nil, "", fmt.Errorf("[internal error] emulated cgroupv2 devices ruleset had bad wildcard at idx %v (%s)", idx, rule.CgroupString()) - } - continue - } - if rule.Allow == p.defaultAllow { - // There should be no rules which have an action equal to the - // default action, the emulator removes those. - return nil, "", fmt.Errorf("[internal error] emulated cgroupv2 devices ruleset had no-op rule at idx %v (%s)", idx, rule.CgroupString()) - } - if err := p.appendRule(rule); err != nil { - return nil, "", err - } - } - return p.finalize(), license, nil -} - -type program struct { - insts asm.Instructions - defaultAllow bool - blockID int -} - -func (p *program) init() { - // struct bpf_cgroup_dev_ctx: https://elixir.bootlin.com/linux/v5.3.6/source/include/uapi/linux/bpf.h#L3423 - /* - u32 access_type - u32 major - u32 minor - */ - // R2 <- type (lower 16 bit of u32 access_type at R1[0]) - p.insts = append(p.insts, - asm.LoadMem(asm.R2, asm.R1, 0, asm.Word), - asm.And.Imm32(asm.R2, 0xFFFF)) - - // R3 <- access (upper 16 bit of u32 access_type at R1[0]) - p.insts = append(p.insts, - asm.LoadMem(asm.R3, asm.R1, 0, asm.Word), - // RSh: bitwise shift right - asm.RSh.Imm32(asm.R3, 16)) - - // R4 <- major (u32 major at R1[4]) - p.insts = append(p.insts, - asm.LoadMem(asm.R4, asm.R1, 4, asm.Word)) - - // R5 <- minor (u32 minor at R1[8]) - p.insts = append(p.insts, - asm.LoadMem(asm.R5, asm.R1, 8, asm.Word)) -} - -// appendRule rule converts an OCI rule to the relevant eBPF block and adds it -// to the in-progress filter program. In order to operate properly, it must be -// called with a "clean" rule list (generated by devices.Emulator.Rules() -- -// with any "a" rules removed). -func (p *program) appendRule(rule *devices.Rule) error { - if p.blockID < 0 { - return errors.New("the program is finalized") - } - - var bpfType int32 - switch rule.Type { - case devices.CharDevice: - bpfType = int32(unix.BPF_DEVCG_DEV_CHAR) - case devices.BlockDevice: - bpfType = int32(unix.BPF_DEVCG_DEV_BLOCK) - default: - // We do not permit 'a', nor any other types we don't know about. - return fmt.Errorf("invalid type %q", string(rule.Type)) - } - if rule.Major > math.MaxUint32 { - return fmt.Errorf("invalid major %d", rule.Major) - } - if rule.Minor > math.MaxUint32 { - return fmt.Errorf("invalid minor %d", rule.Major) - } - hasMajor := rule.Major >= 0 // if not specified in OCI json, major is set to -1 - hasMinor := rule.Minor >= 0 - bpfAccess := int32(0) - for _, r := range rule.Permissions { - switch r { - case 'r': - bpfAccess |= unix.BPF_DEVCG_ACC_READ - case 'w': - bpfAccess |= unix.BPF_DEVCG_ACC_WRITE - case 'm': - bpfAccess |= unix.BPF_DEVCG_ACC_MKNOD - default: - return fmt.Errorf("unknown device access %v", r) - } - } - // If the access is rwm, skip the check. - hasAccess := bpfAccess != (unix.BPF_DEVCG_ACC_READ | unix.BPF_DEVCG_ACC_WRITE | unix.BPF_DEVCG_ACC_MKNOD) - - var ( - blockSym = "block-" + strconv.Itoa(p.blockID) - nextBlockSym = "block-" + strconv.Itoa(p.blockID+1) - prevBlockLastIdx = len(p.insts) - 1 - ) - p.insts = append(p.insts, - // if (R2 != bpfType) goto next - asm.JNE.Imm(asm.R2, bpfType, nextBlockSym), - ) - if hasAccess { - p.insts = append(p.insts, - // if (R3 & bpfAccess != R3 /* use R1 as a temp var */) goto next - asm.Mov.Reg32(asm.R1, asm.R3), - asm.And.Imm32(asm.R1, bpfAccess), - asm.JNE.Reg(asm.R1, asm.R3, nextBlockSym), - ) - } - if hasMajor { - p.insts = append(p.insts, - // if (R4 != major) goto next - asm.JNE.Imm(asm.R4, int32(rule.Major), nextBlockSym), - ) - } - if hasMinor { - p.insts = append(p.insts, - // if (R5 != minor) goto next - asm.JNE.Imm(asm.R5, int32(rule.Minor), nextBlockSym), - ) - } - p.insts = append(p.insts, acceptBlock(rule.Allow)...) - // set blockSym to the first instruction we added in this iteration - p.insts[prevBlockLastIdx+1] = p.insts[prevBlockLastIdx+1].Sym(blockSym) - p.blockID++ - return nil -} - -func (p *program) finalize() asm.Instructions { - var v int32 - if p.defaultAllow { - v = 1 - } - blockSym := "block-" + strconv.Itoa(p.blockID) - p.insts = append(p.insts, - // R0 <- v - asm.Mov.Imm32(asm.R0, v).Sym(blockSym), - asm.Return(), - ) - p.blockID = -1 - return p.insts -} - -func acceptBlock(accept bool) asm.Instructions { - var v int32 - if accept { - v = 1 - } - return []asm.Instruction{ - // R0 <- v - asm.Mov.Imm32(asm.R0, v), - asm.Return(), - } -} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/ebpf/ebpf_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/ebpf/ebpf_linux.go deleted file mode 100644 index 35b00aaf0..000000000 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/ebpf/ebpf_linux.go +++ /dev/null @@ -1,253 +0,0 @@ -package ebpf - -import ( - "errors" - "fmt" - "os" - "runtime" - "sync" - "unsafe" - - "github.com/cilium/ebpf" - "github.com/cilium/ebpf/asm" - "github.com/cilium/ebpf/link" - "github.com/sirupsen/logrus" - "golang.org/x/sys/unix" -) - -func nilCloser() error { - return nil -} - -func findAttachedCgroupDeviceFilters(dirFd int) ([]*ebpf.Program, error) { - type bpfAttrQuery struct { - TargetFd uint32 - AttachType uint32 - QueryType uint32 - AttachFlags uint32 - ProgIds uint64 // __aligned_u64 - ProgCnt uint32 - } - - // Currently you can only have 64 eBPF programs attached to a cgroup. - size := 64 - retries := 0 - for retries < 10 { - progIds := make([]uint32, size) - query := bpfAttrQuery{ - TargetFd: uint32(dirFd), - AttachType: uint32(unix.BPF_CGROUP_DEVICE), - ProgIds: uint64(uintptr(unsafe.Pointer(&progIds[0]))), - ProgCnt: uint32(len(progIds)), - } - - // Fetch the list of program ids. - _, _, errno := unix.Syscall(unix.SYS_BPF, - uintptr(unix.BPF_PROG_QUERY), - uintptr(unsafe.Pointer(&query)), - unsafe.Sizeof(query)) - size = int(query.ProgCnt) - runtime.KeepAlive(query) - if errno != 0 { - // On ENOSPC we get the correct number of programs. - if errno == unix.ENOSPC { - retries++ - continue - } - return nil, fmt.Errorf("bpf_prog_query(BPF_CGROUP_DEVICE) failed: %w", errno) - } - - // Convert the ids to program handles. - progIds = progIds[:size] - programs := make([]*ebpf.Program, 0, len(progIds)) - for _, progId := range progIds { - program, err := ebpf.NewProgramFromID(ebpf.ProgramID(progId)) - if err != nil { - // We skip over programs that give us -EACCES or -EPERM. This - // is necessary because there may be BPF programs that have - // been attached (such as with --systemd-cgroup) which have an - // LSM label that blocks us from interacting with the program. - // - // Because additional BPF_CGROUP_DEVICE programs only can add - // restrictions, there's no real issue with just ignoring these - // programs (and stops runc from breaking on distributions with - // very strict SELinux policies). - if errors.Is(err, os.ErrPermission) { - logrus.Debugf("ignoring existing CGROUP_DEVICE program (prog_id=%v) which cannot be accessed by runc -- likely due to LSM policy: %v", progId, err) - continue - } - return nil, fmt.Errorf("cannot fetch program from id: %w", err) - } - programs = append(programs, program) - } - runtime.KeepAlive(progIds) - return programs, nil - } - - return nil, errors.New("could not get complete list of CGROUP_DEVICE programs") -} - -var ( - haveBpfProgReplaceBool bool - haveBpfProgReplaceOnce sync.Once -) - -// Loosely based on the BPF_F_REPLACE support check in -// https://github.com/cilium/ebpf/blob/v0.6.0/link/syscalls.go. -// -// TODO: move this logic to cilium/ebpf -func haveBpfProgReplace() bool { - haveBpfProgReplaceOnce.Do(func() { - prog, err := ebpf.NewProgram(&ebpf.ProgramSpec{ - Type: ebpf.CGroupDevice, - License: "MIT", - Instructions: asm.Instructions{ - asm.Mov.Imm(asm.R0, 0), - asm.Return(), - }, - }) - if err != nil { - logrus.Debugf("checking for BPF_F_REPLACE support: ebpf.NewProgram failed: %v", err) - return - } - defer prog.Close() - - devnull, err := os.Open("/dev/null") - if err != nil { - logrus.Debugf("checking for BPF_F_REPLACE support: open dummy target fd: %v", err) - return - } - defer devnull.Close() - - // We know that we have BPF_PROG_ATTACH since we can load - // BPF_CGROUP_DEVICE programs. If passing BPF_F_REPLACE gives us EINVAL - // we know that the feature isn't present. - err = link.RawAttachProgram(link.RawAttachProgramOptions{ - // We rely on this fd being checked after attachFlags. - Target: int(devnull.Fd()), - // Attempt to "replace" bad fds with this program. - Program: prog, - Attach: ebpf.AttachCGroupDevice, - Flags: unix.BPF_F_ALLOW_MULTI | unix.BPF_F_REPLACE, - }) - if errors.Is(err, unix.EINVAL) { - // not supported - return - } - // attach_flags test succeeded. - if !errors.Is(err, unix.EBADF) { - logrus.Debugf("checking for BPF_F_REPLACE: got unexpected (not EBADF or EINVAL) error: %v", err) - } - haveBpfProgReplaceBool = true - }) - return haveBpfProgReplaceBool -} - -// LoadAttachCgroupDeviceFilter installs eBPF device filter program to /sys/fs/cgroup/ directory. -// -// Requires the system to be running in cgroup2 unified-mode with kernel >= 4.15 . -// -// https://github.com/torvalds/linux/commit/ebc614f687369f9df99828572b1d85a7c2de3d92 -func LoadAttachCgroupDeviceFilter(insts asm.Instructions, license string, dirFd int) (func() error, error) { - // Increase `ulimit -l` limit to avoid BPF_PROG_LOAD error (#2167). - // This limit is not inherited into the container. - memlockLimit := &unix.Rlimit{ - Cur: unix.RLIM_INFINITY, - Max: unix.RLIM_INFINITY, - } - _ = unix.Setrlimit(unix.RLIMIT_MEMLOCK, memlockLimit) - - // Get the list of existing programs. - oldProgs, err := findAttachedCgroupDeviceFilters(dirFd) - if err != nil { - return nilCloser, err - } - useReplaceProg := haveBpfProgReplace() && len(oldProgs) == 1 - - // Generate new program. - spec := &ebpf.ProgramSpec{ - Type: ebpf.CGroupDevice, - Instructions: insts, - License: license, - } - prog, err := ebpf.NewProgram(spec) - if err != nil { - return nilCloser, err - } - - // If there is only one old program, we can just replace it directly. - var ( - replaceProg *ebpf.Program - attachFlags uint32 = unix.BPF_F_ALLOW_MULTI - ) - if useReplaceProg { - replaceProg = oldProgs[0] - attachFlags |= unix.BPF_F_REPLACE - } - err = link.RawAttachProgram(link.RawAttachProgramOptions{ - Target: dirFd, - Program: prog, - Replace: replaceProg, - Attach: ebpf.AttachCGroupDevice, - Flags: attachFlags, - }) - if err != nil { - return nilCloser, fmt.Errorf("failed to call BPF_PROG_ATTACH (BPF_CGROUP_DEVICE, BPF_F_ALLOW_MULTI): %w", err) - } - closer := func() error { - err = link.RawDetachProgram(link.RawDetachProgramOptions{ - Target: dirFd, - Program: prog, - Attach: ebpf.AttachCGroupDevice, - }) - if err != nil { - return fmt.Errorf("failed to call BPF_PROG_DETACH (BPF_CGROUP_DEVICE): %w", err) - } - // TODO: Should we attach the old filters back in this case? Otherwise - // we fail-open on a security feature, which is a bit scary. - return nil - } - if !useReplaceProg { - logLevel := logrus.DebugLevel - // If there was more than one old program, give a warning (since this - // really shouldn't happen with runc-managed cgroups) and then detach - // all the old programs. - if len(oldProgs) > 1 { - // NOTE: Ideally this should be a warning but it turns out that - // systemd-managed cgroups trigger this warning (apparently - // systemd doesn't delete old non-systemd programs when - // setting properties). - logrus.Infof("found more than one filter (%d) attached to a cgroup -- removing extra filters!", len(oldProgs)) - logLevel = logrus.InfoLevel - } - for idx, oldProg := range oldProgs { - // Output some extra debug info. - if info, err := oldProg.Info(); err == nil { - fields := logrus.Fields{ - "type": info.Type.String(), - "tag": info.Tag, - "name": info.Name, - } - if id, ok := info.ID(); ok { - fields["id"] = id - } - if runCount, ok := info.RunCount(); ok { - fields["run_count"] = runCount - } - if runtime, ok := info.Runtime(); ok { - fields["runtime"] = runtime.String() - } - logrus.WithFields(fields).Logf(logLevel, "removing old filter %d from cgroup", idx) - } - err = link.RawDetachProgram(link.RawDetachProgramOptions{ - Target: dirFd, - Program: oldProg, - Attach: ebpf.AttachCGroupDevice, - }) - if err != nil { - return closer, fmt.Errorf("failed to call BPF_PROG_DETACH (BPF_CGROUP_DEVICE) on old filter program: %w", err) - } - } - } - return closer, nil -} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/file.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/file.go deleted file mode 100644 index 0cdaf7478..000000000 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/file.go +++ /dev/null @@ -1,190 +0,0 @@ -package cgroups - -import ( - "bytes" - "errors" - "fmt" - "os" - "path" - "strconv" - "strings" - "sync" - - "github.com/sirupsen/logrus" - "golang.org/x/sys/unix" -) - -// OpenFile opens a cgroup file in a given dir with given flags. -// It is supposed to be used for cgroup files only, and returns -// an error if the file is not a cgroup file. -// -// Arguments dir and file are joined together to form an absolute path -// to a file being opened. -func OpenFile(dir, file string, flags int) (*os.File, error) { - if dir == "" { - return nil, fmt.Errorf("no directory specified for %s", file) - } - return openFile(dir, file, flags) -} - -// ReadFile reads data from a cgroup file in dir. -// It is supposed to be used for cgroup files only. -func ReadFile(dir, file string) (string, error) { - fd, err := OpenFile(dir, file, unix.O_RDONLY) - if err != nil { - return "", err - } - defer fd.Close() - var buf bytes.Buffer - - _, err = buf.ReadFrom(fd) - return buf.String(), err -} - -// WriteFile writes data to a cgroup file in dir. -// It is supposed to be used for cgroup files only. -func WriteFile(dir, file, data string) error { - fd, err := OpenFile(dir, file, unix.O_WRONLY) - if err != nil { - return err - } - defer fd.Close() - if err := retryingWriteFile(fd, data); err != nil { - // Having data in the error message helps in debugging. - return fmt.Errorf("failed to write %q: %w", data, err) - } - return nil -} - -func retryingWriteFile(fd *os.File, data string) error { - for { - _, err := fd.Write([]byte(data)) - if errors.Is(err, unix.EINTR) { - logrus.Infof("interrupted while writing %s to %s", data, fd.Name()) - continue - } - return err - } -} - -const ( - cgroupfsDir = "/sys/fs/cgroup" - cgroupfsPrefix = cgroupfsDir + "/" -) - -var ( - // TestMode is set to true by unit tests that need "fake" cgroupfs. - TestMode bool - - cgroupFd int = -1 - prepOnce sync.Once - prepErr error - resolveFlags uint64 -) - -func prepareOpenat2() error { - prepOnce.Do(func() { - fd, err := unix.Openat2(-1, cgroupfsDir, &unix.OpenHow{ - Flags: unix.O_DIRECTORY | unix.O_PATH, - }) - if err != nil { - prepErr = &os.PathError{Op: "openat2", Path: cgroupfsDir, Err: err} - if err != unix.ENOSYS { //nolint:errorlint // unix errors are bare - logrus.Warnf("falling back to securejoin: %s", prepErr) - } else { - logrus.Debug("openat2 not available, falling back to securejoin") - } - return - } - var st unix.Statfs_t - if err = unix.Fstatfs(fd, &st); err != nil { - prepErr = &os.PathError{Op: "statfs", Path: cgroupfsDir, Err: err} - logrus.Warnf("falling back to securejoin: %s", prepErr) - return - } - - cgroupFd = fd - - resolveFlags = unix.RESOLVE_BENEATH | unix.RESOLVE_NO_MAGICLINKS - if st.Type == unix.CGROUP2_SUPER_MAGIC { - // cgroupv2 has a single mountpoint and no "cpu,cpuacct" symlinks - resolveFlags |= unix.RESOLVE_NO_XDEV | unix.RESOLVE_NO_SYMLINKS - } - }) - - return prepErr -} - -func openFile(dir, file string, flags int) (*os.File, error) { - mode := os.FileMode(0) - if TestMode && flags&os.O_WRONLY != 0 { - // "emulate" cgroup fs for unit tests - flags |= os.O_TRUNC | os.O_CREATE - mode = 0o600 - } - path := path.Join(dir, file) - if prepareOpenat2() != nil { - return openFallback(path, flags, mode) - } - relPath := strings.TrimPrefix(path, cgroupfsPrefix) - if len(relPath) == len(path) { // non-standard path, old system? - return openFallback(path, flags, mode) - } - - fd, err := unix.Openat2(cgroupFd, relPath, - &unix.OpenHow{ - Resolve: resolveFlags, - Flags: uint64(flags) | unix.O_CLOEXEC, - Mode: uint64(mode), - }) - if err != nil { - err = &os.PathError{Op: "openat2", Path: path, Err: err} - // Check if cgroupFd is still opened to cgroupfsDir - // (happens when this package is incorrectly used - // across the chroot/pivot_root/mntns boundary, or - // when /sys/fs/cgroup is remounted). - // - // TODO: if such usage will ever be common, amend this - // to reopen cgroupFd and retry openat2. - fdStr := strconv.Itoa(cgroupFd) - fdDest, _ := os.Readlink("/proc/self/fd/" + fdStr) - if fdDest != cgroupfsDir { - // Wrap the error so it is clear that cgroupFd - // is opened to an unexpected/wrong directory. - err = fmt.Errorf("cgroupFd %s unexpectedly opened to %s != %s: %w", - fdStr, fdDest, cgroupfsDir, err) - } - return nil, err - } - - return os.NewFile(uintptr(fd), path), nil -} - -var errNotCgroupfs = errors.New("not a cgroup file") - -// Can be changed by unit tests. -var openFallback = openAndCheck - -// openAndCheck is used when openat2(2) is not available. It checks the opened -// file is on cgroupfs, returning an error otherwise. -func openAndCheck(path string, flags int, mode os.FileMode) (*os.File, error) { - fd, err := os.OpenFile(path, flags, mode) - if err != nil { - return nil, err - } - if TestMode { - return fd, nil - } - // Check this is a cgroupfs file. - var st unix.Statfs_t - if err := unix.Fstatfs(int(fd.Fd()), &st); err != nil { - _ = fd.Close() - return nil, &os.PathError{Op: "statfs", Path: path, Err: err} - } - if st.Type != unix.CGROUP_SUPER_MAGIC && st.Type != unix.CGROUP2_SUPER_MAGIC { - _ = fd.Close() - return nil, &os.PathError{Op: "open", Path: path, Err: errNotCgroupfs} - } - - return fd, nil -} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/blkio.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/blkio.go deleted file mode 100644 index c81b6562a..000000000 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/blkio.go +++ /dev/null @@ -1,311 +0,0 @@ -package fs - -import ( - "bufio" - "os" - "path/filepath" - "strconv" - "strings" - - "github.com/opencontainers/runc/libcontainer/cgroups" - "github.com/opencontainers/runc/libcontainer/configs" -) - -type BlkioGroup struct { - weightFilename string - weightDeviceFilename string -} - -func (s *BlkioGroup) Name() string { - return "blkio" -} - -func (s *BlkioGroup) Apply(path string, _ *configs.Resources, pid int) error { - return apply(path, pid) -} - -func (s *BlkioGroup) Set(path string, r *configs.Resources) error { - s.detectWeightFilenames(path) - if r.BlkioWeight != 0 { - if err := cgroups.WriteFile(path, s.weightFilename, strconv.FormatUint(uint64(r.BlkioWeight), 10)); err != nil { - return err - } - } - - if r.BlkioLeafWeight != 0 { - if err := cgroups.WriteFile(path, "blkio.leaf_weight", strconv.FormatUint(uint64(r.BlkioLeafWeight), 10)); err != nil { - return err - } - } - for _, wd := range r.BlkioWeightDevice { - if wd.Weight != 0 { - if err := cgroups.WriteFile(path, s.weightDeviceFilename, wd.WeightString()); err != nil { - return err - } - } - if wd.LeafWeight != 0 { - if err := cgroups.WriteFile(path, "blkio.leaf_weight_device", wd.LeafWeightString()); err != nil { - return err - } - } - } - for _, td := range r.BlkioThrottleReadBpsDevice { - if err := cgroups.WriteFile(path, "blkio.throttle.read_bps_device", td.String()); err != nil { - return err - } - } - for _, td := range r.BlkioThrottleWriteBpsDevice { - if err := cgroups.WriteFile(path, "blkio.throttle.write_bps_device", td.String()); err != nil { - return err - } - } - for _, td := range r.BlkioThrottleReadIOPSDevice { - if err := cgroups.WriteFile(path, "blkio.throttle.read_iops_device", td.String()); err != nil { - return err - } - } - for _, td := range r.BlkioThrottleWriteIOPSDevice { - if err := cgroups.WriteFile(path, "blkio.throttle.write_iops_device", td.String()); err != nil { - return err - } - } - - return nil -} - -/* -examples: - - blkio.sectors - 8:0 6792 - - blkio.io_service_bytes - 8:0 Read 1282048 - 8:0 Write 2195456 - 8:0 Sync 2195456 - 8:0 Async 1282048 - 8:0 Total 3477504 - Total 3477504 - - blkio.io_serviced - 8:0 Read 124 - 8:0 Write 104 - 8:0 Sync 104 - 8:0 Async 124 - 8:0 Total 228 - Total 228 - - blkio.io_queued - 8:0 Read 0 - 8:0 Write 0 - 8:0 Sync 0 - 8:0 Async 0 - 8:0 Total 0 - Total 0 -*/ - -func splitBlkioStatLine(r rune) bool { - return r == ' ' || r == ':' -} - -func getBlkioStat(dir, file string) ([]cgroups.BlkioStatEntry, error) { - var blkioStats []cgroups.BlkioStatEntry - f, err := cgroups.OpenFile(dir, file, os.O_RDONLY) - if err != nil { - if os.IsNotExist(err) { - return blkioStats, nil - } - return nil, err - } - defer f.Close() - - sc := bufio.NewScanner(f) - for sc.Scan() { - // format: dev type amount - fields := strings.FieldsFunc(sc.Text(), splitBlkioStatLine) - if len(fields) < 3 { - if len(fields) == 2 && fields[0] == "Total" { - // skip total line - continue - } else { - return nil, malformedLine(dir, file, sc.Text()) - } - } - - v, err := strconv.ParseUint(fields[0], 10, 64) - if err != nil { - return nil, &parseError{Path: dir, File: file, Err: err} - } - major := v - - v, err = strconv.ParseUint(fields[1], 10, 64) - if err != nil { - return nil, &parseError{Path: dir, File: file, Err: err} - } - minor := v - - op := "" - valueField := 2 - if len(fields) == 4 { - op = fields[2] - valueField = 3 - } - v, err = strconv.ParseUint(fields[valueField], 10, 64) - if err != nil { - return nil, &parseError{Path: dir, File: file, Err: err} - } - blkioStats = append(blkioStats, cgroups.BlkioStatEntry{Major: major, Minor: minor, Op: op, Value: v}) - } - if err := sc.Err(); err != nil { - return nil, &parseError{Path: dir, File: file, Err: err} - } - - return blkioStats, nil -} - -func (s *BlkioGroup) GetStats(path string, stats *cgroups.Stats) error { - type blkioStatInfo struct { - filename string - blkioStatEntriesPtr *[]cgroups.BlkioStatEntry - } - bfqDebugStats := []blkioStatInfo{ - { - filename: "blkio.bfq.sectors_recursive", - blkioStatEntriesPtr: &stats.BlkioStats.SectorsRecursive, - }, - { - filename: "blkio.bfq.io_service_time_recursive", - blkioStatEntriesPtr: &stats.BlkioStats.IoServiceTimeRecursive, - }, - { - filename: "blkio.bfq.io_wait_time_recursive", - blkioStatEntriesPtr: &stats.BlkioStats.IoWaitTimeRecursive, - }, - { - filename: "blkio.bfq.io_merged_recursive", - blkioStatEntriesPtr: &stats.BlkioStats.IoMergedRecursive, - }, - { - filename: "blkio.bfq.io_queued_recursive", - blkioStatEntriesPtr: &stats.BlkioStats.IoQueuedRecursive, - }, - { - filename: "blkio.bfq.time_recursive", - blkioStatEntriesPtr: &stats.BlkioStats.IoTimeRecursive, - }, - { - filename: "blkio.bfq.io_serviced_recursive", - blkioStatEntriesPtr: &stats.BlkioStats.IoServicedRecursive, - }, - { - filename: "blkio.bfq.io_service_bytes_recursive", - blkioStatEntriesPtr: &stats.BlkioStats.IoServiceBytesRecursive, - }, - } - bfqStats := []blkioStatInfo{ - { - filename: "blkio.bfq.io_serviced_recursive", - blkioStatEntriesPtr: &stats.BlkioStats.IoServicedRecursive, - }, - { - filename: "blkio.bfq.io_service_bytes_recursive", - blkioStatEntriesPtr: &stats.BlkioStats.IoServiceBytesRecursive, - }, - } - cfqStats := []blkioStatInfo{ - { - filename: "blkio.sectors_recursive", - blkioStatEntriesPtr: &stats.BlkioStats.SectorsRecursive, - }, - { - filename: "blkio.io_service_time_recursive", - blkioStatEntriesPtr: &stats.BlkioStats.IoServiceTimeRecursive, - }, - { - filename: "blkio.io_wait_time_recursive", - blkioStatEntriesPtr: &stats.BlkioStats.IoWaitTimeRecursive, - }, - { - filename: "blkio.io_merged_recursive", - blkioStatEntriesPtr: &stats.BlkioStats.IoMergedRecursive, - }, - { - filename: "blkio.io_queued_recursive", - blkioStatEntriesPtr: &stats.BlkioStats.IoQueuedRecursive, - }, - { - filename: "blkio.time_recursive", - blkioStatEntriesPtr: &stats.BlkioStats.IoTimeRecursive, - }, - { - filename: "blkio.io_serviced_recursive", - blkioStatEntriesPtr: &stats.BlkioStats.IoServicedRecursive, - }, - { - filename: "blkio.io_service_bytes_recursive", - blkioStatEntriesPtr: &stats.BlkioStats.IoServiceBytesRecursive, - }, - } - throttleRecursiveStats := []blkioStatInfo{ - { - filename: "blkio.throttle.io_serviced_recursive", - blkioStatEntriesPtr: &stats.BlkioStats.IoServicedRecursive, - }, - { - filename: "blkio.throttle.io_service_bytes_recursive", - blkioStatEntriesPtr: &stats.BlkioStats.IoServiceBytesRecursive, - }, - } - baseStats := []blkioStatInfo{ - { - filename: "blkio.throttle.io_serviced", - blkioStatEntriesPtr: &stats.BlkioStats.IoServicedRecursive, - }, - { - filename: "blkio.throttle.io_service_bytes", - blkioStatEntriesPtr: &stats.BlkioStats.IoServiceBytesRecursive, - }, - } - orderedStats := [][]blkioStatInfo{ - bfqDebugStats, - bfqStats, - cfqStats, - throttleRecursiveStats, - baseStats, - } - - var blkioStats []cgroups.BlkioStatEntry - var err error - - for _, statGroup := range orderedStats { - for i, statInfo := range statGroup { - if blkioStats, err = getBlkioStat(path, statInfo.filename); err != nil || blkioStats == nil { - // if error occurs on first file, move to next group - if i == 0 { - break - } - return err - } - *statInfo.blkioStatEntriesPtr = blkioStats - // finish if all stats are gathered - if i == len(statGroup)-1 { - return nil - } - } - } - return nil -} - -func (s *BlkioGroup) detectWeightFilenames(path string) { - if s.weightFilename != "" { - // Already detected. - return - } - if cgroups.PathExists(filepath.Join(path, "blkio.weight")) { - s.weightFilename = "blkio.weight" - s.weightDeviceFilename = "blkio.weight_device" - } else { - s.weightFilename = "blkio.bfq.weight" - s.weightDeviceFilename = "blkio.bfq.weight_device" - } -} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpu.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpu.go deleted file mode 100644 index 6c79f899b..000000000 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpu.go +++ /dev/null @@ -1,129 +0,0 @@ -package fs - -import ( - "bufio" - "errors" - "fmt" - "os" - "strconv" - - "github.com/opencontainers/runc/libcontainer/cgroups" - "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" - "github.com/opencontainers/runc/libcontainer/configs" - "golang.org/x/sys/unix" -) - -type CpuGroup struct{} - -func (s *CpuGroup) Name() string { - return "cpu" -} - -func (s *CpuGroup) Apply(path string, r *configs.Resources, pid int) error { - if err := os.MkdirAll(path, 0o755); err != nil { - return err - } - // We should set the real-Time group scheduling settings before moving - // in the process because if the process is already in SCHED_RR mode - // and no RT bandwidth is set, adding it will fail. - if err := s.SetRtSched(path, r); err != nil { - return err - } - // Since we are not using apply(), we need to place the pid - // into the procs file. - return cgroups.WriteCgroupProc(path, pid) -} - -func (s *CpuGroup) SetRtSched(path string, r *configs.Resources) error { - if r.CpuRtPeriod != 0 { - if err := cgroups.WriteFile(path, "cpu.rt_period_us", strconv.FormatUint(r.CpuRtPeriod, 10)); err != nil { - return err - } - } - if r.CpuRtRuntime != 0 { - if err := cgroups.WriteFile(path, "cpu.rt_runtime_us", strconv.FormatInt(r.CpuRtRuntime, 10)); err != nil { - return err - } - } - return nil -} - -func (s *CpuGroup) Set(path string, r *configs.Resources) error { - if r.CpuShares != 0 { - shares := r.CpuShares - if err := cgroups.WriteFile(path, "cpu.shares", strconv.FormatUint(shares, 10)); err != nil { - return err - } - // read it back - sharesRead, err := fscommon.GetCgroupParamUint(path, "cpu.shares") - if err != nil { - return err - } - // ... and check - if shares > sharesRead { - return fmt.Errorf("the maximum allowed cpu-shares is %d", sharesRead) - } else if shares < sharesRead { - return fmt.Errorf("the minimum allowed cpu-shares is %d", sharesRead) - } - } - - var period string - if r.CpuPeriod != 0 { - period = strconv.FormatUint(r.CpuPeriod, 10) - if err := cgroups.WriteFile(path, "cpu.cfs_period_us", period); err != nil { - // Sometimes when the period to be set is smaller - // than the current one, it is rejected by the kernel - // (EINVAL) as old_quota/new_period exceeds the parent - // cgroup quota limit. If this happens and the quota is - // going to be set, ignore the error for now and retry - // after setting the quota. - if !errors.Is(err, unix.EINVAL) || r.CpuQuota == 0 { - return err - } - } else { - period = "" - } - } - if r.CpuQuota != 0 { - if err := cgroups.WriteFile(path, "cpu.cfs_quota_us", strconv.FormatInt(r.CpuQuota, 10)); err != nil { - return err - } - if period != "" { - if err := cgroups.WriteFile(path, "cpu.cfs_period_us", period); err != nil { - return err - } - } - } - return s.SetRtSched(path, r) -} - -func (s *CpuGroup) GetStats(path string, stats *cgroups.Stats) error { - const file = "cpu.stat" - f, err := cgroups.OpenFile(path, file, os.O_RDONLY) - if err != nil { - if os.IsNotExist(err) { - return nil - } - return err - } - defer f.Close() - - sc := bufio.NewScanner(f) - for sc.Scan() { - t, v, err := fscommon.ParseKeyValue(sc.Text()) - if err != nil { - return &parseError{Path: path, File: file, Err: err} - } - switch t { - case "nr_periods": - stats.CpuStats.ThrottlingData.Periods = v - - case "nr_throttled": - stats.CpuStats.ThrottlingData.ThrottledPeriods = v - - case "throttled_time": - stats.CpuStats.ThrottlingData.ThrottledTime = v - } - } - return nil -} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpuacct.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpuacct.go deleted file mode 100644 index d3bd7e111..000000000 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpuacct.go +++ /dev/null @@ -1,166 +0,0 @@ -package fs - -import ( - "bufio" - "os" - "strconv" - "strings" - - "github.com/opencontainers/runc/libcontainer/cgroups" - "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" - "github.com/opencontainers/runc/libcontainer/configs" -) - -const ( - cgroupCpuacctStat = "cpuacct.stat" - cgroupCpuacctUsageAll = "cpuacct.usage_all" - - nanosecondsInSecond = 1000000000 - - userModeColumn = 1 - kernelModeColumn = 2 - cuacctUsageAllColumnsNumber = 3 - - // The value comes from `C.sysconf(C._SC_CLK_TCK)`, and - // on Linux it's a constant which is safe to be hard coded, - // so we can avoid using cgo here. For details, see: - // https://github.com/containerd/cgroups/pull/12 - clockTicks uint64 = 100 -) - -type CpuacctGroup struct{} - -func (s *CpuacctGroup) Name() string { - return "cpuacct" -} - -func (s *CpuacctGroup) Apply(path string, _ *configs.Resources, pid int) error { - return apply(path, pid) -} - -func (s *CpuacctGroup) Set(_ string, _ *configs.Resources) error { - return nil -} - -func (s *CpuacctGroup) GetStats(path string, stats *cgroups.Stats) error { - if !cgroups.PathExists(path) { - return nil - } - userModeUsage, kernelModeUsage, err := getCpuUsageBreakdown(path) - if err != nil { - return err - } - - totalUsage, err := fscommon.GetCgroupParamUint(path, "cpuacct.usage") - if err != nil { - return err - } - - percpuUsage, err := getPercpuUsage(path) - if err != nil { - return err - } - - percpuUsageInKernelmode, percpuUsageInUsermode, err := getPercpuUsageInModes(path) - if err != nil { - return err - } - - stats.CpuStats.CpuUsage.TotalUsage = totalUsage - stats.CpuStats.CpuUsage.PercpuUsage = percpuUsage - stats.CpuStats.CpuUsage.PercpuUsageInKernelmode = percpuUsageInKernelmode - stats.CpuStats.CpuUsage.PercpuUsageInUsermode = percpuUsageInUsermode - stats.CpuStats.CpuUsage.UsageInUsermode = userModeUsage - stats.CpuStats.CpuUsage.UsageInKernelmode = kernelModeUsage - return nil -} - -// Returns user and kernel usage breakdown in nanoseconds. -func getCpuUsageBreakdown(path string) (uint64, uint64, error) { - var userModeUsage, kernelModeUsage uint64 - const ( - userField = "user" - systemField = "system" - file = cgroupCpuacctStat - ) - - // Expected format: - // user - // system - data, err := cgroups.ReadFile(path, file) - if err != nil { - return 0, 0, err - } - // TODO: use strings.SplitN instead. - fields := strings.Fields(data) - if len(fields) < 4 || fields[0] != userField || fields[2] != systemField { - return 0, 0, malformedLine(path, file, data) - } - if userModeUsage, err = strconv.ParseUint(fields[1], 10, 64); err != nil { - return 0, 0, &parseError{Path: path, File: file, Err: err} - } - if kernelModeUsage, err = strconv.ParseUint(fields[3], 10, 64); err != nil { - return 0, 0, &parseError{Path: path, File: file, Err: err} - } - - return (userModeUsage * nanosecondsInSecond) / clockTicks, (kernelModeUsage * nanosecondsInSecond) / clockTicks, nil -} - -func getPercpuUsage(path string) ([]uint64, error) { - const file = "cpuacct.usage_percpu" - percpuUsage := []uint64{} - data, err := cgroups.ReadFile(path, file) - if err != nil { - return percpuUsage, err - } - // TODO: use strings.SplitN instead. - for _, value := range strings.Fields(data) { - value, err := strconv.ParseUint(value, 10, 64) - if err != nil { - return percpuUsage, &parseError{Path: path, File: file, Err: err} - } - percpuUsage = append(percpuUsage, value) - } - return percpuUsage, nil -} - -func getPercpuUsageInModes(path string) ([]uint64, []uint64, error) { - usageKernelMode := []uint64{} - usageUserMode := []uint64{} - const file = cgroupCpuacctUsageAll - - fd, err := cgroups.OpenFile(path, file, os.O_RDONLY) - if os.IsNotExist(err) { - return usageKernelMode, usageUserMode, nil - } else if err != nil { - return nil, nil, err - } - defer fd.Close() - - scanner := bufio.NewScanner(fd) - scanner.Scan() // skipping header line - - for scanner.Scan() { - lineFields := strings.SplitN(scanner.Text(), " ", cuacctUsageAllColumnsNumber+1) - if len(lineFields) != cuacctUsageAllColumnsNumber { - continue - } - - usageInKernelMode, err := strconv.ParseUint(lineFields[kernelModeColumn], 10, 64) - if err != nil { - return nil, nil, &parseError{Path: path, File: file, Err: err} - } - usageKernelMode = append(usageKernelMode, usageInKernelMode) - - usageInUserMode, err := strconv.ParseUint(lineFields[userModeColumn], 10, 64) - if err != nil { - return nil, nil, &parseError{Path: path, File: file, Err: err} - } - usageUserMode = append(usageUserMode, usageInUserMode) - } - if err := scanner.Err(); err != nil { - return nil, nil, &parseError{Path: path, File: file, Err: err} - } - - return usageKernelMode, usageUserMode, nil -} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpuset.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpuset.go deleted file mode 100644 index 550baa427..000000000 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpuset.go +++ /dev/null @@ -1,245 +0,0 @@ -package fs - -import ( - "errors" - "os" - "path/filepath" - "strconv" - "strings" - - "golang.org/x/sys/unix" - - "github.com/opencontainers/runc/libcontainer/cgroups" - "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" - "github.com/opencontainers/runc/libcontainer/configs" -) - -type CpusetGroup struct{} - -func (s *CpusetGroup) Name() string { - return "cpuset" -} - -func (s *CpusetGroup) Apply(path string, r *configs.Resources, pid int) error { - return s.ApplyDir(path, r, pid) -} - -func (s *CpusetGroup) Set(path string, r *configs.Resources) error { - if r.CpusetCpus != "" { - if err := cgroups.WriteFile(path, "cpuset.cpus", r.CpusetCpus); err != nil { - return err - } - } - if r.CpusetMems != "" { - if err := cgroups.WriteFile(path, "cpuset.mems", r.CpusetMems); err != nil { - return err - } - } - return nil -} - -func getCpusetStat(path string, file string) ([]uint16, error) { - var extracted []uint16 - fileContent, err := fscommon.GetCgroupParamString(path, file) - if err != nil { - return extracted, err - } - if len(fileContent) == 0 { - return extracted, &parseError{Path: path, File: file, Err: errors.New("empty file")} - } - - for _, s := range strings.Split(fileContent, ",") { - sp := strings.SplitN(s, "-", 3) - switch len(sp) { - case 3: - return extracted, &parseError{Path: path, File: file, Err: errors.New("extra dash")} - case 2: - min, err := strconv.ParseUint(sp[0], 10, 16) - if err != nil { - return extracted, &parseError{Path: path, File: file, Err: err} - } - max, err := strconv.ParseUint(sp[1], 10, 16) - if err != nil { - return extracted, &parseError{Path: path, File: file, Err: err} - } - if min > max { - return extracted, &parseError{Path: path, File: file, Err: errors.New("invalid values, min > max")} - } - for i := min; i <= max; i++ { - extracted = append(extracted, uint16(i)) - } - case 1: - value, err := strconv.ParseUint(s, 10, 16) - if err != nil { - return extracted, &parseError{Path: path, File: file, Err: err} - } - extracted = append(extracted, uint16(value)) - } - } - - return extracted, nil -} - -func (s *CpusetGroup) GetStats(path string, stats *cgroups.Stats) error { - var err error - - stats.CPUSetStats.CPUs, err = getCpusetStat(path, "cpuset.cpus") - if err != nil && !errors.Is(err, os.ErrNotExist) { - return err - } - - stats.CPUSetStats.CPUExclusive, err = fscommon.GetCgroupParamUint(path, "cpuset.cpu_exclusive") - if err != nil && !errors.Is(err, os.ErrNotExist) { - return err - } - - stats.CPUSetStats.Mems, err = getCpusetStat(path, "cpuset.mems") - if err != nil && !errors.Is(err, os.ErrNotExist) { - return err - } - - stats.CPUSetStats.MemHardwall, err = fscommon.GetCgroupParamUint(path, "cpuset.mem_hardwall") - if err != nil && !errors.Is(err, os.ErrNotExist) { - return err - } - - stats.CPUSetStats.MemExclusive, err = fscommon.GetCgroupParamUint(path, "cpuset.mem_exclusive") - if err != nil && !errors.Is(err, os.ErrNotExist) { - return err - } - - stats.CPUSetStats.MemoryMigrate, err = fscommon.GetCgroupParamUint(path, "cpuset.memory_migrate") - if err != nil && !errors.Is(err, os.ErrNotExist) { - return err - } - - stats.CPUSetStats.MemorySpreadPage, err = fscommon.GetCgroupParamUint(path, "cpuset.memory_spread_page") - if err != nil && !errors.Is(err, os.ErrNotExist) { - return err - } - - stats.CPUSetStats.MemorySpreadSlab, err = fscommon.GetCgroupParamUint(path, "cpuset.memory_spread_slab") - if err != nil && !errors.Is(err, os.ErrNotExist) { - return err - } - - stats.CPUSetStats.MemoryPressure, err = fscommon.GetCgroupParamUint(path, "cpuset.memory_pressure") - if err != nil && !errors.Is(err, os.ErrNotExist) { - return err - } - - stats.CPUSetStats.SchedLoadBalance, err = fscommon.GetCgroupParamUint(path, "cpuset.sched_load_balance") - if err != nil && !errors.Is(err, os.ErrNotExist) { - return err - } - - stats.CPUSetStats.SchedRelaxDomainLevel, err = fscommon.GetCgroupParamInt(path, "cpuset.sched_relax_domain_level") - if err != nil && !errors.Is(err, os.ErrNotExist) { - return err - } - - return nil -} - -func (s *CpusetGroup) ApplyDir(dir string, r *configs.Resources, pid int) error { - // This might happen if we have no cpuset cgroup mounted. - // Just do nothing and don't fail. - if dir == "" { - return nil - } - // 'ensureParent' start with parent because we don't want to - // explicitly inherit from parent, it could conflict with - // 'cpuset.cpu_exclusive'. - if err := cpusetEnsureParent(filepath.Dir(dir)); err != nil { - return err - } - if err := os.Mkdir(dir, 0o755); err != nil && !os.IsExist(err) { - return err - } - // We didn't inherit cpuset configs from parent, but we have - // to ensure cpuset configs are set before moving task into the - // cgroup. - // The logic is, if user specified cpuset configs, use these - // specified configs, otherwise, inherit from parent. This makes - // cpuset configs work correctly with 'cpuset.cpu_exclusive', and - // keep backward compatibility. - if err := s.ensureCpusAndMems(dir, r); err != nil { - return err - } - // Since we are not using apply(), we need to place the pid - // into the procs file. - return cgroups.WriteCgroupProc(dir, pid) -} - -func getCpusetSubsystemSettings(parent string) (cpus, mems string, err error) { - if cpus, err = cgroups.ReadFile(parent, "cpuset.cpus"); err != nil { - return - } - if mems, err = cgroups.ReadFile(parent, "cpuset.mems"); err != nil { - return - } - return cpus, mems, nil -} - -// cpusetEnsureParent makes sure that the parent directories of current -// are created and populated with the proper cpus and mems files copied -// from their respective parent. It does that recursively, starting from -// the top of the cpuset hierarchy (i.e. cpuset cgroup mount point). -func cpusetEnsureParent(current string) error { - var st unix.Statfs_t - - parent := filepath.Dir(current) - err := unix.Statfs(parent, &st) - if err == nil && st.Type != unix.CGROUP_SUPER_MAGIC { - return nil - } - // Treat non-existing directory as cgroupfs as it will be created, - // and the root cpuset directory obviously exists. - if err != nil && err != unix.ENOENT { //nolint:errorlint // unix errors are bare - return &os.PathError{Op: "statfs", Path: parent, Err: err} - } - - if err := cpusetEnsureParent(parent); err != nil { - return err - } - if err := os.Mkdir(current, 0o755); err != nil && !os.IsExist(err) { - return err - } - return cpusetCopyIfNeeded(current, parent) -} - -// cpusetCopyIfNeeded copies the cpuset.cpus and cpuset.mems from the parent -// directory to the current directory if the file's contents are 0 -func cpusetCopyIfNeeded(current, parent string) error { - currentCpus, currentMems, err := getCpusetSubsystemSettings(current) - if err != nil { - return err - } - parentCpus, parentMems, err := getCpusetSubsystemSettings(parent) - if err != nil { - return err - } - - if isEmptyCpuset(currentCpus) { - if err := cgroups.WriteFile(current, "cpuset.cpus", parentCpus); err != nil { - return err - } - } - if isEmptyCpuset(currentMems) { - if err := cgroups.WriteFile(current, "cpuset.mems", parentMems); err != nil { - return err - } - } - return nil -} - -func isEmptyCpuset(str string) bool { - return str == "" || str == "\n" -} - -func (s *CpusetGroup) ensureCpusAndMems(path string, r *configs.Resources) error { - if err := s.Set(path, r); err != nil { - return err - } - return cpusetCopyIfNeeded(path, filepath.Dir(path)) -} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/devices.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/devices.go deleted file mode 100644 index 4527a70eb..000000000 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/devices.go +++ /dev/null @@ -1,109 +0,0 @@ -package fs - -import ( - "bytes" - "errors" - "reflect" - - "github.com/opencontainers/runc/libcontainer/cgroups" - cgroupdevices "github.com/opencontainers/runc/libcontainer/cgroups/devices" - "github.com/opencontainers/runc/libcontainer/configs" - "github.com/opencontainers/runc/libcontainer/devices" - "github.com/opencontainers/runc/libcontainer/userns" -) - -type DevicesGroup struct { - TestingSkipFinalCheck bool -} - -func (s *DevicesGroup) Name() string { - return "devices" -} - -func (s *DevicesGroup) Apply(path string, r *configs.Resources, pid int) error { - if r.SkipDevices { - return nil - } - if path == "" { - // Return error here, since devices cgroup - // is a hard requirement for container's security. - return errSubsystemDoesNotExist - } - - return apply(path, pid) -} - -func loadEmulator(path string) (*cgroupdevices.Emulator, error) { - list, err := cgroups.ReadFile(path, "devices.list") - if err != nil { - return nil, err - } - return cgroupdevices.EmulatorFromList(bytes.NewBufferString(list)) -} - -func buildEmulator(rules []*devices.Rule) (*cgroupdevices.Emulator, error) { - // This defaults to a white-list -- which is what we want! - emu := &cgroupdevices.Emulator{} - for _, rule := range rules { - if err := emu.Apply(*rule); err != nil { - return nil, err - } - } - return emu, nil -} - -func (s *DevicesGroup) Set(path string, r *configs.Resources) error { - if userns.RunningInUserNS() || r.SkipDevices { - return nil - } - - // Generate two emulators, one for the current state of the cgroup and one - // for the requested state by the user. - current, err := loadEmulator(path) - if err != nil { - return err - } - target, err := buildEmulator(r.Devices) - if err != nil { - return err - } - - // Compute the minimal set of transition rules needed to achieve the - // requested state. - transitionRules, err := current.Transition(target) - if err != nil { - return err - } - for _, rule := range transitionRules { - file := "devices.deny" - if rule.Allow { - file = "devices.allow" - } - if err := cgroups.WriteFile(path, file, rule.CgroupString()); err != nil { - return err - } - } - - // Final safety check -- ensure that the resulting state is what was - // requested. This is only really correct for white-lists, but for - // black-lists we can at least check that the cgroup is in the right mode. - // - // This safety-check is skipped for the unit tests because we cannot - // currently mock devices.list correctly. - if !s.TestingSkipFinalCheck { - currentAfter, err := loadEmulator(path) - if err != nil { - return err - } - if !target.IsBlacklist() && !reflect.DeepEqual(currentAfter, target) { - return errors.New("resulting devices cgroup doesn't precisely match target") - } else if target.IsBlacklist() != currentAfter.IsBlacklist() { - return errors.New("resulting devices cgroup doesn't match target mode") - } - } - return nil -} - -func (s *DevicesGroup) GetStats(path string, stats *cgroups.Stats) error { - return nil -} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/error.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/error.go deleted file mode 100644 index f2ab6f130..000000000 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/error.go +++ /dev/null @@ -1,15 +0,0 @@ -package fs - -import ( - "fmt" - - "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" -) - -type parseError = fscommon.ParseError - -// malformedLine is used by all cgroupfs file parsers that expect a line -// in a particular format but get some garbage instead. -func malformedLine(path, file, line string) error { - return &parseError{Path: path, File: file, Err: fmt.Errorf("malformed line: %s", line)} -} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/freezer.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/freezer.go deleted file mode 100644 index 987f1bf5e..000000000 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/freezer.go +++ /dev/null @@ -1,158 +0,0 @@ -package fs - -import ( - "errors" - "fmt" - "os" - "strings" - "time" - - "github.com/opencontainers/runc/libcontainer/cgroups" - "github.com/opencontainers/runc/libcontainer/configs" - "github.com/sirupsen/logrus" - "golang.org/x/sys/unix" -) - -type FreezerGroup struct{} - -func (s *FreezerGroup) Name() string { - return "freezer" -} - -func (s *FreezerGroup) Apply(path string, _ *configs.Resources, pid int) error { - return apply(path, pid) -} - -func (s *FreezerGroup) Set(path string, r *configs.Resources) (Err error) { - switch r.Freezer { - case configs.Frozen: - defer func() { - if Err != nil { - // Freezing failed, and it is bad and dangerous - // to leave the cgroup in FROZEN or FREEZING - // state, so (try to) thaw it back. - _ = cgroups.WriteFile(path, "freezer.state", string(configs.Thawed)) - } - }() - - // As per older kernel docs (freezer-subsystem.txt before - // kernel commit ef9fe980c6fcc1821), if FREEZING is seen, - // userspace should either retry or thaw. While current - // kernel cgroup v1 docs no longer mention a need to retry, - // even a recent kernel (v5.4, Ubuntu 20.04) can't reliably - // freeze a cgroup v1 while new processes keep appearing in it - // (either via fork/clone or by writing new PIDs to - // cgroup.procs). - // - // The numbers below are empirically chosen to have a decent - // chance to succeed in various scenarios ("runc pause/unpause - // with parallel runc exec" and "bare freeze/unfreeze on a very - // slow system"), tested on RHEL7 and Ubuntu 20.04 kernels. - // - // Adding any amount of sleep in between retries did not - // increase the chances of successful freeze in "pause/unpause - // with parallel exec" reproducer. OTOH, adding an occasional - // sleep helped for the case where the system is extremely slow - // (CentOS 7 VM on GHA CI). - // - // Alas, this is still a game of chances, since the real fix - // belong to the kernel (cgroup v2 do not have this bug). - - for i := 0; i < 1000; i++ { - if i%50 == 49 { - // Occasional thaw and sleep improves - // the chances to succeed in freezing - // in case new processes keep appearing - // in the cgroup. - _ = cgroups.WriteFile(path, "freezer.state", string(configs.Thawed)) - time.Sleep(10 * time.Millisecond) - } - - if err := cgroups.WriteFile(path, "freezer.state", string(configs.Frozen)); err != nil { - return err - } - - if i%25 == 24 { - // Occasional short sleep before reading - // the state back also improves the chances to - // succeed in freezing in case of a very slow - // system. - time.Sleep(10 * time.Microsecond) - } - state, err := cgroups.ReadFile(path, "freezer.state") - if err != nil { - return err - } - state = strings.TrimSpace(state) - switch state { - case "FREEZING": - continue - case string(configs.Frozen): - if i > 1 { - logrus.Debugf("frozen after %d retries", i) - } - return nil - default: - // should never happen - return fmt.Errorf("unexpected state %s while freezing", strings.TrimSpace(state)) - } - } - // Despite our best efforts, it got stuck in FREEZING. - return errors.New("unable to freeze") - case configs.Thawed: - return cgroups.WriteFile(path, "freezer.state", string(configs.Thawed)) - case configs.Undefined: - return nil - default: - return fmt.Errorf("Invalid argument '%s' to freezer.state", string(r.Freezer)) - } -} - -func (s *FreezerGroup) GetStats(path string, stats *cgroups.Stats) error { - return nil -} - -func (s *FreezerGroup) GetState(path string) (configs.FreezerState, error) { - for { - state, err := cgroups.ReadFile(path, "freezer.state") - if err != nil { - // If the kernel is too old, then we just treat the freezer as - // being in an "undefined" state. - if os.IsNotExist(err) || errors.Is(err, unix.ENODEV) { - err = nil - } - return configs.Undefined, err - } - switch strings.TrimSpace(state) { - case "THAWED": - return configs.Thawed, nil - case "FROZEN": - // Find out whether the cgroup is frozen directly, - // or indirectly via an ancestor. - self, err := cgroups.ReadFile(path, "freezer.self_freezing") - if err != nil { - // If the kernel is too old, then we just treat - // it as being frozen. - if errors.Is(err, os.ErrNotExist) || errors.Is(err, unix.ENODEV) { - err = nil - } - return configs.Frozen, err - } - switch self { - case "0\n": - return configs.Thawed, nil - case "1\n": - return configs.Frozen, nil - default: - return configs.Undefined, fmt.Errorf(`unknown "freezer.self_freezing" state: %q`, self) - } - case "FREEZING": - // Make sure we get a stable freezer state, so retry if the cgroup - // is still undergoing freezing. This should be a temporary delay. - time.Sleep(1 * time.Millisecond) - continue - default: - return configs.Undefined, fmt.Errorf("unknown freezer.state %q", state) - } - } -} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/fs.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/fs.go deleted file mode 100644 index 9e2f0ec04..000000000 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/fs.go +++ /dev/null @@ -1,265 +0,0 @@ -package fs - -import ( - "errors" - "fmt" - "os" - "sync" - - "golang.org/x/sys/unix" - - "github.com/opencontainers/runc/libcontainer/cgroups" - "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" - "github.com/opencontainers/runc/libcontainer/configs" -) - -var subsystems = []subsystem{ - &CpusetGroup{}, - &DevicesGroup{}, - &MemoryGroup{}, - &CpuGroup{}, - &CpuacctGroup{}, - &PidsGroup{}, - &BlkioGroup{}, - &HugetlbGroup{}, - &NetClsGroup{}, - &NetPrioGroup{}, - &PerfEventGroup{}, - &FreezerGroup{}, - &RdmaGroup{}, - &NameGroup{GroupName: "name=systemd", Join: true}, - &NameGroup{GroupName: "misc", Join: true}, -} - -var errSubsystemDoesNotExist = errors.New("cgroup: subsystem does not exist") - -func init() { - // If using cgroups-hybrid mode then add a "" controller indicating - // it should join the cgroups v2. - if cgroups.IsCgroup2HybridMode() { - subsystems = append(subsystems, &NameGroup{GroupName: "", Join: true}) - } -} - -type subsystem interface { - // Name returns the name of the subsystem. - Name() string - // GetStats fills in the stats for the subsystem. - GetStats(path string, stats *cgroups.Stats) error - // Apply creates and joins a cgroup, adding pid into it. Some - // subsystems use resources to pre-configure the cgroup parents - // before creating or joining it. - Apply(path string, r *configs.Resources, pid int) error - // Set sets the cgroup resources. - Set(path string, r *configs.Resources) error -} - -type manager struct { - mu sync.Mutex - cgroups *configs.Cgroup - paths map[string]string -} - -func NewManager(cg *configs.Cgroup, paths map[string]string) (cgroups.Manager, error) { - // Some v1 controllers (cpu, cpuset, and devices) expect - // cgroups.Resources to not be nil in Apply. - if cg.Resources == nil { - return nil, errors.New("cgroup v1 manager needs configs.Resources to be set during manager creation") - } - if cg.Resources.Unified != nil { - return nil, cgroups.ErrV1NoUnified - } - - if paths == nil { - var err error - paths, err = initPaths(cg) - if err != nil { - return nil, err - } - } - - return &manager{ - cgroups: cg, - paths: paths, - }, nil -} - -// isIgnorableError returns whether err is a permission error (in the loose -// sense of the word). This includes EROFS (which for an unprivileged user is -// basically a permission error) and EACCES (for similar reasons) as well as -// the normal EPERM. -func isIgnorableError(rootless bool, err error) bool { - // We do not ignore errors if we are root. - if !rootless { - return false - } - // Is it an ordinary EPERM? - if errors.Is(err, os.ErrPermission) { - return true - } - // Handle some specific syscall errors. - var errno unix.Errno - if errors.As(err, &errno) { - return errno == unix.EROFS || errno == unix.EPERM || errno == unix.EACCES - } - return false -} - -func (m *manager) Apply(pid int) (err error) { - m.mu.Lock() - defer m.mu.Unlock() - - c := m.cgroups - - for _, sys := range subsystems { - name := sys.Name() - p, ok := m.paths[name] - if !ok { - continue - } - - if err := sys.Apply(p, c.Resources, pid); err != nil { - // In the case of rootless (including euid=0 in userns), where an - // explicit cgroup path hasn't been set, we don't bail on error in - // case of permission problems here, but do delete the path from - // the m.paths map, since it is either non-existent and could not - // be created, or the pid could not be added to it. - // - // Cases where limits for the subsystem have been set are handled - // later by Set, which fails with a friendly error (see - // if path == "" in Set). - if isIgnorableError(c.Rootless, err) && c.Path == "" { - delete(m.paths, name) - continue - } - return err - } - - } - return nil -} - -func (m *manager) Destroy() error { - m.mu.Lock() - defer m.mu.Unlock() - return cgroups.RemovePaths(m.paths) -} - -func (m *manager) Path(subsys string) string { - m.mu.Lock() - defer m.mu.Unlock() - return m.paths[subsys] -} - -func (m *manager) GetStats() (*cgroups.Stats, error) { - m.mu.Lock() - defer m.mu.Unlock() - stats := cgroups.NewStats() - for _, sys := range subsystems { - path := m.paths[sys.Name()] - if path == "" { - continue - } - if err := sys.GetStats(path, stats); err != nil { - return nil, err - } - } - return stats, nil -} - -func (m *manager) Set(r *configs.Resources) error { - if r == nil { - return nil - } - - if r.Unified != nil { - return cgroups.ErrV1NoUnified - } - - m.mu.Lock() - defer m.mu.Unlock() - for _, sys := range subsystems { - path := m.paths[sys.Name()] - if err := sys.Set(path, r); err != nil { - // When rootless is true, errors from the device subsystem - // are ignored, as it is really not expected to work. - if m.cgroups.Rootless && sys.Name() == "devices" { - continue - } - // However, errors from other subsystems are not ignored. - // see @test "runc create (rootless + limits + no cgrouppath + no permission) fails with informative error" - if path == "" { - // We never created a path for this cgroup, so we cannot set - // limits for it (though we have already tried at this point). - return fmt.Errorf("cannot set %s limit: container could not join or create cgroup", sys.Name()) - } - return err - } - } - - return nil -} - -// Freeze toggles the container's freezer cgroup depending on the state -// provided -func (m *manager) Freeze(state configs.FreezerState) error { - path := m.Path("freezer") - if path == "" { - return errors.New("cannot toggle freezer: cgroups not configured for container") - } - - prevState := m.cgroups.Resources.Freezer - m.cgroups.Resources.Freezer = state - freezer := &FreezerGroup{} - if err := freezer.Set(path, m.cgroups.Resources); err != nil { - m.cgroups.Resources.Freezer = prevState - return err - } - return nil -} - -func (m *manager) GetPids() ([]int, error) { - return cgroups.GetPids(m.Path("devices")) -} - -func (m *manager) GetAllPids() ([]int, error) { - return cgroups.GetAllPids(m.Path("devices")) -} - -func (m *manager) GetPaths() map[string]string { - m.mu.Lock() - defer m.mu.Unlock() - return m.paths -} - -func (m *manager) GetCgroups() (*configs.Cgroup, error) { - return m.cgroups, nil -} - -func (m *manager) GetFreezerState() (configs.FreezerState, error) { - dir := m.Path("freezer") - // If the container doesn't have the freezer cgroup, say it's undefined. - if dir == "" { - return configs.Undefined, nil - } - freezer := &FreezerGroup{} - return freezer.GetState(dir) -} - -func (m *manager) Exists() bool { - return cgroups.PathExists(m.Path("devices")) -} - -func OOMKillCount(path string) (uint64, error) { - return fscommon.GetValueByKey(path, "memory.oom_control", "oom_kill") -} - -func (m *manager) OOMKillCount() (uint64, error) { - c, err := OOMKillCount(m.Path("memory")) - // Ignore ENOENT when rootless as it couldn't create cgroup. - if err != nil && m.cgroups.Rootless && os.IsNotExist(err) { - err = nil - } - - return c, err -} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/hugetlb.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/hugetlb.go deleted file mode 100644 index 8ddd6fdd8..000000000 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/hugetlb.go +++ /dev/null @@ -1,62 +0,0 @@ -package fs - -import ( - "strconv" - - "github.com/opencontainers/runc/libcontainer/cgroups" - "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" - "github.com/opencontainers/runc/libcontainer/configs" -) - -type HugetlbGroup struct{} - -func (s *HugetlbGroup) Name() string { - return "hugetlb" -} - -func (s *HugetlbGroup) Apply(path string, _ *configs.Resources, pid int) error { - return apply(path, pid) -} - -func (s *HugetlbGroup) Set(path string, r *configs.Resources) error { - for _, hugetlb := range r.HugetlbLimit { - if err := cgroups.WriteFile(path, "hugetlb."+hugetlb.Pagesize+".limit_in_bytes", strconv.FormatUint(hugetlb.Limit, 10)); err != nil { - return err - } - } - - return nil -} - -func (s *HugetlbGroup) GetStats(path string, stats *cgroups.Stats) error { - if !cgroups.PathExists(path) { - return nil - } - hugetlbStats := cgroups.HugetlbStats{} - for _, pageSize := range cgroups.HugePageSizes() { - usage := "hugetlb." + pageSize + ".usage_in_bytes" - value, err := fscommon.GetCgroupParamUint(path, usage) - if err != nil { - return err - } - hugetlbStats.Usage = value - - maxUsage := "hugetlb." + pageSize + ".max_usage_in_bytes" - value, err = fscommon.GetCgroupParamUint(path, maxUsage) - if err != nil { - return err - } - hugetlbStats.MaxUsage = value - - failcnt := "hugetlb." + pageSize + ".failcnt" - value, err = fscommon.GetCgroupParamUint(path, failcnt) - if err != nil { - return err - } - hugetlbStats.Failcnt = value - - stats.HugetlbStats[pageSize] = hugetlbStats - } - - return nil -} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/memory.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/memory.go deleted file mode 100644 index b7c75f941..000000000 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/memory.go +++ /dev/null @@ -1,348 +0,0 @@ -package fs - -import ( - "bufio" - "errors" - "fmt" - "math" - "os" - "path/filepath" - "strconv" - "strings" - - "golang.org/x/sys/unix" - - "github.com/opencontainers/runc/libcontainer/cgroups" - "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" - "github.com/opencontainers/runc/libcontainer/configs" -) - -const ( - cgroupMemorySwapLimit = "memory.memsw.limit_in_bytes" - cgroupMemoryLimit = "memory.limit_in_bytes" - cgroupMemoryUsage = "memory.usage_in_bytes" - cgroupMemoryMaxUsage = "memory.max_usage_in_bytes" -) - -type MemoryGroup struct{} - -func (s *MemoryGroup) Name() string { - return "memory" -} - -func (s *MemoryGroup) Apply(path string, _ *configs.Resources, pid int) error { - return apply(path, pid) -} - -func setMemory(path string, val int64) error { - if val == 0 { - return nil - } - - err := cgroups.WriteFile(path, cgroupMemoryLimit, strconv.FormatInt(val, 10)) - if !errors.Is(err, unix.EBUSY) { - return err - } - - // EBUSY means the kernel can't set new limit as it's too low - // (lower than the current usage). Return more specific error. - usage, err := fscommon.GetCgroupParamUint(path, cgroupMemoryUsage) - if err != nil { - return err - } - max, err := fscommon.GetCgroupParamUint(path, cgroupMemoryMaxUsage) - if err != nil { - return err - } - - return fmt.Errorf("unable to set memory limit to %d (current usage: %d, peak usage: %d)", val, usage, max) -} - -func setSwap(path string, val int64) error { - if val == 0 { - return nil - } - - return cgroups.WriteFile(path, cgroupMemorySwapLimit, strconv.FormatInt(val, 10)) -} - -func setMemoryAndSwap(path string, r *configs.Resources) error { - // If the memory update is set to -1 and the swap is not explicitly - // set, we should also set swap to -1, it means unlimited memory. - if r.Memory == -1 && r.MemorySwap == 0 { - // Only set swap if it's enabled in kernel - if cgroups.PathExists(filepath.Join(path, cgroupMemorySwapLimit)) { - r.MemorySwap = -1 - } - } - - // When memory and swap memory are both set, we need to handle the cases - // for updating container. - if r.Memory != 0 && r.MemorySwap != 0 { - curLimit, err := fscommon.GetCgroupParamUint(path, cgroupMemoryLimit) - if err != nil { - return err - } - - // When update memory limit, we should adapt the write sequence - // for memory and swap memory, so it won't fail because the new - // value and the old value don't fit kernel's validation. - if r.MemorySwap == -1 || curLimit < uint64(r.MemorySwap) { - if err := setSwap(path, r.MemorySwap); err != nil { - return err - } - if err := setMemory(path, r.Memory); err != nil { - return err - } - return nil - } - } - - if err := setMemory(path, r.Memory); err != nil { - return err - } - if err := setSwap(path, r.MemorySwap); err != nil { - return err - } - - return nil -} - -func (s *MemoryGroup) Set(path string, r *configs.Resources) error { - if err := setMemoryAndSwap(path, r); err != nil { - return err - } - - // ignore KernelMemory and KernelMemoryTCP - - if r.MemoryReservation != 0 { - if err := cgroups.WriteFile(path, "memory.soft_limit_in_bytes", strconv.FormatInt(r.MemoryReservation, 10)); err != nil { - return err - } - } - - if r.OomKillDisable { - if err := cgroups.WriteFile(path, "memory.oom_control", "1"); err != nil { - return err - } - } - if r.MemorySwappiness == nil || int64(*r.MemorySwappiness) == -1 { - return nil - } else if *r.MemorySwappiness <= 100 { - if err := cgroups.WriteFile(path, "memory.swappiness", strconv.FormatUint(*r.MemorySwappiness, 10)); err != nil { - return err - } - } else { - return fmt.Errorf("invalid memory swappiness value: %d (valid range is 0-100)", *r.MemorySwappiness) - } - - return nil -} - -func (s *MemoryGroup) GetStats(path string, stats *cgroups.Stats) error { - const file = "memory.stat" - statsFile, err := cgroups.OpenFile(path, file, os.O_RDONLY) - if err != nil { - if os.IsNotExist(err) { - return nil - } - return err - } - defer statsFile.Close() - - sc := bufio.NewScanner(statsFile) - for sc.Scan() { - t, v, err := fscommon.ParseKeyValue(sc.Text()) - if err != nil { - return &parseError{Path: path, File: file, Err: err} - } - stats.MemoryStats.Stats[t] = v - } - stats.MemoryStats.Cache = stats.MemoryStats.Stats["cache"] - - memoryUsage, err := getMemoryData(path, "") - if err != nil { - return err - } - stats.MemoryStats.Usage = memoryUsage - swapUsage, err := getMemoryData(path, "memsw") - if err != nil { - return err - } - stats.MemoryStats.SwapUsage = swapUsage - kernelUsage, err := getMemoryData(path, "kmem") - if err != nil { - return err - } - stats.MemoryStats.KernelUsage = kernelUsage - kernelTCPUsage, err := getMemoryData(path, "kmem.tcp") - if err != nil { - return err - } - stats.MemoryStats.KernelTCPUsage = kernelTCPUsage - - value, err := fscommon.GetCgroupParamUint(path, "memory.use_hierarchy") - if err != nil { - return err - } - if value == 1 { - stats.MemoryStats.UseHierarchy = true - } - - pagesByNUMA, err := getPageUsageByNUMA(path) - if err != nil { - return err - } - stats.MemoryStats.PageUsageByNUMA = pagesByNUMA - - return nil -} - -func getMemoryData(path, name string) (cgroups.MemoryData, error) { - memoryData := cgroups.MemoryData{} - - moduleName := "memory" - if name != "" { - moduleName = "memory." + name - } - var ( - usage = moduleName + ".usage_in_bytes" - maxUsage = moduleName + ".max_usage_in_bytes" - failcnt = moduleName + ".failcnt" - limit = moduleName + ".limit_in_bytes" - ) - - value, err := fscommon.GetCgroupParamUint(path, usage) - if err != nil { - if name != "" && os.IsNotExist(err) { - // Ignore ENOENT as swap and kmem controllers - // are optional in the kernel. - return cgroups.MemoryData{}, nil - } - return cgroups.MemoryData{}, err - } - memoryData.Usage = value - value, err = fscommon.GetCgroupParamUint(path, maxUsage) - if err != nil { - return cgroups.MemoryData{}, err - } - memoryData.MaxUsage = value - value, err = fscommon.GetCgroupParamUint(path, failcnt) - if err != nil { - return cgroups.MemoryData{}, err - } - memoryData.Failcnt = value - value, err = fscommon.GetCgroupParamUint(path, limit) - if err != nil { - return cgroups.MemoryData{}, err - } - memoryData.Limit = value - - return memoryData, nil -} - -func getPageUsageByNUMA(path string) (cgroups.PageUsageByNUMA, error) { - const ( - maxColumns = math.MaxUint8 + 1 - file = "memory.numa_stat" - ) - stats := cgroups.PageUsageByNUMA{} - - fd, err := cgroups.OpenFile(path, file, os.O_RDONLY) - if os.IsNotExist(err) { - return stats, nil - } else if err != nil { - return stats, err - } - defer fd.Close() - - // File format is documented in linux/Documentation/cgroup-v1/memory.txt - // and it looks like this: - // - // total= N0= N1= ... - // file= N0= N1= ... - // anon= N0= N1= ... - // unevictable= N0= N1= ... - // hierarchical_= N0= N1= ... - - scanner := bufio.NewScanner(fd) - for scanner.Scan() { - var field *cgroups.PageStats - - line := scanner.Text() - columns := strings.SplitN(line, " ", maxColumns) - for i, column := range columns { - byNode := strings.SplitN(column, "=", 2) - // Some custom kernels have non-standard fields, like - // numa_locality 0 0 0 0 0 0 0 0 0 0 - // numa_exectime 0 - if len(byNode) < 2 { - if i == 0 { - // Ignore/skip those. - break - } else { - // The first column was already validated, - // so be strict to the rest. - return stats, malformedLine(path, file, line) - } - } - key, val := byNode[0], byNode[1] - if i == 0 { // First column: key is name, val is total. - field = getNUMAField(&stats, key) - if field == nil { // unknown field (new kernel?) - break - } - field.Total, err = strconv.ParseUint(val, 0, 64) - if err != nil { - return stats, &parseError{Path: path, File: file, Err: err} - } - field.Nodes = map[uint8]uint64{} - } else { // Subsequent columns: key is N, val is usage. - if len(key) < 2 || key[0] != 'N' { - // This is definitely an error. - return stats, malformedLine(path, file, line) - } - - n, err := strconv.ParseUint(key[1:], 10, 8) - if err != nil { - return stats, &parseError{Path: path, File: file, Err: err} - } - - usage, err := strconv.ParseUint(val, 10, 64) - if err != nil { - return stats, &parseError{Path: path, File: file, Err: err} - } - - field.Nodes[uint8(n)] = usage - } - - } - } - if err := scanner.Err(); err != nil { - return cgroups.PageUsageByNUMA{}, &parseError{Path: path, File: file, Err: err} - } - - return stats, nil -} - -func getNUMAField(stats *cgroups.PageUsageByNUMA, name string) *cgroups.PageStats { - switch name { - case "total": - return &stats.Total - case "file": - return &stats.File - case "anon": - return &stats.Anon - case "unevictable": - return &stats.Unevictable - case "hierarchical_total": - return &stats.Hierarchical.Total - case "hierarchical_file": - return &stats.Hierarchical.File - case "hierarchical_anon": - return &stats.Hierarchical.Anon - case "hierarchical_unevictable": - return &stats.Hierarchical.Unevictable - } - return nil -} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/name.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/name.go deleted file mode 100644 index b8d5d849c..000000000 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/name.go +++ /dev/null @@ -1,31 +0,0 @@ -package fs - -import ( - "github.com/opencontainers/runc/libcontainer/cgroups" - "github.com/opencontainers/runc/libcontainer/configs" -) - -type NameGroup struct { - GroupName string - Join bool -} - -func (s *NameGroup) Name() string { - return s.GroupName -} - -func (s *NameGroup) Apply(path string, _ *configs.Resources, pid int) error { - if s.Join { - // Ignore errors if the named cgroup does not exist. - _ = apply(path, pid) - } - return nil -} - -func (s *NameGroup) Set(_ string, _ *configs.Resources) error { - return nil -} - -func (s *NameGroup) GetStats(path string, stats *cgroups.Stats) error { - return nil -} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/net_cls.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/net_cls.go deleted file mode 100644 index abfd09ce8..000000000 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/net_cls.go +++ /dev/null @@ -1,32 +0,0 @@ -package fs - -import ( - "strconv" - - "github.com/opencontainers/runc/libcontainer/cgroups" - "github.com/opencontainers/runc/libcontainer/configs" -) - -type NetClsGroup struct{} - -func (s *NetClsGroup) Name() string { - return "net_cls" -} - -func (s *NetClsGroup) Apply(path string, _ *configs.Resources, pid int) error { - return apply(path, pid) -} - -func (s *NetClsGroup) Set(path string, r *configs.Resources) error { - if r.NetClsClassid != 0 { - if err := cgroups.WriteFile(path, "net_cls.classid", strconv.FormatUint(uint64(r.NetClsClassid), 10)); err != nil { - return err - } - } - - return nil -} - -func (s *NetClsGroup) GetStats(path string, stats *cgroups.Stats) error { - return nil -} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/net_prio.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/net_prio.go deleted file mode 100644 index da74d3779..000000000 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/net_prio.go +++ /dev/null @@ -1,30 +0,0 @@ -package fs - -import ( - "github.com/opencontainers/runc/libcontainer/cgroups" - "github.com/opencontainers/runc/libcontainer/configs" -) - -type NetPrioGroup struct{} - -func (s *NetPrioGroup) Name() string { - return "net_prio" -} - -func (s *NetPrioGroup) Apply(path string, _ *configs.Resources, pid int) error { - return apply(path, pid) -} - -func (s *NetPrioGroup) Set(path string, r *configs.Resources) error { - for _, prioMap := range r.NetPrioIfpriomap { - if err := cgroups.WriteFile(path, "net_prio.ifpriomap", prioMap.CgroupString()); err != nil { - return err - } - } - - return nil -} - -func (s *NetPrioGroup) GetStats(path string, stats *cgroups.Stats) error { - return nil -} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/paths.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/paths.go deleted file mode 100644 index 1092331b2..000000000 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/paths.go +++ /dev/null @@ -1,186 +0,0 @@ -package fs - -import ( - "errors" - "os" - "path/filepath" - "sync" - - "golang.org/x/sys/unix" - - "github.com/opencontainers/runc/libcontainer/cgroups" - "github.com/opencontainers/runc/libcontainer/configs" - "github.com/opencontainers/runc/libcontainer/utils" -) - -// The absolute path to the root of the cgroup hierarchies. -var ( - cgroupRootLock sync.Mutex - cgroupRoot string -) - -const defaultCgroupRoot = "/sys/fs/cgroup" - -func initPaths(cg *configs.Cgroup) (map[string]string, error) { - root, err := rootPath() - if err != nil { - return nil, err - } - - inner, err := innerPath(cg) - if err != nil { - return nil, err - } - - paths := make(map[string]string) - for _, sys := range subsystems { - name := sys.Name() - path, err := subsysPath(root, inner, name) - if err != nil { - // The non-presence of the devices subsystem - // is considered fatal for security reasons. - if cgroups.IsNotFound(err) && (cg.SkipDevices || name != "devices") { - continue - } - - return nil, err - } - paths[name] = path - } - - return paths, nil -} - -func tryDefaultCgroupRoot() string { - var st, pst unix.Stat_t - - // (1) it should be a directory... - err := unix.Lstat(defaultCgroupRoot, &st) - if err != nil || st.Mode&unix.S_IFDIR == 0 { - return "" - } - - // (2) ... and a mount point ... - err = unix.Lstat(filepath.Dir(defaultCgroupRoot), &pst) - if err != nil { - return "" - } - - if st.Dev == pst.Dev { - // parent dir has the same dev -- not a mount point - return "" - } - - // (3) ... of 'tmpfs' fs type. - var fst unix.Statfs_t - err = unix.Statfs(defaultCgroupRoot, &fst) - if err != nil || fst.Type != unix.TMPFS_MAGIC { - return "" - } - - // (4) it should have at least 1 entry ... - dir, err := os.Open(defaultCgroupRoot) - if err != nil { - return "" - } - names, err := dir.Readdirnames(1) - if err != nil { - return "" - } - if len(names) < 1 { - return "" - } - // ... which is a cgroup mount point. - err = unix.Statfs(filepath.Join(defaultCgroupRoot, names[0]), &fst) - if err != nil || fst.Type != unix.CGROUP_SUPER_MAGIC { - return "" - } - - return defaultCgroupRoot -} - -// rootPath finds and returns path to the root of the cgroup hierarchies. -func rootPath() (string, error) { - cgroupRootLock.Lock() - defer cgroupRootLock.Unlock() - - if cgroupRoot != "" { - return cgroupRoot, nil - } - - // fast path - cgroupRoot = tryDefaultCgroupRoot() - if cgroupRoot != "" { - return cgroupRoot, nil - } - - // slow path: parse mountinfo - mi, err := cgroups.GetCgroupMounts(false) - if err != nil { - return "", err - } - if len(mi) < 1 { - return "", errors.New("no cgroup mount found in mountinfo") - } - - // Get the first cgroup mount (e.g. "/sys/fs/cgroup/memory"), - // use its parent directory. - root := filepath.Dir(mi[0].Mountpoint) - - if _, err := os.Stat(root); err != nil { - return "", err - } - - cgroupRoot = root - return cgroupRoot, nil -} - -func innerPath(c *configs.Cgroup) (string, error) { - if (c.Name != "" || c.Parent != "") && c.Path != "" { - return "", errors.New("cgroup: either Path or Name and Parent should be used") - } - - // XXX: Do not remove CleanPath. Path safety is important! -- cyphar - innerPath := utils.CleanPath(c.Path) - if innerPath == "" { - cgParent := utils.CleanPath(c.Parent) - cgName := utils.CleanPath(c.Name) - innerPath = filepath.Join(cgParent, cgName) - } - - return innerPath, nil -} - -func subsysPath(root, inner, subsystem string) (string, error) { - // If the cgroup name/path is absolute do not look relative to the cgroup of the init process. - if filepath.IsAbs(inner) { - mnt, err := cgroups.FindCgroupMountpoint(root, subsystem) - // If we didn't mount the subsystem, there is no point we make the path. - if err != nil { - return "", err - } - - // Sometimes subsystems can be mounted together as 'cpu,cpuacct'. - return filepath.Join(root, filepath.Base(mnt), inner), nil - } - - // Use GetOwnCgroupPath instead of GetInitCgroupPath, because the creating - // process could in container and shared pid namespace with host, and - // /proc/1/cgroup could point to whole other world of cgroups. - parentPath, err := cgroups.GetOwnCgroupPath(subsystem) - if err != nil { - return "", err - } - - return filepath.Join(parentPath, inner), nil -} - -func apply(path string, pid int) error { - if path == "" { - return nil - } - if err := os.MkdirAll(path, 0o755); err != nil { - return err - } - return cgroups.WriteCgroupProc(path, pid) -} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/perf_event.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/perf_event.go deleted file mode 100644 index b86955c8f..000000000 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/perf_event.go +++ /dev/null @@ -1,24 +0,0 @@ -package fs - -import ( - "github.com/opencontainers/runc/libcontainer/cgroups" - "github.com/opencontainers/runc/libcontainer/configs" -) - -type PerfEventGroup struct{} - -func (s *PerfEventGroup) Name() string { - return "perf_event" -} - -func (s *PerfEventGroup) Apply(path string, _ *configs.Resources, pid int) error { - return apply(path, pid) -} - -func (s *PerfEventGroup) Set(_ string, _ *configs.Resources) error { - return nil -} - -func (s *PerfEventGroup) GetStats(path string, stats *cgroups.Stats) error { - return nil -} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/pids.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/pids.go deleted file mode 100644 index 1f13532a5..000000000 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/pids.go +++ /dev/null @@ -1,62 +0,0 @@ -package fs - -import ( - "math" - "strconv" - - "github.com/opencontainers/runc/libcontainer/cgroups" - "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" - "github.com/opencontainers/runc/libcontainer/configs" -) - -type PidsGroup struct{} - -func (s *PidsGroup) Name() string { - return "pids" -} - -func (s *PidsGroup) Apply(path string, _ *configs.Resources, pid int) error { - return apply(path, pid) -} - -func (s *PidsGroup) Set(path string, r *configs.Resources) error { - if r.PidsLimit != 0 { - // "max" is the fallback value. - limit := "max" - - if r.PidsLimit > 0 { - limit = strconv.FormatInt(r.PidsLimit, 10) - } - - if err := cgroups.WriteFile(path, "pids.max", limit); err != nil { - return err - } - } - - return nil -} - -func (s *PidsGroup) GetStats(path string, stats *cgroups.Stats) error { - if !cgroups.PathExists(path) { - return nil - } - current, err := fscommon.GetCgroupParamUint(path, "pids.current") - if err != nil { - return err - } - - max, err := fscommon.GetCgroupParamUint(path, "pids.max") - if err != nil { - return err - } - // If no limit is set, read from pids.max returns "max", which is - // converted to MaxUint64 by GetCgroupParamUint. Historically, we - // represent "no limit" for pids as 0, thus this conversion. - if max == math.MaxUint64 { - max = 0 - } - - stats.PidsStats.Current = current - stats.PidsStats.Limit = max - return nil -} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/rdma.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/rdma.go deleted file mode 100644 index 5bbe0f35f..000000000 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/rdma.go +++ /dev/null @@ -1,25 +0,0 @@ -package fs - -import ( - "github.com/opencontainers/runc/libcontainer/cgroups" - "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" - "github.com/opencontainers/runc/libcontainer/configs" -) - -type RdmaGroup struct{} - -func (s *RdmaGroup) Name() string { - return "rdma" -} - -func (s *RdmaGroup) Apply(path string, _ *configs.Resources, pid int) error { - return apply(path, pid) -} - -func (s *RdmaGroup) Set(path string, r *configs.Resources) error { - return fscommon.RdmaSet(path, r) -} - -func (s *RdmaGroup) GetStats(path string, stats *cgroups.Stats) error { - return fscommon.RdmaGetStats(path, stats) -} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/cpu.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/cpu.go deleted file mode 100644 index bbbae4d58..000000000 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/cpu.go +++ /dev/null @@ -1,87 +0,0 @@ -package fs2 - -import ( - "bufio" - "os" - "strconv" - - "github.com/opencontainers/runc/libcontainer/cgroups" - "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" - "github.com/opencontainers/runc/libcontainer/configs" -) - -func isCpuSet(r *configs.Resources) bool { - return r.CpuWeight != 0 || r.CpuQuota != 0 || r.CpuPeriod != 0 -} - -func setCpu(dirPath string, r *configs.Resources) error { - if !isCpuSet(r) { - return nil - } - - // NOTE: .CpuShares is not used here. Conversion is the caller's responsibility. - if r.CpuWeight != 0 { - if err := cgroups.WriteFile(dirPath, "cpu.weight", strconv.FormatUint(r.CpuWeight, 10)); err != nil { - return err - } - } - - if r.CpuQuota != 0 || r.CpuPeriod != 0 { - str := "max" - if r.CpuQuota > 0 { - str = strconv.FormatInt(r.CpuQuota, 10) - } - period := r.CpuPeriod - if period == 0 { - // This default value is documented in - // https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html - period = 100000 - } - str += " " + strconv.FormatUint(period, 10) - if err := cgroups.WriteFile(dirPath, "cpu.max", str); err != nil { - return err - } - } - - return nil -} - -func statCpu(dirPath string, stats *cgroups.Stats) error { - const file = "cpu.stat" - f, err := cgroups.OpenFile(dirPath, file, os.O_RDONLY) - if err != nil { - return err - } - defer f.Close() - - sc := bufio.NewScanner(f) - for sc.Scan() { - t, v, err := fscommon.ParseKeyValue(sc.Text()) - if err != nil { - return &parseError{Path: dirPath, File: file, Err: err} - } - switch t { - case "usage_usec": - stats.CpuStats.CpuUsage.TotalUsage = v * 1000 - - case "user_usec": - stats.CpuStats.CpuUsage.UsageInUsermode = v * 1000 - - case "system_usec": - stats.CpuStats.CpuUsage.UsageInKernelmode = v * 1000 - - case "nr_periods": - stats.CpuStats.ThrottlingData.Periods = v - - case "nr_throttled": - stats.CpuStats.ThrottlingData.ThrottledPeriods = v - - case "throttled_usec": - stats.CpuStats.ThrottlingData.ThrottledTime = v * 1000 - } - } - if err := sc.Err(); err != nil { - return &parseError{Path: dirPath, File: file, Err: err} - } - return nil -} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/cpuset.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/cpuset.go deleted file mode 100644 index 16c45bad8..000000000 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/cpuset.go +++ /dev/null @@ -1,28 +0,0 @@ -package fs2 - -import ( - "github.com/opencontainers/runc/libcontainer/cgroups" - "github.com/opencontainers/runc/libcontainer/configs" -) - -func isCpusetSet(r *configs.Resources) bool { - return r.CpusetCpus != "" || r.CpusetMems != "" -} - -func setCpuset(dirPath string, r *configs.Resources) error { - if !isCpusetSet(r) { - return nil - } - - if r.CpusetCpus != "" { - if err := cgroups.WriteFile(dirPath, "cpuset.cpus", r.CpusetCpus); err != nil { - return err - } - } - if r.CpusetMems != "" { - if err := cgroups.WriteFile(dirPath, "cpuset.mems", r.CpusetMems); err != nil { - return err - } - } - return nil -} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/create.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/create.go deleted file mode 100644 index 641123a4d..000000000 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/create.go +++ /dev/null @@ -1,152 +0,0 @@ -package fs2 - -import ( - "fmt" - "os" - "path/filepath" - "strings" - - "github.com/opencontainers/runc/libcontainer/cgroups" - "github.com/opencontainers/runc/libcontainer/configs" -) - -func supportedControllers() (string, error) { - return cgroups.ReadFile(UnifiedMountpoint, "/cgroup.controllers") -} - -// needAnyControllers returns whether we enable some supported controllers or not, -// based on (1) controllers available and (2) resources that are being set. -// We don't check "pseudo" controllers such as -// "freezer" and "devices". -func needAnyControllers(r *configs.Resources) (bool, error) { - if r == nil { - return false, nil - } - - // list of all available controllers - content, err := supportedControllers() - if err != nil { - return false, err - } - avail := make(map[string]struct{}) - for _, ctr := range strings.Fields(content) { - avail[ctr] = struct{}{} - } - - // check whether the controller if available or not - have := func(controller string) bool { - _, ok := avail[controller] - return ok - } - - if isPidsSet(r) && have("pids") { - return true, nil - } - if isMemorySet(r) && have("memory") { - return true, nil - } - if isIoSet(r) && have("io") { - return true, nil - } - if isCpuSet(r) && have("cpu") { - return true, nil - } - if isCpusetSet(r) && have("cpuset") { - return true, nil - } - if isHugeTlbSet(r) && have("hugetlb") { - return true, nil - } - - return false, nil -} - -// containsDomainController returns whether the current config contains domain controller or not. -// Refer to: http://man7.org/linux/man-pages/man7/cgroups.7.html -// As at Linux 4.19, the following controllers are threaded: cpu, perf_event, and pids. -func containsDomainController(r *configs.Resources) bool { - return isMemorySet(r) || isIoSet(r) || isCpuSet(r) || isHugeTlbSet(r) -} - -// CreateCgroupPath creates cgroupv2 path, enabling all the supported controllers. -func CreateCgroupPath(path string, c *configs.Cgroup) (Err error) { - if !strings.HasPrefix(path, UnifiedMountpoint) { - return fmt.Errorf("invalid cgroup path %s", path) - } - - content, err := supportedControllers() - if err != nil { - return err - } - - const ( - cgTypeFile = "cgroup.type" - cgStCtlFile = "cgroup.subtree_control" - ) - ctrs := strings.Fields(content) - res := "+" + strings.Join(ctrs, " +") - - elements := strings.Split(path, "/") - elements = elements[3:] - current := "/sys/fs" - for i, e := range elements { - current = filepath.Join(current, e) - if i > 0 { - if err := os.Mkdir(current, 0o755); err != nil { - if !os.IsExist(err) { - return err - } - } else { - // If the directory was created, be sure it is not left around on errors. - current := current - defer func() { - if Err != nil { - os.Remove(current) - } - }() - } - cgType, _ := cgroups.ReadFile(current, cgTypeFile) - cgType = strings.TrimSpace(cgType) - switch cgType { - // If the cgroup is in an invalid mode (usually this means there's an internal - // process in the cgroup tree, because we created a cgroup under an - // already-populated-by-other-processes cgroup), then we have to error out if - // the user requested controllers which are not thread-aware. However, if all - // the controllers requested are thread-aware we can simply put the cgroup into - // threaded mode. - case "domain invalid": - if containsDomainController(c.Resources) { - return fmt.Errorf("cannot enter cgroupv2 %q with domain controllers -- it is in an invalid state", current) - } else { - // Not entirely correct (in theory we'd always want to be a domain -- - // since that means we're a properly delegated cgroup subtree) but in - // this case there's not much we can do and it's better than giving an - // error. - _ = cgroups.WriteFile(current, cgTypeFile, "threaded") - } - // If the cgroup is in (threaded) or (domain threaded) mode, we can only use thread-aware controllers - // (and you cannot usually take a cgroup out of threaded mode). - case "domain threaded": - fallthrough - case "threaded": - if containsDomainController(c.Resources) { - return fmt.Errorf("cannot enter cgroupv2 %q with domain controllers -- it is in %s mode", current, cgType) - } - } - } - // enable all supported controllers - if i < len(elements)-1 { - if err := cgroups.WriteFile(current, cgStCtlFile, res); err != nil { - // try write one by one - allCtrs := strings.Split(res, " ") - for _, ctr := range allCtrs { - _ = cgroups.WriteFile(current, cgStCtlFile, ctr) - } - } - // Some controllers might not be enabled when rootless or containerized, - // but we don't catch the error here. (Caught in setXXX() functions.) - } - } - - return nil -} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/defaultpath.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/defaultpath.go deleted file mode 100644 index 9c949c91f..000000000 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/defaultpath.go +++ /dev/null @@ -1,99 +0,0 @@ -/* - Copyright The containerd Authors. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. -*/ - -package fs2 - -import ( - "bufio" - "errors" - "fmt" - "io" - "os" - "path/filepath" - "strings" - - "github.com/opencontainers/runc/libcontainer/configs" - "github.com/opencontainers/runc/libcontainer/utils" -) - -const UnifiedMountpoint = "/sys/fs/cgroup" - -func defaultDirPath(c *configs.Cgroup) (string, error) { - if (c.Name != "" || c.Parent != "") && c.Path != "" { - return "", fmt.Errorf("cgroup: either Path or Name and Parent should be used, got %+v", c) - } - - return _defaultDirPath(UnifiedMountpoint, c.Path, c.Parent, c.Name) -} - -func _defaultDirPath(root, cgPath, cgParent, cgName string) (string, error) { - if (cgName != "" || cgParent != "") && cgPath != "" { - return "", errors.New("cgroup: either Path or Name and Parent should be used") - } - - // XXX: Do not remove CleanPath. Path safety is important! -- cyphar - innerPath := utils.CleanPath(cgPath) - if innerPath == "" { - cgParent := utils.CleanPath(cgParent) - cgName := utils.CleanPath(cgName) - innerPath = filepath.Join(cgParent, cgName) - } - if filepath.IsAbs(innerPath) { - return filepath.Join(root, innerPath), nil - } - - ownCgroup, err := parseCgroupFile("/proc/self/cgroup") - if err != nil { - return "", err - } - // The current user scope most probably has tasks in it already, - // making it impossible to enable controllers for its sub-cgroup. - // A parent cgroup (with no tasks in it) is what we need. - ownCgroup = filepath.Dir(ownCgroup) - - return filepath.Join(root, ownCgroup, innerPath), nil -} - -// parseCgroupFile parses /proc/PID/cgroup file and return string -func parseCgroupFile(path string) (string, error) { - f, err := os.Open(path) - if err != nil { - return "", err - } - defer f.Close() - return parseCgroupFromReader(f) -} - -func parseCgroupFromReader(r io.Reader) (string, error) { - s := bufio.NewScanner(r) - for s.Scan() { - var ( - text = s.Text() - parts = strings.SplitN(text, ":", 3) - ) - if len(parts) < 3 { - return "", fmt.Errorf("invalid cgroup entry: %q", text) - } - // text is like "0::/user.slice/user-1001.slice/session-1.scope" - if parts[0] == "0" && parts[1] == "" { - return parts[2], nil - } - } - if err := s.Err(); err != nil { - return "", err - } - return "", errors.New("cgroup path not found") -} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/devices.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/devices.go deleted file mode 100644 index 0d2345607..000000000 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/devices.go +++ /dev/null @@ -1,75 +0,0 @@ -package fs2 - -import ( - "fmt" - - "golang.org/x/sys/unix" - - "github.com/opencontainers/runc/libcontainer/cgroups/ebpf" - "github.com/opencontainers/runc/libcontainer/cgroups/ebpf/devicefilter" - "github.com/opencontainers/runc/libcontainer/configs" - "github.com/opencontainers/runc/libcontainer/devices" - "github.com/opencontainers/runc/libcontainer/userns" -) - -func isRWM(perms devices.Permissions) bool { - var r, w, m bool - for _, perm := range perms { - switch perm { - case 'r': - r = true - case 'w': - w = true - case 'm': - m = true - } - } - return r && w && m -} - -// This is similar to the logic applied in crun for handling errors from bpf(2) -// . -func canSkipEBPFError(r *configs.Resources) bool { - // If we're running in a user namespace we can ignore eBPF rules because we - // usually cannot use bpf(2), as well as rootless containers usually don't - // have the necessary privileges to mknod(2) device inodes or access - // host-level instances (though ideally we would be blocking device access - // for rootless containers anyway). - if userns.RunningInUserNS() { - return true - } - - // We cannot ignore an eBPF load error if any rule if is a block rule or it - // doesn't permit all access modes. - // - // NOTE: This will sometimes trigger in cases where access modes are split - // between different rules but to handle this correctly would require - // using ".../libcontainer/cgroup/devices".Emulator. - for _, dev := range r.Devices { - if !dev.Allow || !isRWM(dev.Permissions) { - return false - } - } - return true -} - -func setDevices(dirPath string, r *configs.Resources) error { - if r.SkipDevices { - return nil - } - insts, license, err := devicefilter.DeviceFilter(r.Devices) - if err != nil { - return err - } - dirFD, err := unix.Open(dirPath, unix.O_DIRECTORY|unix.O_RDONLY, 0o600) - if err != nil { - return fmt.Errorf("cannot get dir FD for %s", dirPath) - } - defer unix.Close(dirFD) - if _, err := ebpf.LoadAttachCgroupDeviceFilter(insts, license, dirFD); err != nil { - if !canSkipEBPFError(r) { - return err - } - } - return nil -} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/freezer.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/freezer.go deleted file mode 100644 index 8917a6411..000000000 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/freezer.go +++ /dev/null @@ -1,127 +0,0 @@ -package fs2 - -import ( - "bufio" - "errors" - "fmt" - "os" - "strings" - "time" - - "golang.org/x/sys/unix" - - "github.com/opencontainers/runc/libcontainer/cgroups" - "github.com/opencontainers/runc/libcontainer/configs" -) - -func setFreezer(dirPath string, state configs.FreezerState) error { - var stateStr string - switch state { - case configs.Undefined: - return nil - case configs.Frozen: - stateStr = "1" - case configs.Thawed: - stateStr = "0" - default: - return fmt.Errorf("invalid freezer state %q requested", state) - } - - fd, err := cgroups.OpenFile(dirPath, "cgroup.freeze", unix.O_RDWR) - if err != nil { - // We can ignore this request as long as the user didn't ask us to - // freeze the container (since without the freezer cgroup, that's a - // no-op). - if state != configs.Frozen { - return nil - } - return fmt.Errorf("freezer not supported: %w", err) - } - defer fd.Close() - - if _, err := fd.WriteString(stateStr); err != nil { - return err - } - // Confirm that the cgroup did actually change states. - if actualState, err := readFreezer(dirPath, fd); err != nil { - return err - } else if actualState != state { - return fmt.Errorf(`expected "cgroup.freeze" to be in state %q but was in %q`, state, actualState) - } - return nil -} - -func getFreezer(dirPath string) (configs.FreezerState, error) { - fd, err := cgroups.OpenFile(dirPath, "cgroup.freeze", unix.O_RDONLY) - if err != nil { - // If the kernel is too old, then we just treat the freezer as being in - // an "undefined" state. - if os.IsNotExist(err) || errors.Is(err, unix.ENODEV) { - err = nil - } - return configs.Undefined, err - } - defer fd.Close() - - return readFreezer(dirPath, fd) -} - -func readFreezer(dirPath string, fd *os.File) (configs.FreezerState, error) { - if _, err := fd.Seek(0, 0); err != nil { - return configs.Undefined, err - } - state := make([]byte, 2) - if _, err := fd.Read(state); err != nil { - return configs.Undefined, err - } - switch string(state) { - case "0\n": - return configs.Thawed, nil - case "1\n": - return waitFrozen(dirPath) - default: - return configs.Undefined, fmt.Errorf(`unknown "cgroup.freeze" state: %q`, state) - } -} - -// waitFrozen polls cgroup.events until it sees "frozen 1" in it. -func waitFrozen(dirPath string) (configs.FreezerState, error) { - fd, err := cgroups.OpenFile(dirPath, "cgroup.events", unix.O_RDONLY) - if err != nil { - return configs.Undefined, err - } - defer fd.Close() - - // XXX: Simple wait/read/retry is used here. An implementation - // based on poll(2) or inotify(7) is possible, but it makes the code - // much more complicated. Maybe address this later. - const ( - // Perform maxIter with waitTime in between iterations. - waitTime = 10 * time.Millisecond - maxIter = 1000 - ) - scanner := bufio.NewScanner(fd) - for i := 0; scanner.Scan(); { - if i == maxIter { - return configs.Undefined, fmt.Errorf("timeout of %s reached waiting for the cgroup to freeze", waitTime*maxIter) - } - line := scanner.Text() - val := strings.TrimPrefix(line, "frozen ") - if val != line { // got prefix - if val[0] == '1' { - return configs.Frozen, nil - } - - i++ - // wait, then re-read - time.Sleep(waitTime) - _, err := fd.Seek(0, 0) - if err != nil { - return configs.Undefined, err - } - } - } - // Should only reach here either on read error, - // or if the file does not contain "frozen " line. - return configs.Undefined, scanner.Err() -} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/fs2.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/fs2.go deleted file mode 100644 index 492778e31..000000000 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/fs2.go +++ /dev/null @@ -1,259 +0,0 @@ -package fs2 - -import ( - "errors" - "fmt" - "os" - "strings" - - "github.com/opencontainers/runc/libcontainer/cgroups" - "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" - "github.com/opencontainers/runc/libcontainer/configs" -) - -type parseError = fscommon.ParseError - -type manager struct { - config *configs.Cgroup - // dirPath is like "/sys/fs/cgroup/user.slice/user-1001.slice/session-1.scope" - dirPath string - // controllers is content of "cgroup.controllers" file. - // excludes pseudo-controllers ("devices" and "freezer"). - controllers map[string]struct{} -} - -// NewManager creates a manager for cgroup v2 unified hierarchy. -// dirPath is like "/sys/fs/cgroup/user.slice/user-1001.slice/session-1.scope". -// If dirPath is empty, it is automatically set using config. -func NewManager(config *configs.Cgroup, dirPath string) (cgroups.Manager, error) { - if dirPath == "" { - var err error - dirPath, err = defaultDirPath(config) - if err != nil { - return nil, err - } - } - - m := &manager{ - config: config, - dirPath: dirPath, - } - return m, nil -} - -func (m *manager) getControllers() error { - if m.controllers != nil { - return nil - } - - data, err := cgroups.ReadFile(m.dirPath, "cgroup.controllers") - if err != nil { - if m.config.Rootless && m.config.Path == "" { - return nil - } - return err - } - fields := strings.Fields(data) - m.controllers = make(map[string]struct{}, len(fields)) - for _, c := range fields { - m.controllers[c] = struct{}{} - } - - return nil -} - -func (m *manager) Apply(pid int) error { - if err := CreateCgroupPath(m.dirPath, m.config); err != nil { - // Related tests: - // - "runc create (no limits + no cgrouppath + no permission) succeeds" - // - "runc create (rootless + no limits + cgrouppath + no permission) fails with permission error" - // - "runc create (rootless + limits + no cgrouppath + no permission) fails with informative error" - if m.config.Rootless { - if m.config.Path == "" { - if blNeed, nErr := needAnyControllers(m.config.Resources); nErr == nil && !blNeed { - return nil - } - return fmt.Errorf("rootless needs no limits + no cgrouppath when no permission is granted for cgroups: %w", err) - } - } - return err - } - if err := cgroups.WriteCgroupProc(m.dirPath, pid); err != nil { - return err - } - return nil -} - -func (m *manager) GetPids() ([]int, error) { - return cgroups.GetPids(m.dirPath) -} - -func (m *manager) GetAllPids() ([]int, error) { - return cgroups.GetAllPids(m.dirPath) -} - -func (m *manager) GetStats() (*cgroups.Stats, error) { - var errs []error - - st := cgroups.NewStats() - - // pids (since kernel 4.5) - if err := statPids(m.dirPath, st); err != nil { - errs = append(errs, err) - } - // memory (since kernel 4.5) - if err := statMemory(m.dirPath, st); err != nil && !os.IsNotExist(err) { - errs = append(errs, err) - } - // io (since kernel 4.5) - if err := statIo(m.dirPath, st); err != nil && !os.IsNotExist(err) { - errs = append(errs, err) - } - // cpu (since kernel 4.15) - // Note cpu.stat is available even if the controller is not enabled. - if err := statCpu(m.dirPath, st); err != nil && !os.IsNotExist(err) { - errs = append(errs, err) - } - // hugetlb (since kernel 5.6) - if err := statHugeTlb(m.dirPath, st); err != nil && !os.IsNotExist(err) { - errs = append(errs, err) - } - // rdma (since kernel 4.11) - if err := fscommon.RdmaGetStats(m.dirPath, st); err != nil && !os.IsNotExist(err) { - errs = append(errs, err) - } - if len(errs) > 0 && !m.config.Rootless { - return st, fmt.Errorf("error while statting cgroup v2: %+v", errs) - } - return st, nil -} - -func (m *manager) Freeze(state configs.FreezerState) error { - if m.config.Resources == nil { - return errors.New("cannot toggle freezer: cgroups not configured for container") - } - if err := setFreezer(m.dirPath, state); err != nil { - return err - } - m.config.Resources.Freezer = state - return nil -} - -func (m *manager) Destroy() error { - return cgroups.RemovePath(m.dirPath) -} - -func (m *manager) Path(_ string) string { - return m.dirPath -} - -func (m *manager) Set(r *configs.Resources) error { - if r == nil { - return nil - } - if err := m.getControllers(); err != nil { - return err - } - // pids (since kernel 4.5) - if err := setPids(m.dirPath, r); err != nil { - return err - } - // memory (since kernel 4.5) - if err := setMemory(m.dirPath, r); err != nil { - return err - } - // io (since kernel 4.5) - if err := setIo(m.dirPath, r); err != nil { - return err - } - // cpu (since kernel 4.15) - if err := setCpu(m.dirPath, r); err != nil { - return err - } - // devices (since kernel 4.15, pseudo-controller) - // - // When rootless is true, errors from the device subsystem are ignored because it is really not expected to work. - // However, errors from other subsystems are not ignored. - // see @test "runc create (rootless + limits + no cgrouppath + no permission) fails with informative error" - if err := setDevices(m.dirPath, r); err != nil && !m.config.Rootless { - return err - } - // cpuset (since kernel 5.0) - if err := setCpuset(m.dirPath, r); err != nil { - return err - } - // hugetlb (since kernel 5.6) - if err := setHugeTlb(m.dirPath, r); err != nil { - return err - } - // rdma (since kernel 4.11) - if err := fscommon.RdmaSet(m.dirPath, r); err != nil { - return err - } - // freezer (since kernel 5.2, pseudo-controller) - if err := setFreezer(m.dirPath, r.Freezer); err != nil { - return err - } - if err := m.setUnified(r.Unified); err != nil { - return err - } - m.config.Resources = r - return nil -} - -func (m *manager) setUnified(res map[string]string) error { - for k, v := range res { - if strings.Contains(k, "/") { - return fmt.Errorf("unified resource %q must be a file name (no slashes)", k) - } - if err := cgroups.WriteFile(m.dirPath, k, v); err != nil { - // Check for both EPERM and ENOENT since O_CREAT is used by WriteFile. - if errors.Is(err, os.ErrPermission) || errors.Is(err, os.ErrNotExist) { - // Check if a controller is available, - // to give more specific error if not. - sk := strings.SplitN(k, ".", 2) - if len(sk) != 2 { - return fmt.Errorf("unified resource %q must be in the form CONTROLLER.PARAMETER", k) - } - c := sk[0] - if _, ok := m.controllers[c]; !ok && c != "cgroup" { - return fmt.Errorf("unified resource %q can't be set: controller %q not available", k, c) - } - } - return fmt.Errorf("unable to set unified resource %q: %w", k, err) - } - } - - return nil -} - -func (m *manager) GetPaths() map[string]string { - paths := make(map[string]string, 1) - paths[""] = m.dirPath - return paths -} - -func (m *manager) GetCgroups() (*configs.Cgroup, error) { - return m.config, nil -} - -func (m *manager) GetFreezerState() (configs.FreezerState, error) { - return getFreezer(m.dirPath) -} - -func (m *manager) Exists() bool { - return cgroups.PathExists(m.dirPath) -} - -func OOMKillCount(path string) (uint64, error) { - return fscommon.GetValueByKey(path, "memory.events", "oom_kill") -} - -func (m *manager) OOMKillCount() (uint64, error) { - c, err := OOMKillCount(m.dirPath) - if err != nil && m.config.Rootless && os.IsNotExist(err) { - err = nil - } - - return c, err -} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/hugetlb.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/hugetlb.go deleted file mode 100644 index c92a7e64a..000000000 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/hugetlb.go +++ /dev/null @@ -1,48 +0,0 @@ -package fs2 - -import ( - "strconv" - - "github.com/opencontainers/runc/libcontainer/cgroups" - "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" - "github.com/opencontainers/runc/libcontainer/configs" -) - -func isHugeTlbSet(r *configs.Resources) bool { - return len(r.HugetlbLimit) > 0 -} - -func setHugeTlb(dirPath string, r *configs.Resources) error { - if !isHugeTlbSet(r) { - return nil - } - for _, hugetlb := range r.HugetlbLimit { - if err := cgroups.WriteFile(dirPath, "hugetlb."+hugetlb.Pagesize+".max", strconv.FormatUint(hugetlb.Limit, 10)); err != nil { - return err - } - } - - return nil -} - -func statHugeTlb(dirPath string, stats *cgroups.Stats) error { - hugetlbStats := cgroups.HugetlbStats{} - for _, pagesize := range cgroups.HugePageSizes() { - value, err := fscommon.GetCgroupParamUint(dirPath, "hugetlb."+pagesize+".current") - if err != nil { - return err - } - hugetlbStats.Usage = value - - fileName := "hugetlb." + pagesize + ".events" - value, err = fscommon.GetValueByKey(dirPath, fileName, "max") - if err != nil { - return err - } - hugetlbStats.Failcnt = value - - stats.HugetlbStats[pagesize] = hugetlbStats - } - - return nil -} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/io.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/io.go deleted file mode 100644 index b2ff7d340..000000000 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/io.go +++ /dev/null @@ -1,193 +0,0 @@ -package fs2 - -import ( - "bufio" - "bytes" - "fmt" - "os" - "strconv" - "strings" - - "github.com/sirupsen/logrus" - - "github.com/opencontainers/runc/libcontainer/cgroups" - "github.com/opencontainers/runc/libcontainer/configs" -) - -func isIoSet(r *configs.Resources) bool { - return r.BlkioWeight != 0 || - len(r.BlkioWeightDevice) > 0 || - len(r.BlkioThrottleReadBpsDevice) > 0 || - len(r.BlkioThrottleWriteBpsDevice) > 0 || - len(r.BlkioThrottleReadIOPSDevice) > 0 || - len(r.BlkioThrottleWriteIOPSDevice) > 0 -} - -// bfqDeviceWeightSupported checks for per-device BFQ weight support (added -// in kernel v5.4, commit 795fe54c2a8) by reading from "io.bfq.weight". -func bfqDeviceWeightSupported(bfq *os.File) bool { - if bfq == nil { - return false - } - _, _ = bfq.Seek(0, 0) - buf := make([]byte, 32) - _, _ = bfq.Read(buf) - // If only a single number (default weight) if read back, we have older kernel. - _, err := strconv.ParseInt(string(bytes.TrimSpace(buf)), 10, 64) - return err != nil -} - -func setIo(dirPath string, r *configs.Resources) error { - if !isIoSet(r) { - return nil - } - - // If BFQ IO scheduler is available, use it. - var bfq *os.File - if r.BlkioWeight != 0 || len(r.BlkioWeightDevice) > 0 { - var err error - bfq, err = cgroups.OpenFile(dirPath, "io.bfq.weight", os.O_RDWR) - if err == nil { - defer bfq.Close() - } else if !os.IsNotExist(err) { - return err - } - } - - if r.BlkioWeight != 0 { - if bfq != nil { // Use BFQ. - if _, err := bfq.WriteString(strconv.FormatUint(uint64(r.BlkioWeight), 10)); err != nil { - return err - } - } else { - // Fallback to io.weight with a conversion scheme. - v := cgroups.ConvertBlkIOToIOWeightValue(r.BlkioWeight) - if err := cgroups.WriteFile(dirPath, "io.weight", strconv.FormatUint(v, 10)); err != nil { - return err - } - } - } - if bfqDeviceWeightSupported(bfq) { - for _, wd := range r.BlkioWeightDevice { - if _, err := bfq.WriteString(wd.WeightString() + "\n"); err != nil { - return fmt.Errorf("setting device weight %q: %w", wd.WeightString(), err) - } - } - } - for _, td := range r.BlkioThrottleReadBpsDevice { - if err := cgroups.WriteFile(dirPath, "io.max", td.StringName("rbps")); err != nil { - return err - } - } - for _, td := range r.BlkioThrottleWriteBpsDevice { - if err := cgroups.WriteFile(dirPath, "io.max", td.StringName("wbps")); err != nil { - return err - } - } - for _, td := range r.BlkioThrottleReadIOPSDevice { - if err := cgroups.WriteFile(dirPath, "io.max", td.StringName("riops")); err != nil { - return err - } - } - for _, td := range r.BlkioThrottleWriteIOPSDevice { - if err := cgroups.WriteFile(dirPath, "io.max", td.StringName("wiops")); err != nil { - return err - } - } - - return nil -} - -func readCgroup2MapFile(dirPath string, name string) (map[string][]string, error) { - ret := map[string][]string{} - f, err := cgroups.OpenFile(dirPath, name, os.O_RDONLY) - if err != nil { - return nil, err - } - defer f.Close() - scanner := bufio.NewScanner(f) - for scanner.Scan() { - line := scanner.Text() - parts := strings.Fields(line) - if len(parts) < 2 { - continue - } - ret[parts[0]] = parts[1:] - } - if err := scanner.Err(); err != nil { - return nil, &parseError{Path: dirPath, File: name, Err: err} - } - return ret, nil -} - -func statIo(dirPath string, stats *cgroups.Stats) error { - const file = "io.stat" - values, err := readCgroup2MapFile(dirPath, file) - if err != nil { - return err - } - // more details on the io.stat file format: https://www.kernel.org/doc/Documentation/cgroup-v2.txt - var parsedStats cgroups.BlkioStats - for k, v := range values { - d := strings.Split(k, ":") - if len(d) != 2 { - continue - } - major, err := strconv.ParseUint(d[0], 10, 64) - if err != nil { - return &parseError{Path: dirPath, File: file, Err: err} - } - minor, err := strconv.ParseUint(d[1], 10, 64) - if err != nil { - return &parseError{Path: dirPath, File: file, Err: err} - } - - for _, item := range v { - d := strings.Split(item, "=") - if len(d) != 2 { - continue - } - op := d[0] - - // Map to the cgroupv1 naming and layout (in separate tables). - var targetTable *[]cgroups.BlkioStatEntry - switch op { - // Equivalent to cgroupv1's blkio.io_service_bytes. - case "rbytes": - op = "Read" - targetTable = &parsedStats.IoServiceBytesRecursive - case "wbytes": - op = "Write" - targetTable = &parsedStats.IoServiceBytesRecursive - // Equivalent to cgroupv1's blkio.io_serviced. - case "rios": - op = "Read" - targetTable = &parsedStats.IoServicedRecursive - case "wios": - op = "Write" - targetTable = &parsedStats.IoServicedRecursive - default: - // Skip over entries we cannot map to cgroupv1 stats for now. - // In the future we should expand the stats struct to include - // them. - logrus.Debugf("cgroupv2 io stats: skipping over unmappable %s entry", item) - continue - } - - value, err := strconv.ParseUint(d[1], 10, 64) - if err != nil { - return &parseError{Path: dirPath, File: file, Err: err} - } - - entry := cgroups.BlkioStatEntry{ - Op: op, - Major: major, - Minor: minor, - Value: value, - } - *targetTable = append(*targetTable, entry) - } - } - stats.BlkioStats = parsedStats - return nil -} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/memory.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/memory.go deleted file mode 100644 index 9cca98c4c..000000000 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/memory.go +++ /dev/null @@ -1,220 +0,0 @@ -package fs2 - -import ( - "bufio" - "errors" - "math" - "os" - "strconv" - "strings" - - "golang.org/x/sys/unix" - - "github.com/opencontainers/runc/libcontainer/cgroups" - "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" - "github.com/opencontainers/runc/libcontainer/configs" -) - -// numToStr converts an int64 value to a string for writing to a -// cgroupv2 files with .min, .max, .low, or .high suffix. -// The value of -1 is converted to "max" for cgroupv1 compatibility -// (which used to write -1 to remove the limit). -func numToStr(value int64) (ret string) { - switch { - case value == 0: - ret = "" - case value == -1: - ret = "max" - default: - ret = strconv.FormatInt(value, 10) - } - - return ret -} - -func isMemorySet(r *configs.Resources) bool { - return r.MemoryReservation != 0 || r.Memory != 0 || r.MemorySwap != 0 -} - -func setMemory(dirPath string, r *configs.Resources) error { - if !isMemorySet(r) { - return nil - } - swap, err := cgroups.ConvertMemorySwapToCgroupV2Value(r.MemorySwap, r.Memory) - if err != nil { - return err - } - swapStr := numToStr(swap) - if swapStr == "" && swap == 0 && r.MemorySwap > 0 { - // memory and memorySwap set to the same value -- disable swap - swapStr = "0" - } - // never write empty string to `memory.swap.max`, it means set to 0. - if swapStr != "" { - if err := cgroups.WriteFile(dirPath, "memory.swap.max", swapStr); err != nil { - return err - } - } - - if val := numToStr(r.Memory); val != "" { - if err := cgroups.WriteFile(dirPath, "memory.max", val); err != nil { - return err - } - } - - // cgroup.Resources.KernelMemory is ignored - - if val := numToStr(r.MemoryReservation); val != "" { - if err := cgroups.WriteFile(dirPath, "memory.low", val); err != nil { - return err - } - } - - return nil -} - -func statMemory(dirPath string, stats *cgroups.Stats) error { - const file = "memory.stat" - statsFile, err := cgroups.OpenFile(dirPath, file, os.O_RDONLY) - if err != nil { - return err - } - defer statsFile.Close() - - sc := bufio.NewScanner(statsFile) - for sc.Scan() { - t, v, err := fscommon.ParseKeyValue(sc.Text()) - if err != nil { - return &parseError{Path: dirPath, File: file, Err: err} - } - stats.MemoryStats.Stats[t] = v - } - if err := sc.Err(); err != nil { - return &parseError{Path: dirPath, File: file, Err: err} - } - stats.MemoryStats.Cache = stats.MemoryStats.Stats["file"] - // Unlike cgroup v1 which has memory.use_hierarchy binary knob, - // cgroup v2 is always hierarchical. - stats.MemoryStats.UseHierarchy = true - - memoryUsage, err := getMemoryDataV2(dirPath, "") - if err != nil { - if errors.Is(err, unix.ENOENT) && dirPath == UnifiedMountpoint { - // The root cgroup does not have memory.{current,max} - // so emulate those using data from /proc/meminfo and - // /sys/fs/cgroup/memory.stat - return rootStatsFromMeminfo(stats) - } - return err - } - stats.MemoryStats.Usage = memoryUsage - swapUsage, err := getMemoryDataV2(dirPath, "swap") - if err != nil { - return err - } - // As cgroup v1 reports SwapUsage values as mem+swap combined, - // while in cgroup v2 swap values do not include memory, - // report combined mem+swap for v1 compatibility. - swapUsage.Usage += memoryUsage.Usage - if swapUsage.Limit != math.MaxUint64 { - swapUsage.Limit += memoryUsage.Limit - } - stats.MemoryStats.SwapUsage = swapUsage - - return nil -} - -func getMemoryDataV2(path, name string) (cgroups.MemoryData, error) { - memoryData := cgroups.MemoryData{} - - moduleName := "memory" - if name != "" { - moduleName = "memory." + name - } - usage := moduleName + ".current" - limit := moduleName + ".max" - - value, err := fscommon.GetCgroupParamUint(path, usage) - if err != nil { - if name != "" && os.IsNotExist(err) { - // Ignore EEXIST as there's no swap accounting - // if kernel CONFIG_MEMCG_SWAP is not set or - // swapaccount=0 kernel boot parameter is given. - return cgroups.MemoryData{}, nil - } - return cgroups.MemoryData{}, err - } - memoryData.Usage = value - - value, err = fscommon.GetCgroupParamUint(path, limit) - if err != nil { - return cgroups.MemoryData{}, err - } - memoryData.Limit = value - - return memoryData, nil -} - -func rootStatsFromMeminfo(stats *cgroups.Stats) error { - const file = "/proc/meminfo" - f, err := os.Open(file) - if err != nil { - return err - } - defer f.Close() - - // Fields we are interested in. - var ( - swap_free uint64 - swap_total uint64 - ) - mem := map[string]*uint64{ - "SwapFree": &swap_free, - "SwapTotal": &swap_total, - } - - found := 0 - sc := bufio.NewScanner(f) - for sc.Scan() { - parts := strings.SplitN(sc.Text(), ":", 3) - if len(parts) != 2 { - // Should not happen. - continue - } - k := parts[0] - p, ok := mem[k] - if !ok { - // Unknown field -- not interested. - continue - } - vStr := strings.TrimSpace(strings.TrimSuffix(parts[1], " kB")) - *p, err = strconv.ParseUint(vStr, 10, 64) - if err != nil { - return &parseError{File: file, Err: errors.New("bad value for " + k)} - } - - found++ - if found == len(mem) { - // Got everything we need -- skip the rest. - break - } - } - if err := sc.Err(); err != nil { - return &parseError{Path: "", File: file, Err: err} - } - - // cgroup v1 `usage_in_bytes` reports memory usage as the sum of - // - rss (NR_ANON_MAPPED) - // - cache (NR_FILE_PAGES) - // cgroup v1 reports SwapUsage values as mem+swap combined - // cgroup v2 reports rss and cache as anon and file. - // sum `anon` + `file` to report the same value as `usage_in_bytes` in v1. - // sum swap usage as combined mem+swap usage for consistency as well. - stats.MemoryStats.Usage.Usage = stats.MemoryStats.Stats["anon"] + stats.MemoryStats.Stats["file"] - stats.MemoryStats.Usage.Limit = math.MaxUint64 - stats.MemoryStats.SwapUsage.Usage = (swap_total - swap_free) * 1024 - stats.MemoryStats.SwapUsage.Limit = math.MaxUint64 - stats.MemoryStats.SwapUsage.Usage += stats.MemoryStats.Usage.Usage - - return nil -} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/pids.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/pids.go deleted file mode 100644 index c8c4a3658..000000000 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/pids.go +++ /dev/null @@ -1,72 +0,0 @@ -package fs2 - -import ( - "errors" - "math" - "os" - "strings" - - "golang.org/x/sys/unix" - - "github.com/opencontainers/runc/libcontainer/cgroups" - "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" - "github.com/opencontainers/runc/libcontainer/configs" -) - -func isPidsSet(r *configs.Resources) bool { - return r.PidsLimit != 0 -} - -func setPids(dirPath string, r *configs.Resources) error { - if !isPidsSet(r) { - return nil - } - if val := numToStr(r.PidsLimit); val != "" { - if err := cgroups.WriteFile(dirPath, "pids.max", val); err != nil { - return err - } - } - - return nil -} - -func statPidsFromCgroupProcs(dirPath string, stats *cgroups.Stats) error { - // if the controller is not enabled, let's read PIDS from cgroups.procs - // (or threads if cgroup.threads is enabled) - contents, err := cgroups.ReadFile(dirPath, "cgroup.procs") - if errors.Is(err, unix.ENOTSUP) { - contents, err = cgroups.ReadFile(dirPath, "cgroup.threads") - } - if err != nil { - return err - } - pids := strings.Count(contents, "\n") - stats.PidsStats.Current = uint64(pids) - stats.PidsStats.Limit = 0 - return nil -} - -func statPids(dirPath string, stats *cgroups.Stats) error { - current, err := fscommon.GetCgroupParamUint(dirPath, "pids.current") - if err != nil { - if os.IsNotExist(err) { - return statPidsFromCgroupProcs(dirPath, stats) - } - return err - } - - max, err := fscommon.GetCgroupParamUint(dirPath, "pids.max") - if err != nil { - return err - } - // If no limit is set, read from pids.max returns "max", which is - // converted to MaxUint64 by GetCgroupParamUint. Historically, we - // represent "no limit" for pids as 0, thus this conversion. - if max == math.MaxUint64 { - max = 0 - } - - stats.PidsStats.Current = current - stats.PidsStats.Limit = max - return nil -} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fscommon/rdma.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fscommon/rdma.go deleted file mode 100644 index d463d15ee..000000000 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fscommon/rdma.go +++ /dev/null @@ -1,121 +0,0 @@ -package fscommon - -import ( - "bufio" - "errors" - "math" - "os" - "strconv" - "strings" - - "github.com/opencontainers/runc/libcontainer/cgroups" - "github.com/opencontainers/runc/libcontainer/configs" - "golang.org/x/sys/unix" -) - -// parseRdmaKV parses raw string to RdmaEntry. -func parseRdmaKV(raw string, entry *cgroups.RdmaEntry) error { - var value uint32 - - parts := strings.SplitN(raw, "=", 3) - - if len(parts) != 2 { - return errors.New("Unable to parse RDMA entry") - } - - k, v := parts[0], parts[1] - - if v == "max" { - value = math.MaxUint32 - } else { - val64, err := strconv.ParseUint(v, 10, 32) - if err != nil { - return err - } - value = uint32(val64) - } - if k == "hca_handle" { - entry.HcaHandles = value - } else if k == "hca_object" { - entry.HcaObjects = value - } - - return nil -} - -// readRdmaEntries reads and converts array of rawstrings to RdmaEntries from file. -// example entry: mlx4_0 hca_handle=2 hca_object=2000 -func readRdmaEntries(dir, file string) ([]cgroups.RdmaEntry, error) { - rdmaEntries := make([]cgroups.RdmaEntry, 0) - fd, err := cgroups.OpenFile(dir, file, unix.O_RDONLY) - if err != nil { - return nil, err - } - defer fd.Close() //nolint:errorlint - scanner := bufio.NewScanner(fd) - for scanner.Scan() { - parts := strings.SplitN(scanner.Text(), " ", 4) - if len(parts) == 3 { - entry := new(cgroups.RdmaEntry) - entry.Device = parts[0] - err = parseRdmaKV(parts[1], entry) - if err != nil { - continue - } - err = parseRdmaKV(parts[2], entry) - if err != nil { - continue - } - - rdmaEntries = append(rdmaEntries, *entry) - } - } - return rdmaEntries, scanner.Err() -} - -// RdmaGetStats returns rdma stats such as totalLimit and current entries. -func RdmaGetStats(path string, stats *cgroups.Stats) error { - currentEntries, err := readRdmaEntries(path, "rdma.current") - if err != nil { - if errors.Is(err, os.ErrNotExist) { - err = nil - } - return err - } - maxEntries, err := readRdmaEntries(path, "rdma.max") - if err != nil { - return err - } - // If device got removed between reading two files, ignore returning stats. - if len(currentEntries) != len(maxEntries) { - return nil - } - - stats.RdmaStats = cgroups.RdmaStats{ - RdmaLimit: maxEntries, - RdmaCurrent: currentEntries, - } - - return nil -} - -func createCmdString(device string, limits configs.LinuxRdma) string { - cmdString := device - if limits.HcaHandles != nil { - cmdString += " hca_handle=" + strconv.FormatUint(uint64(*limits.HcaHandles), 10) - } - if limits.HcaObjects != nil { - cmdString += " hca_object=" + strconv.FormatUint(uint64(*limits.HcaObjects), 10) - } - return cmdString -} - -// RdmaSet sets RDMA resources. -func RdmaSet(path string, r *configs.Resources) error { - for device, limits := range r.Rdma { - if err := cgroups.WriteFile(path, "rdma.max", createCmdString(device, limits)); err != nil { - return err - } - } - return nil -} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fscommon/utils.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fscommon/utils.go deleted file mode 100644 index f4a51c9e5..000000000 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fscommon/utils.go +++ /dev/null @@ -1,145 +0,0 @@ -package fscommon - -import ( - "errors" - "fmt" - "math" - "path" - "strconv" - "strings" - - "github.com/opencontainers/runc/libcontainer/cgroups" -) - -var ( - // Deprecated: use cgroups.OpenFile instead. - OpenFile = cgroups.OpenFile - // Deprecated: use cgroups.ReadFile instead. - ReadFile = cgroups.ReadFile - // Deprecated: use cgroups.WriteFile instead. - WriteFile = cgroups.WriteFile -) - -// ParseError records a parse error details, including the file path. -type ParseError struct { - Path string - File string - Err error -} - -func (e *ParseError) Error() string { - return "unable to parse " + path.Join(e.Path, e.File) + ": " + e.Err.Error() -} - -func (e *ParseError) Unwrap() error { return e.Err } - -// ParseUint converts a string to an uint64 integer. -// Negative values are returned at zero as, due to kernel bugs, -// some of the memory cgroup stats can be negative. -func ParseUint(s string, base, bitSize int) (uint64, error) { - value, err := strconv.ParseUint(s, base, bitSize) - if err != nil { - intValue, intErr := strconv.ParseInt(s, base, bitSize) - // 1. Handle negative values greater than MinInt64 (and) - // 2. Handle negative values lesser than MinInt64 - if intErr == nil && intValue < 0 { - return 0, nil - } else if errors.Is(intErr, strconv.ErrRange) && intValue < 0 { - return 0, nil - } - - return value, err - } - - return value, nil -} - -// ParseKeyValue parses a space-separated "name value" kind of cgroup -// parameter and returns its key as a string, and its value as uint64 -// (ParseUint is used to convert the value). For example, -// "io_service_bytes 1234" will be returned as "io_service_bytes", 1234. -func ParseKeyValue(t string) (string, uint64, error) { - parts := strings.SplitN(t, " ", 3) - if len(parts) != 2 { - return "", 0, fmt.Errorf("line %q is not in key value format", t) - } - - value, err := ParseUint(parts[1], 10, 64) - if err != nil { - return "", 0, err - } - - return parts[0], value, nil -} - -// GetValueByKey reads a key-value pairs from the specified cgroup file, -// and returns a value of the specified key. ParseUint is used for value -// conversion. -func GetValueByKey(path, file, key string) (uint64, error) { - content, err := cgroups.ReadFile(path, file) - if err != nil { - return 0, err - } - - lines := strings.Split(content, "\n") - for _, line := range lines { - arr := strings.Split(line, " ") - if len(arr) == 2 && arr[0] == key { - val, err := ParseUint(arr[1], 10, 64) - if err != nil { - err = &ParseError{Path: path, File: file, Err: err} - } - return val, err - } - } - - return 0, nil -} - -// GetCgroupParamUint reads a single uint64 value from the specified cgroup file. -// If the value read is "max", the math.MaxUint64 is returned. -func GetCgroupParamUint(path, file string) (uint64, error) { - contents, err := GetCgroupParamString(path, file) - if err != nil { - return 0, err - } - contents = strings.TrimSpace(contents) - if contents == "max" { - return math.MaxUint64, nil - } - - res, err := ParseUint(contents, 10, 64) - if err != nil { - return res, &ParseError{Path: path, File: file, Err: err} - } - return res, nil -} - -// GetCgroupParamInt reads a single int64 value from specified cgroup file. -// If the value read is "max", the math.MaxInt64 is returned. -func GetCgroupParamInt(path, file string) (int64, error) { - contents, err := cgroups.ReadFile(path, file) - if err != nil { - return 0, err - } - contents = strings.TrimSpace(contents) - if contents == "max" { - return math.MaxInt64, nil - } - - res, err := strconv.ParseInt(contents, 10, 64) - if err != nil { - return res, &ParseError{Path: path, File: file, Err: err} - } - return res, nil -} - -// GetCgroupParamString reads a string from the specified cgroup file. -func GetCgroupParamString(path, file string) (string, error) { - contents, err := cgroups.ReadFile(path, file) - if err != nil { - return "", err - } - - return strings.TrimSpace(contents), nil -} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/getallpids.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/getallpids.go deleted file mode 100644 index 1355a5101..000000000 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/getallpids.go +++ /dev/null @@ -1,27 +0,0 @@ -package cgroups - -import ( - "io/fs" - "path/filepath" -) - -// GetAllPids returns all pids from the cgroup identified by path, and all its -// sub-cgroups. -func GetAllPids(path string) ([]int, error) { - var pids []int - err := filepath.WalkDir(path, func(p string, d fs.DirEntry, iErr error) error { - if iErr != nil { - return iErr - } - if !d.IsDir() { - return nil - } - cPids, err := readProcsFile(p) - if err != nil { - return err - } - pids = append(pids, cPids...) - return nil - }) - return pids, err -} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/stats.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/stats.go deleted file mode 100644 index 40a81dd5a..000000000 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/stats.go +++ /dev/null @@ -1,173 +0,0 @@ -package cgroups - -type ThrottlingData struct { - // Number of periods with throttling active - Periods uint64 `json:"periods,omitempty"` - // Number of periods when the container hit its throttling limit. - ThrottledPeriods uint64 `json:"throttled_periods,omitempty"` - // Aggregate time the container was throttled for in nanoseconds. - ThrottledTime uint64 `json:"throttled_time,omitempty"` -} - -// CpuUsage denotes the usage of a CPU. -// All CPU stats are aggregate since container inception. -type CpuUsage struct { - // Total CPU time consumed. - // Units: nanoseconds. - TotalUsage uint64 `json:"total_usage,omitempty"` - // Total CPU time consumed per core. - // Units: nanoseconds. - PercpuUsage []uint64 `json:"percpu_usage,omitempty"` - // CPU time consumed per core in kernel mode - // Units: nanoseconds. - PercpuUsageInKernelmode []uint64 `json:"percpu_usage_in_kernelmode"` - // CPU time consumed per core in user mode - // Units: nanoseconds. - PercpuUsageInUsermode []uint64 `json:"percpu_usage_in_usermode"` - // Time spent by tasks of the cgroup in kernel mode. - // Units: nanoseconds. - UsageInKernelmode uint64 `json:"usage_in_kernelmode"` - // Time spent by tasks of the cgroup in user mode. - // Units: nanoseconds. - UsageInUsermode uint64 `json:"usage_in_usermode"` -} - -type CpuStats struct { - CpuUsage CpuUsage `json:"cpu_usage,omitempty"` - ThrottlingData ThrottlingData `json:"throttling_data,omitempty"` -} - -type CPUSetStats struct { - // List of the physical numbers of the CPUs on which processes - // in that cpuset are allowed to execute - CPUs []uint16 `json:"cpus,omitempty"` - // cpu_exclusive flag - CPUExclusive uint64 `json:"cpu_exclusive"` - // List of memory nodes on which processes in that cpuset - // are allowed to allocate memory - Mems []uint16 `json:"mems,omitempty"` - // mem_hardwall flag - MemHardwall uint64 `json:"mem_hardwall"` - // mem_exclusive flag - MemExclusive uint64 `json:"mem_exclusive"` - // memory_migrate flag - MemoryMigrate uint64 `json:"memory_migrate"` - // memory_spread page flag - MemorySpreadPage uint64 `json:"memory_spread_page"` - // memory_spread slab flag - MemorySpreadSlab uint64 `json:"memory_spread_slab"` - // memory_pressure - MemoryPressure uint64 `json:"memory_pressure"` - // sched_load balance flag - SchedLoadBalance uint64 `json:"sched_load_balance"` - // sched_relax_domain_level - SchedRelaxDomainLevel int64 `json:"sched_relax_domain_level"` -} - -type MemoryData struct { - Usage uint64 `json:"usage,omitempty"` - MaxUsage uint64 `json:"max_usage,omitempty"` - Failcnt uint64 `json:"failcnt"` - Limit uint64 `json:"limit"` -} - -type MemoryStats struct { - // memory used for cache - Cache uint64 `json:"cache,omitempty"` - // usage of memory - Usage MemoryData `json:"usage,omitempty"` - // usage of memory + swap - SwapUsage MemoryData `json:"swap_usage,omitempty"` - // usage of kernel memory - KernelUsage MemoryData `json:"kernel_usage,omitempty"` - // usage of kernel TCP memory - KernelTCPUsage MemoryData `json:"kernel_tcp_usage,omitempty"` - // usage of memory pages by NUMA node - // see chapter 5.6 of memory controller documentation - PageUsageByNUMA PageUsageByNUMA `json:"page_usage_by_numa,omitempty"` - // if true, memory usage is accounted for throughout a hierarchy of cgroups. - UseHierarchy bool `json:"use_hierarchy"` - - Stats map[string]uint64 `json:"stats,omitempty"` -} - -type PageUsageByNUMA struct { - // Embedding is used as types can't be recursive. - PageUsageByNUMAInner - Hierarchical PageUsageByNUMAInner `json:"hierarchical,omitempty"` -} - -type PageUsageByNUMAInner struct { - Total PageStats `json:"total,omitempty"` - File PageStats `json:"file,omitempty"` - Anon PageStats `json:"anon,omitempty"` - Unevictable PageStats `json:"unevictable,omitempty"` -} - -type PageStats struct { - Total uint64 `json:"total,omitempty"` - Nodes map[uint8]uint64 `json:"nodes,omitempty"` -} - -type PidsStats struct { - // number of pids in the cgroup - Current uint64 `json:"current,omitempty"` - // active pids hard limit - Limit uint64 `json:"limit,omitempty"` -} - -type BlkioStatEntry struct { - Major uint64 `json:"major,omitempty"` - Minor uint64 `json:"minor,omitempty"` - Op string `json:"op,omitempty"` - Value uint64 `json:"value,omitempty"` -} - -type BlkioStats struct { - // number of bytes transferred to and from the block device - IoServiceBytesRecursive []BlkioStatEntry `json:"io_service_bytes_recursive,omitempty"` - IoServicedRecursive []BlkioStatEntry `json:"io_serviced_recursive,omitempty"` - IoQueuedRecursive []BlkioStatEntry `json:"io_queue_recursive,omitempty"` - IoServiceTimeRecursive []BlkioStatEntry `json:"io_service_time_recursive,omitempty"` - IoWaitTimeRecursive []BlkioStatEntry `json:"io_wait_time_recursive,omitempty"` - IoMergedRecursive []BlkioStatEntry `json:"io_merged_recursive,omitempty"` - IoTimeRecursive []BlkioStatEntry `json:"io_time_recursive,omitempty"` - SectorsRecursive []BlkioStatEntry `json:"sectors_recursive,omitempty"` -} - -type HugetlbStats struct { - // current res_counter usage for hugetlb - Usage uint64 `json:"usage,omitempty"` - // maximum usage ever recorded. - MaxUsage uint64 `json:"max_usage,omitempty"` - // number of times hugetlb usage allocation failure. - Failcnt uint64 `json:"failcnt"` -} - -type RdmaEntry struct { - Device string `json:"device,omitempty"` - HcaHandles uint32 `json:"hca_handles,omitempty"` - HcaObjects uint32 `json:"hca_objects,omitempty"` -} - -type RdmaStats struct { - RdmaLimit []RdmaEntry `json:"rdma_limit,omitempty"` - RdmaCurrent []RdmaEntry `json:"rdma_current,omitempty"` -} - -type Stats struct { - CpuStats CpuStats `json:"cpu_stats,omitempty"` - CPUSetStats CPUSetStats `json:"cpuset_stats,omitempty"` - MemoryStats MemoryStats `json:"memory_stats,omitempty"` - PidsStats PidsStats `json:"pids_stats,omitempty"` - BlkioStats BlkioStats `json:"blkio_stats,omitempty"` - // the map is in the format "size of hugepage: stats of the hugepage" - HugetlbStats map[string]HugetlbStats `json:"hugetlb_stats,omitempty"` - RdmaStats RdmaStats `json:"rdma_stats,omitempty"` -} - -func NewStats() *Stats { - memoryStats := MemoryStats{Stats: make(map[string]uint64)} - hugetlbStats := make(map[string]HugetlbStats) - return &Stats{MemoryStats: memoryStats, HugetlbStats: hugetlbStats} -} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/common.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/common.go deleted file mode 100644 index c5b476e2c..000000000 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/common.go +++ /dev/null @@ -1,564 +0,0 @@ -package systemd - -import ( - "bufio" - "context" - "errors" - "fmt" - "math" - "os" - "regexp" - "strconv" - "strings" - "sync" - "time" - - systemdDbus "github.com/coreos/go-systemd/v22/dbus" - dbus "github.com/godbus/dbus/v5" - "github.com/sirupsen/logrus" - - cgroupdevices "github.com/opencontainers/runc/libcontainer/cgroups/devices" - "github.com/opencontainers/runc/libcontainer/configs" - "github.com/opencontainers/runc/libcontainer/devices" -) - -const ( - // Default kernel value for cpu quota period is 100000 us (100 ms), same for v1 and v2. - // v1: https://www.kernel.org/doc/html/latest/scheduler/sched-bwc.html and - // v2: https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html - defCPUQuotaPeriod = uint64(100000) -) - -var ( - versionOnce sync.Once - version int - - isRunningSystemdOnce sync.Once - isRunningSystemd bool -) - -// NOTE: This function comes from package github.com/coreos/go-systemd/util -// It was borrowed here to avoid a dependency on cgo. -// -// IsRunningSystemd checks whether the host was booted with systemd as its init -// system. This functions similarly to systemd's `sd_booted(3)`: internally, it -// checks whether /run/systemd/system/ exists and is a directory. -// http://www.freedesktop.org/software/systemd/man/sd_booted.html -func IsRunningSystemd() bool { - isRunningSystemdOnce.Do(func() { - fi, err := os.Lstat("/run/systemd/system") - isRunningSystemd = err == nil && fi.IsDir() - }) - return isRunningSystemd -} - -// systemd represents slice hierarchy using `-`, so we need to follow suit when -// generating the path of slice. Essentially, test-a-b.slice becomes -// /test.slice/test-a.slice/test-a-b.slice. -func ExpandSlice(slice string) (string, error) { - suffix := ".slice" - // Name has to end with ".slice", but can't be just ".slice". - if len(slice) < len(suffix) || !strings.HasSuffix(slice, suffix) { - return "", fmt.Errorf("invalid slice name: %s", slice) - } - - // Path-separators are not allowed. - if strings.Contains(slice, "/") { - return "", fmt.Errorf("invalid slice name: %s", slice) - } - - var path, prefix string - sliceName := strings.TrimSuffix(slice, suffix) - // if input was -.slice, we should just return root now - if sliceName == "-" { - return "/", nil - } - for _, component := range strings.Split(sliceName, "-") { - // test--a.slice isn't permitted, nor is -test.slice. - if component == "" { - return "", fmt.Errorf("invalid slice name: %s", slice) - } - - // Append the component to the path and to the prefix. - path += "/" + prefix + component + suffix - prefix += component + "-" - } - return path, nil -} - -func groupPrefix(ruleType devices.Type) (string, error) { - switch ruleType { - case devices.BlockDevice: - return "block-", nil - case devices.CharDevice: - return "char-", nil - default: - return "", fmt.Errorf("device type %v has no group prefix", ruleType) - } -} - -// findDeviceGroup tries to find the device group name (as listed in -// /proc/devices) with the type prefixed as required for DeviceAllow, for a -// given (type, major) combination. If more than one device group exists, an -// arbitrary one is chosen. -func findDeviceGroup(ruleType devices.Type, ruleMajor int64) (string, error) { - fh, err := os.Open("/proc/devices") - if err != nil { - return "", err - } - defer fh.Close() - - prefix, err := groupPrefix(ruleType) - if err != nil { - return "", err - } - - scanner := bufio.NewScanner(fh) - var currentType devices.Type - for scanner.Scan() { - // We need to strip spaces because the first number is column-aligned. - line := strings.TrimSpace(scanner.Text()) - - // Handle the "header" lines. - switch line { - case "Block devices:": - currentType = devices.BlockDevice - continue - case "Character devices:": - currentType = devices.CharDevice - continue - case "": - continue - } - - // Skip lines unrelated to our type. - if currentType != ruleType { - continue - } - - // Parse out the (major, name). - var ( - currMajor int64 - currName string - ) - if n, err := fmt.Sscanf(line, "%d %s", &currMajor, &currName); err != nil || n != 2 { - if err == nil { - err = errors.New("wrong number of fields") - } - return "", fmt.Errorf("scan /proc/devices line %q: %w", line, err) - } - - if currMajor == ruleMajor { - return prefix + currName, nil - } - } - if err := scanner.Err(); err != nil { - return "", fmt.Errorf("reading /proc/devices: %w", err) - } - // Couldn't find the device group. - return "", nil -} - -// DeviceAllow is the dbus type "a(ss)" which means we need a struct -// to represent it in Go. -type deviceAllowEntry struct { - Path string - Perms string -} - -func allowAllDevices() []systemdDbus.Property { - // Setting mode to auto and removing all DeviceAllow rules - // results in allowing access to all devices. - return []systemdDbus.Property{ - newProp("DevicePolicy", "auto"), - newProp("DeviceAllow", []deviceAllowEntry{}), - } -} - -// generateDeviceProperties takes the configured device rules and generates a -// corresponding set of systemd properties to configure the devices correctly. -func generateDeviceProperties(r *configs.Resources, sdVer int) ([]systemdDbus.Property, error) { - if r.SkipDevices { - return nil, nil - } - - properties := []systemdDbus.Property{ - // Always run in the strictest white-list mode. - newProp("DevicePolicy", "strict"), - // Empty the DeviceAllow array before filling it. - newProp("DeviceAllow", []deviceAllowEntry{}), - } - - // Figure out the set of rules. - configEmu := &cgroupdevices.Emulator{} - for _, rule := range r.Devices { - if err := configEmu.Apply(*rule); err != nil { - return nil, fmt.Errorf("unable to apply rule for systemd: %w", err) - } - } - // systemd doesn't support blacklists. So we log a warning, and tell - // systemd to act as a deny-all whitelist. This ruleset will be replaced - // with our normal fallback code. This may result in spurious errors, but - // the only other option is to error out here. - if configEmu.IsBlacklist() { - // However, if we're dealing with an allow-all rule then we can do it. - if configEmu.IsAllowAll() { - return allowAllDevices(), nil - } - logrus.Warn("systemd doesn't support blacklist device rules -- applying temporary deny-all rule") - return properties, nil - } - - // Now generate the set of rules we actually need to apply. Unlike the - // normal devices cgroup, in "strict" mode systemd defaults to a deny-all - // whitelist which is the default for devices.Emulator. - finalRules, err := configEmu.Rules() - if err != nil { - return nil, fmt.Errorf("unable to get simplified rules for systemd: %w", err) - } - var deviceAllowList []deviceAllowEntry - for _, rule := range finalRules { - if !rule.Allow { - // Should never happen. - return nil, fmt.Errorf("[internal error] cannot add deny rule to systemd DeviceAllow list: %v", *rule) - } - switch rule.Type { - case devices.BlockDevice, devices.CharDevice: - default: - // Should never happen. - return nil, fmt.Errorf("invalid device type for DeviceAllow: %v", rule.Type) - } - - entry := deviceAllowEntry{ - Perms: string(rule.Permissions), - } - - // systemd has a fairly odd (though understandable) syntax here, and - // because of the OCI configuration format we have to do quite a bit of - // trickery to convert things: - // - // * Concrete rules with non-wildcard major/minor numbers have to use - // /dev/{block,char}/MAJOR:minor paths. Before v240, systemd uses - // stat(2) on such paths to look up device properties, meaning we - // cannot add whitelist rules for devices that don't exist. Since v240, - // device properties are parsed from the path string. - // - // However, path globbing is not support for path-based rules so we - // need to handle wildcards in some other manner. - // - // * Wildcard-minor rules have to specify a "device group name" (the - // second column in /proc/devices). - // - // * Wildcard (major and minor) rules can just specify a glob with the - // type ("char-*" or "block-*"). - // - // The only type of rule we can't handle is wildcard-major rules, and - // so we'll give a warning in that case (note that the fallback code - // will insert any rules systemd couldn't handle). What amazing fun. - - if rule.Major == devices.Wildcard { - // "_ *:n _" rules aren't supported by systemd. - if rule.Minor != devices.Wildcard { - logrus.Warnf("systemd doesn't support '*:n' device rules -- temporarily ignoring rule: %v", *rule) - continue - } - - // "_ *:* _" rules just wildcard everything. - prefix, err := groupPrefix(rule.Type) - if err != nil { - return nil, err - } - entry.Path = prefix + "*" - } else if rule.Minor == devices.Wildcard { - // "_ n:* _" rules require a device group from /proc/devices. - group, err := findDeviceGroup(rule.Type, rule.Major) - if err != nil { - return nil, fmt.Errorf("unable to find device '%v/%d': %w", rule.Type, rule.Major, err) - } - if group == "" { - // Couldn't find a group. - logrus.Warnf("could not find device group for '%v/%d' in /proc/devices -- temporarily ignoring rule: %v", rule.Type, rule.Major, *rule) - continue - } - entry.Path = group - } else { - // "_ n:m _" rules are just a path in /dev/{block,char}/. - switch rule.Type { - case devices.BlockDevice: - entry.Path = fmt.Sprintf("/dev/block/%d:%d", rule.Major, rule.Minor) - case devices.CharDevice: - entry.Path = fmt.Sprintf("/dev/char/%d:%d", rule.Major, rule.Minor) - } - if sdVer < 240 { - // Old systemd versions use stat(2) on path to find out device major:minor - // numbers and type. If the path doesn't exist, it will not add the rule, - // emitting a warning instead. - // Since all of this logic is best-effort anyway (we manually set these - // rules separately to systemd) we can safely skip entries that don't - // have a corresponding path. - if _, err := os.Stat(entry.Path); err != nil { - continue - } - } - } - deviceAllowList = append(deviceAllowList, entry) - } - - properties = append(properties, newProp("DeviceAllow", deviceAllowList)) - return properties, nil -} - -func newProp(name string, units interface{}) systemdDbus.Property { - return systemdDbus.Property{ - Name: name, - Value: dbus.MakeVariant(units), - } -} - -func getUnitName(c *configs.Cgroup) string { - // by default, we create a scope unless the user explicitly asks for a slice. - if !strings.HasSuffix(c.Name, ".slice") { - return c.ScopePrefix + "-" + c.Name + ".scope" - } - return c.Name -} - -// This code should be in sync with getUnitName. -func getUnitType(unitName string) string { - if strings.HasSuffix(unitName, ".slice") { - return "Slice" - } - return "Scope" -} - -// isDbusError returns true if the error is a specific dbus error. -func isDbusError(err error, name string) bool { - if err != nil { - var derr dbus.Error - if errors.As(err, &derr) { - return strings.Contains(derr.Name, name) - } - } - return false -} - -// isUnitExists returns true if the error is that a systemd unit already exists. -func isUnitExists(err error) bool { - return isDbusError(err, "org.freedesktop.systemd1.UnitExists") -} - -func startUnit(cm *dbusConnManager, unitName string, properties []systemdDbus.Property, ignoreExist bool) error { - statusChan := make(chan string, 1) - retry := true - -retry: - err := cm.retryOnDisconnect(func(c *systemdDbus.Conn) error { - _, err := c.StartTransientUnitContext(context.TODO(), unitName, "replace", properties, statusChan) - return err - }) - if err != nil { - if !isUnitExists(err) { - return err - } - if ignoreExist { - // TODO: remove this hack. - // This is kubelet making sure a slice exists (see - // https://github.com/opencontainers/runc/pull/1124). - return nil - } - if retry { - // In case a unit with the same name exists, this may - // be a leftover failed unit. Reset it, so systemd can - // remove it, and retry once. - err = resetFailedUnit(cm, unitName) - if err != nil { - logrus.Warnf("unable to reset failed unit: %v", err) - } - retry = false - goto retry - } - return err - } - - timeout := time.NewTimer(30 * time.Second) - defer timeout.Stop() - - select { - case s := <-statusChan: - close(statusChan) - // Please refer to https://pkg.go.dev/github.com/coreos/go-systemd/v22/dbus#Conn.StartUnit - if s != "done" { - _ = resetFailedUnit(cm, unitName) - return fmt.Errorf("error creating systemd unit `%s`: got `%s`", unitName, s) - } - case <-timeout.C: - _ = resetFailedUnit(cm, unitName) - return errors.New("Timeout waiting for systemd to create " + unitName) - } - - return nil -} - -func stopUnit(cm *dbusConnManager, unitName string) error { - statusChan := make(chan string, 1) - err := cm.retryOnDisconnect(func(c *systemdDbus.Conn) error { - _, err := c.StopUnitContext(context.TODO(), unitName, "replace", statusChan) - return err - }) - if err == nil { - timeout := time.NewTimer(30 * time.Second) - defer timeout.Stop() - - select { - case s := <-statusChan: - close(statusChan) - // Please refer to https://godoc.org/github.com/coreos/go-systemd/v22/dbus#Conn.StartUnit - if s != "done" { - logrus.Warnf("error removing unit `%s`: got `%s`. Continuing...", unitName, s) - } - case <-timeout.C: - return errors.New("Timed out while waiting for systemd to remove " + unitName) - } - } - - // In case of a failed unit, let systemd remove it. - _ = resetFailedUnit(cm, unitName) - - return nil -} - -func resetFailedUnit(cm *dbusConnManager, name string) error { - return cm.retryOnDisconnect(func(c *systemdDbus.Conn) error { - return c.ResetFailedUnitContext(context.TODO(), name) - }) -} - -func getUnitTypeProperty(cm *dbusConnManager, unitName string, unitType string, propertyName string) (*systemdDbus.Property, error) { - var prop *systemdDbus.Property - err := cm.retryOnDisconnect(func(c *systemdDbus.Conn) (Err error) { - prop, Err = c.GetUnitTypePropertyContext(context.TODO(), unitName, unitType, propertyName) - return Err - }) - return prop, err -} - -func setUnitProperties(cm *dbusConnManager, name string, properties ...systemdDbus.Property) error { - return cm.retryOnDisconnect(func(c *systemdDbus.Conn) error { - return c.SetUnitPropertiesContext(context.TODO(), name, true, properties...) - }) -} - -func getManagerProperty(cm *dbusConnManager, name string) (string, error) { - str := "" - err := cm.retryOnDisconnect(func(c *systemdDbus.Conn) error { - var err error - str, err = c.GetManagerProperty(name) - return err - }) - if err != nil { - return "", err - } - return strconv.Unquote(str) -} - -func systemdVersion(cm *dbusConnManager) int { - versionOnce.Do(func() { - version = -1 - verStr, err := getManagerProperty(cm, "Version") - if err == nil { - version, err = systemdVersionAtoi(verStr) - } - - if err != nil { - logrus.WithError(err).Error("unable to get systemd version") - } - }) - - return version -} - -func systemdVersionAtoi(verStr string) (int, error) { - // verStr should be of the form: - // "v245.4-1.fc32", "245", "v245-1.fc32", "245-1.fc32" (without quotes). - // The result for all of the above should be 245. - // Thus, we unconditionally remove the "v" prefix - // and then match on the first integer we can grab. - re := regexp.MustCompile(`v?([0-9]+)`) - matches := re.FindStringSubmatch(verStr) - if len(matches) < 2 { - return 0, fmt.Errorf("can't parse version %s: incorrect number of matches %v", verStr, matches) - } - ver, err := strconv.Atoi(matches[1]) - if err != nil { - return -1, fmt.Errorf("can't parse version: %w", err) - } - return ver, nil -} - -func addCpuQuota(cm *dbusConnManager, properties *[]systemdDbus.Property, quota int64, period uint64) { - if period != 0 { - // systemd only supports CPUQuotaPeriodUSec since v242 - sdVer := systemdVersion(cm) - if sdVer >= 242 { - *properties = append(*properties, - newProp("CPUQuotaPeriodUSec", period)) - } else { - logrus.Debugf("systemd v%d is too old to support CPUQuotaPeriodSec "+ - " (setting will still be applied to cgroupfs)", sdVer) - } - } - if quota != 0 || period != 0 { - // corresponds to USEC_INFINITY in systemd - cpuQuotaPerSecUSec := uint64(math.MaxUint64) - if quota > 0 { - if period == 0 { - // assume the default - period = defCPUQuotaPeriod - } - // systemd converts CPUQuotaPerSecUSec (microseconds per CPU second) to CPUQuota - // (integer percentage of CPU) internally. This means that if a fractional percent of - // CPU is indicated by Resources.CpuQuota, we need to round up to the nearest - // 10ms (1% of a second) such that child cgroups can set the cpu.cfs_quota_us they expect. - cpuQuotaPerSecUSec = uint64(quota*1000000) / period - if cpuQuotaPerSecUSec%10000 != 0 { - cpuQuotaPerSecUSec = ((cpuQuotaPerSecUSec / 10000) + 1) * 10000 - } - } - *properties = append(*properties, - newProp("CPUQuotaPerSecUSec", cpuQuotaPerSecUSec)) - } -} - -func addCpuset(cm *dbusConnManager, props *[]systemdDbus.Property, cpus, mems string) error { - if cpus == "" && mems == "" { - return nil - } - - // systemd only supports AllowedCPUs/AllowedMemoryNodes since v244 - sdVer := systemdVersion(cm) - if sdVer < 244 { - logrus.Debugf("systemd v%d is too old to support AllowedCPUs/AllowedMemoryNodes"+ - " (settings will still be applied to cgroupfs)", sdVer) - return nil - } - - if cpus != "" { - bits, err := RangeToBits(cpus) - if err != nil { - return fmt.Errorf("resources.CPU.Cpus=%q conversion error: %w", - cpus, err) - } - *props = append(*props, - newProp("AllowedCPUs", bits)) - } - if mems != "" { - bits, err := RangeToBits(mems) - if err != nil { - return fmt.Errorf("resources.CPU.Mems=%q conversion error: %w", - mems, err) - } - *props = append(*props, - newProp("AllowedMemoryNodes", bits)) - } - return nil -} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/cpuset.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/cpuset.go deleted file mode 100644 index dd474cf1b..000000000 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/cpuset.go +++ /dev/null @@ -1,60 +0,0 @@ -package systemd - -import ( - "errors" - "math/big" - "strconv" - "strings" -) - -// RangeToBits converts a text representation of a CPU mask (as written to -// or read from cgroups' cpuset.* files, e.g. "1,3-5") to a slice of bytes -// with the corresponding bits set (as consumed by systemd over dbus as -// AllowedCPUs/AllowedMemoryNodes unit property value). -func RangeToBits(str string) ([]byte, error) { - bits := new(big.Int) - - for _, r := range strings.Split(str, ",") { - // allow extra spaces around - r = strings.TrimSpace(r) - // allow empty elements (extra commas) - if r == "" { - continue - } - ranges := strings.SplitN(r, "-", 2) - if len(ranges) > 1 { - start, err := strconv.ParseUint(ranges[0], 10, 32) - if err != nil { - return nil, err - } - end, err := strconv.ParseUint(ranges[1], 10, 32) - if err != nil { - return nil, err - } - if start > end { - return nil, errors.New("invalid range: " + r) - } - for i := start; i <= end; i++ { - bits.SetBit(bits, int(i), 1) - } - } else { - val, err := strconv.ParseUint(ranges[0], 10, 32) - if err != nil { - return nil, err - } - bits.SetBit(bits, int(val), 1) - } - } - - ret := bits.Bytes() - if len(ret) == 0 { - // do not allow empty values - return nil, errors.New("empty value") - } - - // fit cpuset parsing order in systemd - for l, r := 0, len(ret)-1; l < r; l, r = l+1, r-1 { - ret[l], ret[r] = ret[r], ret[l] - } - return ret, nil -} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/dbus.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/dbus.go deleted file mode 100644 index bb87ae83a..000000000 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/dbus.go +++ /dev/null @@ -1,102 +0,0 @@ -package systemd - -import ( - "context" - "errors" - "fmt" - "sync" - - systemdDbus "github.com/coreos/go-systemd/v22/dbus" - dbus "github.com/godbus/dbus/v5" -) - -var ( - dbusC *systemdDbus.Conn - dbusMu sync.RWMutex - dbusInited bool - dbusRootless bool -) - -type dbusConnManager struct{} - -// newDbusConnManager initializes systemd dbus connection manager. -func newDbusConnManager(rootless bool) *dbusConnManager { - dbusMu.Lock() - defer dbusMu.Unlock() - if dbusInited && rootless != dbusRootless { - panic("can't have both root and rootless dbus") - } - dbusInited = true - dbusRootless = rootless - return &dbusConnManager{} -} - -// getConnection lazily initializes and returns systemd dbus connection. -func (d *dbusConnManager) getConnection() (*systemdDbus.Conn, error) { - // In the case where dbusC != nil - // Use the read lock the first time to ensure - // that Conn can be acquired at the same time. - dbusMu.RLock() - if conn := dbusC; conn != nil { - dbusMu.RUnlock() - return conn, nil - } - dbusMu.RUnlock() - - // In the case where dbusC == nil - // Use write lock to ensure that only one - // will be created - dbusMu.Lock() - defer dbusMu.Unlock() - if conn := dbusC; conn != nil { - return conn, nil - } - - conn, err := d.newConnection() - if err != nil { - // When dbus-user-session is not installed, we can't detect whether we should try to connect to user dbus or system dbus, so d.dbusRootless is set to false. - // This may fail with a cryptic error "read unix @->/run/systemd/private: read: connection reset by peer: unknown." - // https://github.com/moby/moby/issues/42793 - return nil, fmt.Errorf("failed to connect to dbus (hint: for rootless containers, maybe you need to install dbus-user-session package, see https://github.com/opencontainers/runc/blob/master/docs/cgroup-v2.md): %w", err) - } - dbusC = conn - return conn, nil -} - -func (d *dbusConnManager) newConnection() (*systemdDbus.Conn, error) { - if dbusRootless { - return newUserSystemdDbus() - } - return systemdDbus.NewWithContext(context.TODO()) -} - -// resetConnection resets the connection to its initial state -// (so it can be reconnected if necessary). -func (d *dbusConnManager) resetConnection(conn *systemdDbus.Conn) { - dbusMu.Lock() - defer dbusMu.Unlock() - if dbusC != nil && dbusC == conn { - dbusC.Close() - dbusC = nil - } -} - -// retryOnDisconnect calls op, and if the error it returns is about closed dbus -// connection, the connection is re-established and the op is retried. This helps -// with the situation when dbus is restarted and we have a stale connection. -func (d *dbusConnManager) retryOnDisconnect(op func(*systemdDbus.Conn) error) error { - for { - conn, err := d.getConnection() - if err != nil { - return err - } - err = op(conn) - if err == nil { - return nil - } - if !errors.Is(err, dbus.ErrClosed) { - return err - } - d.resetConnection(conn) - } -} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/user.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/user.go deleted file mode 100644 index 0f50f76ee..000000000 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/user.go +++ /dev/null @@ -1,106 +0,0 @@ -package systemd - -import ( - "bufio" - "bytes" - "errors" - "fmt" - "os" - "os/exec" - "path/filepath" - "strconv" - "strings" - - systemdDbus "github.com/coreos/go-systemd/v22/dbus" - dbus "github.com/godbus/dbus/v5" - - "github.com/opencontainers/runc/libcontainer/userns" -) - -// newUserSystemdDbus creates a connection for systemd user-instance. -func newUserSystemdDbus() (*systemdDbus.Conn, error) { - addr, err := DetectUserDbusSessionBusAddress() - if err != nil { - return nil, err - } - uid, err := DetectUID() - if err != nil { - return nil, err - } - - return systemdDbus.NewConnection(func() (*dbus.Conn, error) { - conn, err := dbus.Dial(addr) - if err != nil { - return nil, fmt.Errorf("error while dialing %q: %w", addr, err) - } - methods := []dbus.Auth{dbus.AuthExternal(strconv.Itoa(uid))} - err = conn.Auth(methods) - if err != nil { - conn.Close() - return nil, fmt.Errorf("error while authenticating connection (address=%q, UID=%d): %w", addr, uid, err) - } - if err = conn.Hello(); err != nil { - conn.Close() - return nil, fmt.Errorf("error while sending Hello message (address=%q, UID=%d): %w", addr, uid, err) - } - return conn, nil - }) -} - -// DetectUID detects UID from the OwnerUID field of `busctl --user status` -// if running in userNS. The value corresponds to sd_bus_creds_get_owner_uid(3) . -// -// Otherwise returns os.Getuid() . -func DetectUID() (int, error) { - if !userns.RunningInUserNS() { - return os.Getuid(), nil - } - b, err := exec.Command("busctl", "--user", "--no-pager", "status").CombinedOutput() - if err != nil { - return -1, fmt.Errorf("could not execute `busctl --user --no-pager status` (output: %q): %w", string(b), err) - } - scanner := bufio.NewScanner(bytes.NewReader(b)) - for scanner.Scan() { - s := strings.TrimSpace(scanner.Text()) - if strings.HasPrefix(s, "OwnerUID=") { - uidStr := strings.TrimPrefix(s, "OwnerUID=") - i, err := strconv.Atoi(uidStr) - if err != nil { - return -1, fmt.Errorf("could not detect the OwnerUID: %w", err) - } - return i, nil - } - } - if err := scanner.Err(); err != nil { - return -1, err - } - return -1, errors.New("could not detect the OwnerUID") -} - -// DetectUserDbusSessionBusAddress returns $DBUS_SESSION_BUS_ADDRESS if set. -// Otherwise returns "unix:path=$XDG_RUNTIME_DIR/bus" if $XDG_RUNTIME_DIR/bus exists. -// Otherwise parses the value from `systemctl --user show-environment` . -func DetectUserDbusSessionBusAddress() (string, error) { - if env := os.Getenv("DBUS_SESSION_BUS_ADDRESS"); env != "" { - return env, nil - } - if xdr := os.Getenv("XDG_RUNTIME_DIR"); xdr != "" { - busPath := filepath.Join(xdr, "bus") - if _, err := os.Stat(busPath); err == nil { - busAddress := "unix:path=" + busPath - return busAddress, nil - } - } - b, err := exec.Command("systemctl", "--user", "--no-pager", "show-environment").CombinedOutput() - if err != nil { - return "", fmt.Errorf("could not execute `systemctl --user --no-pager show-environment` (output=%q): %w", string(b), err) - } - scanner := bufio.NewScanner(bytes.NewReader(b)) - for scanner.Scan() { - s := strings.TrimSpace(scanner.Text()) - if strings.HasPrefix(s, "DBUS_SESSION_BUS_ADDRESS=") { - return strings.TrimPrefix(s, "DBUS_SESSION_BUS_ADDRESS="), nil - } - } - return "", errors.New("could not detect DBUS_SESSION_BUS_ADDRESS from `systemctl --user --no-pager show-environment`. Make sure you have installed the dbus-user-session or dbus-daemon package and then run: `systemctl --user start dbus`") -} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/v1.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/v1.go deleted file mode 100644 index a574552da..000000000 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/v1.go +++ /dev/null @@ -1,480 +0,0 @@ -package systemd - -import ( - "errors" - "os" - "path/filepath" - "reflect" - "strings" - "sync" - - systemdDbus "github.com/coreos/go-systemd/v22/dbus" - "github.com/godbus/dbus/v5" - "github.com/sirupsen/logrus" - - "github.com/opencontainers/runc/libcontainer/cgroups" - "github.com/opencontainers/runc/libcontainer/cgroups/fs" - "github.com/opencontainers/runc/libcontainer/configs" -) - -type legacyManager struct { - mu sync.Mutex - cgroups *configs.Cgroup - paths map[string]string - dbus *dbusConnManager -} - -func NewLegacyManager(cg *configs.Cgroup, paths map[string]string) (cgroups.Manager, error) { - if cg.Rootless { - return nil, errors.New("cannot use rootless systemd cgroups manager on cgroup v1") - } - if cg.Resources != nil && cg.Resources.Unified != nil { - return nil, cgroups.ErrV1NoUnified - } - if paths == nil { - var err error - paths, err = initPaths(cg) - if err != nil { - return nil, err - } - } - return &legacyManager{ - cgroups: cg, - paths: paths, - dbus: newDbusConnManager(false), - }, nil -} - -type subsystem interface { - // Name returns the name of the subsystem. - Name() string - // Returns the stats, as 'stats', corresponding to the cgroup under 'path'. - GetStats(path string, stats *cgroups.Stats) error - // Set sets cgroup resource limits. - Set(path string, r *configs.Resources) error -} - -var errSubsystemDoesNotExist = errors.New("cgroup: subsystem does not exist") - -var legacySubsystems = []subsystem{ - &fs.CpusetGroup{}, - &fs.DevicesGroup{}, - &fs.MemoryGroup{}, - &fs.CpuGroup{}, - &fs.CpuacctGroup{}, - &fs.PidsGroup{}, - &fs.BlkioGroup{}, - &fs.HugetlbGroup{}, - &fs.PerfEventGroup{}, - &fs.FreezerGroup{}, - &fs.NetPrioGroup{}, - &fs.NetClsGroup{}, - &fs.NameGroup{GroupName: "name=systemd"}, - &fs.RdmaGroup{}, - &fs.NameGroup{GroupName: "misc"}, -} - -func genV1ResourcesProperties(r *configs.Resources, cm *dbusConnManager) ([]systemdDbus.Property, error) { - var properties []systemdDbus.Property - - deviceProperties, err := generateDeviceProperties(r, systemdVersion(cm)) - if err != nil { - return nil, err - } - properties = append(properties, deviceProperties...) - - if r.Memory != 0 { - properties = append(properties, - newProp("MemoryLimit", uint64(r.Memory))) - } - - if r.CpuShares != 0 { - properties = append(properties, - newProp("CPUShares", r.CpuShares)) - } - - addCpuQuota(cm, &properties, r.CpuQuota, r.CpuPeriod) - - if r.BlkioWeight != 0 { - properties = append(properties, - newProp("BlockIOWeight", uint64(r.BlkioWeight))) - } - - if r.PidsLimit > 0 || r.PidsLimit == -1 { - properties = append(properties, - newProp("TasksMax", uint64(r.PidsLimit))) - } - - err = addCpuset(cm, &properties, r.CpusetCpus, r.CpusetMems) - if err != nil { - return nil, err - } - - return properties, nil -} - -// initPaths figures out and returns paths to cgroups. -func initPaths(c *configs.Cgroup) (map[string]string, error) { - slice := "system.slice" - if c.Parent != "" { - var err error - slice, err = ExpandSlice(c.Parent) - if err != nil { - return nil, err - } - } - - unit := getUnitName(c) - - paths := make(map[string]string) - for _, s := range legacySubsystems { - subsystemPath, err := getSubsystemPath(slice, unit, s.Name()) - if err != nil { - // Even if it's `not found` error, we'll return err - // because devices cgroup is hard requirement for - // container security. - if s.Name() == "devices" { - return nil, err - } - // Don't fail if a cgroup hierarchy was not found, just skip this subsystem - if cgroups.IsNotFound(err) { - continue - } - return nil, err - } - paths[s.Name()] = subsystemPath - } - - // If systemd is using cgroups-hybrid mode then add the slice path of - // this container to the paths so the following process executed with - // "runc exec" joins that cgroup as well. - if cgroups.IsCgroup2HybridMode() { - // "" means cgroup-hybrid path - cgroupsHybridPath, err := getSubsystemPath(slice, unit, "") - if err != nil && cgroups.IsNotFound(err) { - return nil, err - } - paths[""] = cgroupsHybridPath - } - - return paths, nil -} - -func (m *legacyManager) Apply(pid int) error { - var ( - c = m.cgroups - unitName = getUnitName(c) - slice = "system.slice" - properties []systemdDbus.Property - ) - - m.mu.Lock() - defer m.mu.Unlock() - - if c.Parent != "" { - slice = c.Parent - } - - properties = append(properties, systemdDbus.PropDescription("libcontainer container "+c.Name)) - - if strings.HasSuffix(unitName, ".slice") { - // If we create a slice, the parent is defined via a Wants=. - properties = append(properties, systemdDbus.PropWants(slice)) - } else { - // Otherwise it's a scope, which we put into a Slice=. - properties = append(properties, systemdDbus.PropSlice(slice)) - // Assume scopes always support delegation (supported since systemd v218). - properties = append(properties, newProp("Delegate", true)) - } - - // only add pid if its valid, -1 is used w/ general slice creation. - if pid != -1 { - properties = append(properties, newProp("PIDs", []uint32{uint32(pid)})) - } - - // Always enable accounting, this gets us the same behaviour as the fs implementation, - // plus the kernel has some problems with joining the memory cgroup at a later time. - properties = append(properties, - newProp("MemoryAccounting", true), - newProp("CPUAccounting", true), - newProp("BlockIOAccounting", true), - newProp("TasksAccounting", true), - ) - - // Assume DefaultDependencies= will always work (the check for it was previously broken.) - properties = append(properties, - newProp("DefaultDependencies", false)) - - properties = append(properties, c.SystemdProps...) - - if err := startUnit(m.dbus, unitName, properties, pid == -1); err != nil { - return err - } - - if err := m.joinCgroups(pid); err != nil { - return err - } - - return nil -} - -func (m *legacyManager) Destroy() error { - m.mu.Lock() - defer m.mu.Unlock() - - stopErr := stopUnit(m.dbus, getUnitName(m.cgroups)) - - // Both on success and on error, cleanup all the cgroups - // we are aware of, as some of them were created directly - // by Apply() and are not managed by systemd. - if err := cgroups.RemovePaths(m.paths); err != nil && stopErr == nil { - return err - } - - return stopErr -} - -func (m *legacyManager) Path(subsys string) string { - m.mu.Lock() - defer m.mu.Unlock() - return m.paths[subsys] -} - -func (m *legacyManager) joinCgroups(pid int) error { - for _, sys := range legacySubsystems { - name := sys.Name() - switch name { - case "name=systemd": - // let systemd handle this - case "cpuset": - if path, ok := m.paths[name]; ok { - s := &fs.CpusetGroup{} - if err := s.ApplyDir(path, m.cgroups.Resources, pid); err != nil { - return err - } - } - default: - if path, ok := m.paths[name]; ok { - if err := os.MkdirAll(path, 0o755); err != nil { - return err - } - if err := cgroups.WriteCgroupProc(path, pid); err != nil { - return err - } - } - } - } - - return nil -} - -func getSubsystemPath(slice, unit, subsystem string) (string, error) { - mountpoint, err := cgroups.FindCgroupMountpoint("", subsystem) - if err != nil { - return "", err - } - - return filepath.Join(mountpoint, slice, unit), nil -} - -func (m *legacyManager) Freeze(state configs.FreezerState) error { - err := m.doFreeze(state) - if err == nil { - m.cgroups.Resources.Freezer = state - } - return err -} - -// doFreeze is the same as Freeze but without -// changing the m.cgroups.Resources.Frozen field. -func (m *legacyManager) doFreeze(state configs.FreezerState) error { - path, ok := m.paths["freezer"] - if !ok { - return errSubsystemDoesNotExist - } - freezer := &fs.FreezerGroup{} - resources := &configs.Resources{Freezer: state} - return freezer.Set(path, resources) -} - -func (m *legacyManager) GetPids() ([]int, error) { - path, ok := m.paths["devices"] - if !ok { - return nil, errSubsystemDoesNotExist - } - return cgroups.GetPids(path) -} - -func (m *legacyManager) GetAllPids() ([]int, error) { - path, ok := m.paths["devices"] - if !ok { - return nil, errSubsystemDoesNotExist - } - return cgroups.GetAllPids(path) -} - -func (m *legacyManager) GetStats() (*cgroups.Stats, error) { - m.mu.Lock() - defer m.mu.Unlock() - stats := cgroups.NewStats() - for _, sys := range legacySubsystems { - path := m.paths[sys.Name()] - if path == "" { - continue - } - if err := sys.GetStats(path, stats); err != nil { - return nil, err - } - } - - return stats, nil -} - -// freezeBeforeSet answers whether there is a need to freeze the cgroup before -// applying its systemd unit properties, and thaw after, while avoiding -// unnecessary freezer state changes. -// -// The reason why we have to freeze is that systemd's application of device -// rules is done disruptively, resulting in spurious errors to common devices -// (unlike our fs driver, they will happily write deny-all rules to running -// containers). So we have to freeze the container to avoid the container get -// an occasional "permission denied" error. -func (m *legacyManager) freezeBeforeSet(unitName string, r *configs.Resources) (needsFreeze, needsThaw bool, err error) { - // Special case for SkipDevices, as used by Kubernetes to create pod - // cgroups with allow-all device policy). - if r.SkipDevices { - if r.SkipFreezeOnSet { - // Both needsFreeze and needsThaw are false. - return - } - - // No need to freeze if SkipDevices is set, and either - // (1) systemd unit does not (yet) exist, or - // (2) it has DevicePolicy=auto and empty DeviceAllow list. - // - // Interestingly, (1) and (2) are the same here because - // a non-existent unit returns default properties, - // and settings in (2) are the defaults. - // - // Do not return errors from getUnitTypeProperty, as they alone - // should not prevent Set from working. - - unitType := getUnitType(unitName) - - devPolicy, e := getUnitTypeProperty(m.dbus, unitName, unitType, "DevicePolicy") - if e == nil && devPolicy.Value == dbus.MakeVariant("auto") { - devAllow, e := getUnitTypeProperty(m.dbus, unitName, unitType, "DeviceAllow") - if e == nil { - if rv := reflect.ValueOf(devAllow.Value.Value()); rv.Kind() == reflect.Slice && rv.Len() == 0 { - needsFreeze = false - needsThaw = false - return - } - } - } - } - - needsFreeze = true - needsThaw = true - - // Check the current freezer state. - freezerState, err := m.GetFreezerState() - if err != nil { - return - } - if freezerState == configs.Frozen { - // Already frozen, and should stay frozen. - needsFreeze = false - needsThaw = false - } - - if r.Freezer == configs.Frozen { - // Will be frozen anyway -- no need to thaw. - needsThaw = false - } - return -} - -func (m *legacyManager) Set(r *configs.Resources) error { - if r == nil { - return nil - } - if r.Unified != nil { - return cgroups.ErrV1NoUnified - } - properties, err := genV1ResourcesProperties(r, m.dbus) - if err != nil { - return err - } - - unitName := getUnitName(m.cgroups) - needsFreeze, needsThaw, err := m.freezeBeforeSet(unitName, r) - if err != nil { - return err - } - - if needsFreeze { - if err := m.doFreeze(configs.Frozen); err != nil { - // If freezer cgroup isn't supported, we just warn about it. - logrus.Infof("freeze container before SetUnitProperties failed: %v", err) - // skip update the cgroup while frozen failed. #3803 - if !errors.Is(err, errSubsystemDoesNotExist) { - if needsThaw { - if thawErr := m.doFreeze(configs.Thawed); thawErr != nil { - logrus.Infof("thaw container after doFreeze failed: %v", thawErr) - } - } - return err - } - } - } - setErr := setUnitProperties(m.dbus, unitName, properties...) - if needsThaw { - if err := m.doFreeze(configs.Thawed); err != nil { - logrus.Infof("thaw container after SetUnitProperties failed: %v", err) - } - } - if setErr != nil { - return setErr - } - - for _, sys := range legacySubsystems { - // Get the subsystem path, but don't error out for not found cgroups. - path, ok := m.paths[sys.Name()] - if !ok { - continue - } - if err := sys.Set(path, r); err != nil { - return err - } - } - - return nil -} - -func (m *legacyManager) GetPaths() map[string]string { - m.mu.Lock() - defer m.mu.Unlock() - return m.paths -} - -func (m *legacyManager) GetCgroups() (*configs.Cgroup, error) { - return m.cgroups, nil -} - -func (m *legacyManager) GetFreezerState() (configs.FreezerState, error) { - path, ok := m.paths["freezer"] - if !ok { - return configs.Undefined, nil - } - freezer := &fs.FreezerGroup{} - return freezer.GetState(path) -} - -func (m *legacyManager) Exists() bool { - return cgroups.PathExists(m.Path("devices")) -} - -func (m *legacyManager) OOMKillCount() (uint64, error) { - return fs.OOMKillCount(m.Path("memory")) -} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/v2.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/v2.go deleted file mode 100644 index 919e5632f..000000000 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/v2.go +++ /dev/null @@ -1,472 +0,0 @@ -package systemd - -import ( - "bufio" - "errors" - "fmt" - "math" - "os" - "path/filepath" - "strconv" - "strings" - "sync" - - systemdDbus "github.com/coreos/go-systemd/v22/dbus" - securejoin "github.com/cyphar/filepath-securejoin" - "github.com/sirupsen/logrus" - - "github.com/opencontainers/runc/libcontainer/cgroups" - "github.com/opencontainers/runc/libcontainer/cgroups/fs2" - "github.com/opencontainers/runc/libcontainer/configs" -) - -type unifiedManager struct { - mu sync.Mutex - cgroups *configs.Cgroup - // path is like "/sys/fs/cgroup/user.slice/user-1001.slice/session-1.scope" - path string - dbus *dbusConnManager - fsMgr cgroups.Manager -} - -func NewUnifiedManager(config *configs.Cgroup, path string) (cgroups.Manager, error) { - m := &unifiedManager{ - cgroups: config, - path: path, - dbus: newDbusConnManager(config.Rootless), - } - if err := m.initPath(); err != nil { - return nil, err - } - - fsMgr, err := fs2.NewManager(config, m.path) - if err != nil { - return nil, err - } - m.fsMgr = fsMgr - - return m, nil -} - -// unifiedResToSystemdProps tries to convert from Cgroup.Resources.Unified -// key/value map (where key is cgroupfs file name) to systemd unit properties. -// This is on a best-effort basis, so the properties that are not known -// (to this function and/or systemd) are ignored (but logged with "debug" -// log level). -// -// For the list of keys, see https://www.kernel.org/doc/Documentation/cgroup-v2.txt -// -// For the list of systemd unit properties, see systemd.resource-control(5). -func unifiedResToSystemdProps(cm *dbusConnManager, res map[string]string) (props []systemdDbus.Property, _ error) { - var err error - - for k, v := range res { - if strings.Contains(k, "/") { - return nil, fmt.Errorf("unified resource %q must be a file name (no slashes)", k) - } - sk := strings.SplitN(k, ".", 2) - if len(sk) != 2 { - return nil, fmt.Errorf("unified resource %q must be in the form CONTROLLER.PARAMETER", k) - } - // Kernel is quite forgiving to extra whitespace - // around the value, and so should we. - v = strings.TrimSpace(v) - // Please keep cases in alphabetical order. - switch k { - case "cpu.max": - // value: quota [period] - quota := int64(0) // 0 means "unlimited" for addCpuQuota, if period is set - period := defCPUQuotaPeriod - sv := strings.Fields(v) - if len(sv) < 1 || len(sv) > 2 { - return nil, fmt.Errorf("unified resource %q value invalid: %q", k, v) - } - // quota - if sv[0] != "max" { - quota, err = strconv.ParseInt(sv[0], 10, 64) - if err != nil { - return nil, fmt.Errorf("unified resource %q period value conversion error: %w", k, err) - } - } - // period - if len(sv) == 2 { - period, err = strconv.ParseUint(sv[1], 10, 64) - if err != nil { - return nil, fmt.Errorf("unified resource %q quota value conversion error: %w", k, err) - } - } - addCpuQuota(cm, &props, quota, period) - - case "cpu.weight": - num, err := strconv.ParseUint(v, 10, 64) - if err != nil { - return nil, fmt.Errorf("unified resource %q value conversion error: %w", k, err) - } - props = append(props, - newProp("CPUWeight", num)) - - case "cpuset.cpus", "cpuset.mems": - bits, err := RangeToBits(v) - if err != nil { - return nil, fmt.Errorf("unified resource %q=%q conversion error: %w", k, v, err) - } - m := map[string]string{ - "cpuset.cpus": "AllowedCPUs", - "cpuset.mems": "AllowedMemoryNodes", - } - // systemd only supports these properties since v244 - sdVer := systemdVersion(cm) - if sdVer >= 244 { - props = append(props, - newProp(m[k], bits)) - } else { - logrus.Debugf("systemd v%d is too old to support %s"+ - " (setting will still be applied to cgroupfs)", - sdVer, m[k]) - } - - case "memory.high", "memory.low", "memory.min", "memory.max", "memory.swap.max": - num := uint64(math.MaxUint64) - if v != "max" { - num, err = strconv.ParseUint(v, 10, 64) - if err != nil { - return nil, fmt.Errorf("unified resource %q value conversion error: %w", k, err) - } - } - m := map[string]string{ - "memory.high": "MemoryHigh", - "memory.low": "MemoryLow", - "memory.min": "MemoryMin", - "memory.max": "MemoryMax", - "memory.swap.max": "MemorySwapMax", - } - props = append(props, - newProp(m[k], num)) - - case "pids.max": - num := uint64(math.MaxUint64) - if v != "max" { - var err error - num, err = strconv.ParseUint(v, 10, 64) - if err != nil { - return nil, fmt.Errorf("unified resource %q value conversion error: %w", k, err) - } - } - props = append(props, - newProp("TasksMax", num)) - - case "memory.oom.group": - // Setting this to 1 is roughly equivalent to OOMPolicy=kill - // (as per systemd.service(5) and - // https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html), - // but it's not clear what to do if it is unset or set - // to 0 in runc update, as there are two other possible - // values for OOMPolicy (continue/stop). - fallthrough - - default: - // Ignore the unknown resource here -- will still be - // applied in Set which calls fs2.Set. - logrus.Debugf("don't know how to convert unified resource %q=%q to systemd unit property; skipping (will still be applied to cgroupfs)", k, v) - } - } - - return props, nil -} - -func genV2ResourcesProperties(r *configs.Resources, cm *dbusConnManager) ([]systemdDbus.Property, error) { - var properties []systemdDbus.Property - - // NOTE: This is of questionable correctness because we insert our own - // devices eBPF program later. Two programs with identical rules - // aren't the end of the world, but it is a bit concerning. However - // it's unclear if systemd removes all eBPF programs attached when - // doing SetUnitProperties... - deviceProperties, err := generateDeviceProperties(r, systemdVersion(cm)) - if err != nil { - return nil, err - } - properties = append(properties, deviceProperties...) - - if r.Memory != 0 { - properties = append(properties, - newProp("MemoryMax", uint64(r.Memory))) - } - if r.MemoryReservation != 0 { - properties = append(properties, - newProp("MemoryLow", uint64(r.MemoryReservation))) - } - - swap, err := cgroups.ConvertMemorySwapToCgroupV2Value(r.MemorySwap, r.Memory) - if err != nil { - return nil, err - } - if swap != 0 { - properties = append(properties, - newProp("MemorySwapMax", uint64(swap))) - } - - if r.CpuWeight != 0 { - properties = append(properties, - newProp("CPUWeight", r.CpuWeight)) - } - - addCpuQuota(cm, &properties, r.CpuQuota, r.CpuPeriod) - - if r.PidsLimit > 0 || r.PidsLimit == -1 { - properties = append(properties, - newProp("TasksMax", uint64(r.PidsLimit))) - } - - err = addCpuset(cm, &properties, r.CpusetCpus, r.CpusetMems) - if err != nil { - return nil, err - } - - // ignore r.KernelMemory - - // convert Resources.Unified map to systemd properties - if r.Unified != nil { - unifiedProps, err := unifiedResToSystemdProps(cm, r.Unified) - if err != nil { - return nil, err - } - properties = append(properties, unifiedProps...) - } - - return properties, nil -} - -func (m *unifiedManager) Apply(pid int) error { - var ( - c = m.cgroups - unitName = getUnitName(c) - properties []systemdDbus.Property - ) - - slice := "system.slice" - if m.cgroups.Rootless { - slice = "user.slice" - } - if c.Parent != "" { - slice = c.Parent - } - - properties = append(properties, systemdDbus.PropDescription("libcontainer container "+c.Name)) - - if strings.HasSuffix(unitName, ".slice") { - // If we create a slice, the parent is defined via a Wants=. - properties = append(properties, systemdDbus.PropWants(slice)) - } else { - // Otherwise it's a scope, which we put into a Slice=. - properties = append(properties, systemdDbus.PropSlice(slice)) - // Assume scopes always support delegation (supported since systemd v218). - properties = append(properties, newProp("Delegate", true)) - } - - // only add pid if its valid, -1 is used w/ general slice creation. - if pid != -1 { - properties = append(properties, newProp("PIDs", []uint32{uint32(pid)})) - } - - // Always enable accounting, this gets us the same behaviour as the fs implementation, - // plus the kernel has some problems with joining the memory cgroup at a later time. - properties = append(properties, - newProp("MemoryAccounting", true), - newProp("CPUAccounting", true), - newProp("IOAccounting", true), - newProp("TasksAccounting", true), - ) - - // Assume DefaultDependencies= will always work (the check for it was previously broken.) - properties = append(properties, - newProp("DefaultDependencies", false)) - - properties = append(properties, c.SystemdProps...) - - if err := startUnit(m.dbus, unitName, properties, pid == -1); err != nil { - return fmt.Errorf("unable to start unit %q (properties %+v): %w", unitName, properties, err) - } - - if err := fs2.CreateCgroupPath(m.path, m.cgroups); err != nil { - return err - } - - if c.OwnerUID != nil { - // The directory itself must be chowned. - err := os.Chown(m.path, *c.OwnerUID, -1) - if err != nil { - return err - } - - filesToChown, err := cgroupFilesToChown() - if err != nil { - return err - } - - for _, v := range filesToChown { - err := os.Chown(m.path+"/"+v, *c.OwnerUID, -1) - // Some files might not be present. - if err != nil && !errors.Is(err, os.ErrNotExist) { - return err - } - } - } - - return nil -} - -// The kernel exposes a list of files that should be chowned to the delegate -// uid in /sys/kernel/cgroup/delegate. If the file is not present -// (Linux < 4.15), use the initial values mentioned in cgroups(7). -func cgroupFilesToChown() ([]string, error) { - const cgroupDelegateFile = "/sys/kernel/cgroup/delegate" - - f, err := os.Open(cgroupDelegateFile) - if err != nil { - return []string{"cgroup.procs", "cgroup.subtree_control", "cgroup.threads"}, nil - } - defer f.Close() - - filesToChown := []string{} - scanner := bufio.NewScanner(f) - for scanner.Scan() { - filesToChown = append(filesToChown, scanner.Text()) - } - if err := scanner.Err(); err != nil { - return nil, fmt.Errorf("error reading %s: %w", cgroupDelegateFile, err) - } - - return filesToChown, nil -} - -func (m *unifiedManager) Destroy() error { - m.mu.Lock() - defer m.mu.Unlock() - - unitName := getUnitName(m.cgroups) - if err := stopUnit(m.dbus, unitName); err != nil { - return err - } - - // systemd 239 do not remove sub-cgroups. - err := m.fsMgr.Destroy() - // fsMgr.Destroy has handled ErrNotExist - if err != nil { - return err - } - - return nil -} - -func (m *unifiedManager) Path(_ string) string { - return m.path -} - -// getSliceFull value is used in initPath. -// The value is incompatible with systemdDbus.PropSlice. -func (m *unifiedManager) getSliceFull() (string, error) { - c := m.cgroups - slice := "system.slice" - if c.Rootless { - slice = "user.slice" - } - if c.Parent != "" { - var err error - slice, err = ExpandSlice(c.Parent) - if err != nil { - return "", err - } - } - - if c.Rootless { - // managerCG is typically "/user.slice/user-${uid}.slice/user@${uid}.service". - managerCG, err := getManagerProperty(m.dbus, "ControlGroup") - if err != nil { - return "", err - } - slice = filepath.Join(managerCG, slice) - } - - // an example of the final slice in rootless: "/user.slice/user-1001.slice/user@1001.service/user.slice" - // NOTE: systemdDbus.PropSlice requires the "/user.slice/user-1001.slice/user@1001.service/" prefix NOT to be specified. - return slice, nil -} - -func (m *unifiedManager) initPath() error { - if m.path != "" { - return nil - } - - sliceFull, err := m.getSliceFull() - if err != nil { - return err - } - - c := m.cgroups - path := filepath.Join(sliceFull, getUnitName(c)) - path, err = securejoin.SecureJoin(fs2.UnifiedMountpoint, path) - if err != nil { - return err - } - - // an example of the final path in rootless: - // "/sys/fs/cgroup/user.slice/user-1001.slice/user@1001.service/user.slice/libpod-132ff0d72245e6f13a3bbc6cdc5376886897b60ac59eaa8dea1df7ab959cbf1c.scope" - m.path = path - - return nil -} - -func (m *unifiedManager) Freeze(state configs.FreezerState) error { - return m.fsMgr.Freeze(state) -} - -func (m *unifiedManager) GetPids() ([]int, error) { - return cgroups.GetPids(m.path) -} - -func (m *unifiedManager) GetAllPids() ([]int, error) { - return cgroups.GetAllPids(m.path) -} - -func (m *unifiedManager) GetStats() (*cgroups.Stats, error) { - return m.fsMgr.GetStats() -} - -func (m *unifiedManager) Set(r *configs.Resources) error { - if r == nil { - return nil - } - properties, err := genV2ResourcesProperties(r, m.dbus) - if err != nil { - return err - } - - if err := setUnitProperties(m.dbus, getUnitName(m.cgroups), properties...); err != nil { - return fmt.Errorf("unable to set unit properties: %w", err) - } - - return m.fsMgr.Set(r) -} - -func (m *unifiedManager) GetPaths() map[string]string { - paths := make(map[string]string, 1) - paths[""] = m.path - return paths -} - -func (m *unifiedManager) GetCgroups() (*configs.Cgroup, error) { - return m.cgroups, nil -} - -func (m *unifiedManager) GetFreezerState() (configs.FreezerState, error) { - return m.fsMgr.GetFreezerState() -} - -func (m *unifiedManager) Exists() bool { - return cgroups.PathExists(m.path) -} - -func (m *unifiedManager) OOMKillCount() (uint64, error) { - return m.fsMgr.OOMKillCount() -} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/utils.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/utils.go deleted file mode 100644 index fc4ae44a4..000000000 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/utils.go +++ /dev/null @@ -1,469 +0,0 @@ -package cgroups - -import ( - "bufio" - "errors" - "fmt" - "io" - "os" - "path/filepath" - "strconv" - "strings" - "sync" - "time" - - "github.com/opencontainers/runc/libcontainer/userns" - "github.com/sirupsen/logrus" - "golang.org/x/sys/unix" -) - -const ( - CgroupProcesses = "cgroup.procs" - unifiedMountpoint = "/sys/fs/cgroup" - hybridMountpoint = "/sys/fs/cgroup/unified" -) - -var ( - isUnifiedOnce sync.Once - isUnified bool - isHybridOnce sync.Once - isHybrid bool -) - -// IsCgroup2UnifiedMode returns whether we are running in cgroup v2 unified mode. -func IsCgroup2UnifiedMode() bool { - isUnifiedOnce.Do(func() { - var st unix.Statfs_t - err := unix.Statfs(unifiedMountpoint, &st) - if err != nil { - if os.IsNotExist(err) && userns.RunningInUserNS() { - // ignore the "not found" error if running in userns - logrus.WithError(err).Debugf("%s missing, assuming cgroup v1", unifiedMountpoint) - isUnified = false - return - } - panic(fmt.Sprintf("cannot statfs cgroup root: %s", err)) - } - isUnified = st.Type == unix.CGROUP2_SUPER_MAGIC - }) - return isUnified -} - -// IsCgroup2HybridMode returns whether we are running in cgroup v2 hybrid mode. -func IsCgroup2HybridMode() bool { - isHybridOnce.Do(func() { - var st unix.Statfs_t - err := unix.Statfs(hybridMountpoint, &st) - if err != nil { - isHybrid = false - if !os.IsNotExist(err) { - // Report unexpected errors. - logrus.WithError(err).Debugf("statfs(%q) failed", hybridMountpoint) - } - return - } - isHybrid = st.Type == unix.CGROUP2_SUPER_MAGIC - }) - return isHybrid -} - -type Mount struct { - Mountpoint string - Root string - Subsystems []string -} - -// GetCgroupMounts returns the mounts for the cgroup subsystems. -// all indicates whether to return just the first instance or all the mounts. -// This function should not be used from cgroupv2 code, as in this case -// all the controllers are available under the constant unifiedMountpoint. -func GetCgroupMounts(all bool) ([]Mount, error) { - if IsCgroup2UnifiedMode() { - // TODO: remove cgroupv2 case once all external users are converted - availableControllers, err := GetAllSubsystems() - if err != nil { - return nil, err - } - m := Mount{ - Mountpoint: unifiedMountpoint, - Root: unifiedMountpoint, - Subsystems: availableControllers, - } - return []Mount{m}, nil - } - - return getCgroupMountsV1(all) -} - -// GetAllSubsystems returns all the cgroup subsystems supported by the kernel -func GetAllSubsystems() ([]string, error) { - // /proc/cgroups is meaningless for v2 - // https://github.com/torvalds/linux/blob/v5.3/Documentation/admin-guide/cgroup-v2.rst#deprecated-v1-core-features - if IsCgroup2UnifiedMode() { - // "pseudo" controllers do not appear in /sys/fs/cgroup/cgroup.controllers. - // - devices: implemented in kernel 4.15 - // - freezer: implemented in kernel 5.2 - // We assume these are always available, as it is hard to detect availability. - pseudo := []string{"devices", "freezer"} - data, err := ReadFile("/sys/fs/cgroup", "cgroup.controllers") - if err != nil { - return nil, err - } - subsystems := append(pseudo, strings.Fields(data)...) - return subsystems, nil - } - f, err := os.Open("/proc/cgroups") - if err != nil { - return nil, err - } - defer f.Close() - - subsystems := []string{} - - s := bufio.NewScanner(f) - for s.Scan() { - text := s.Text() - if text[0] != '#' { - parts := strings.Fields(text) - if len(parts) >= 4 && parts[3] != "0" { - subsystems = append(subsystems, parts[0]) - } - } - } - if err := s.Err(); err != nil { - return nil, err - } - return subsystems, nil -} - -func readProcsFile(dir string) ([]int, error) { - f, err := OpenFile(dir, CgroupProcesses, os.O_RDONLY) - if err != nil { - return nil, err - } - defer f.Close() - - var ( - s = bufio.NewScanner(f) - out = []int{} - ) - - for s.Scan() { - if t := s.Text(); t != "" { - pid, err := strconv.Atoi(t) - if err != nil { - return nil, err - } - out = append(out, pid) - } - } - return out, s.Err() -} - -// ParseCgroupFile parses the given cgroup file, typically /proc/self/cgroup -// or /proc//cgroup, into a map of subsystems to cgroup paths, e.g. -// -// "cpu": "/user.slice/user-1000.slice" -// "pids": "/user.slice/user-1000.slice" -// -// etc. -// -// Note that for cgroup v2 unified hierarchy, there are no per-controller -// cgroup paths, so the resulting map will have a single element where the key -// is empty string ("") and the value is the cgroup path the is in. -func ParseCgroupFile(path string) (map[string]string, error) { - f, err := os.Open(path) - if err != nil { - return nil, err - } - defer f.Close() - - return parseCgroupFromReader(f) -} - -// helper function for ParseCgroupFile to make testing easier -func parseCgroupFromReader(r io.Reader) (map[string]string, error) { - s := bufio.NewScanner(r) - cgroups := make(map[string]string) - - for s.Scan() { - text := s.Text() - // from cgroups(7): - // /proc/[pid]/cgroup - // ... - // For each cgroup hierarchy ... there is one entry - // containing three colon-separated fields of the form: - // hierarchy-ID:subsystem-list:cgroup-path - parts := strings.SplitN(text, ":", 3) - if len(parts) < 3 { - return nil, fmt.Errorf("invalid cgroup entry: must contain at least two colons: %v", text) - } - - for _, subs := range strings.Split(parts[1], ",") { - cgroups[subs] = parts[2] - } - } - if err := s.Err(); err != nil { - return nil, err - } - - return cgroups, nil -} - -func PathExists(path string) bool { - if _, err := os.Stat(path); err != nil { - return false - } - return true -} - -func EnterPid(cgroupPaths map[string]string, pid int) error { - for _, path := range cgroupPaths { - if PathExists(path) { - if err := WriteCgroupProc(path, pid); err != nil { - return err - } - } - } - return nil -} - -func rmdir(path string) error { - err := unix.Rmdir(path) - if err == nil || err == unix.ENOENT { //nolint:errorlint // unix errors are bare - return nil - } - return &os.PathError{Op: "rmdir", Path: path, Err: err} -} - -// RemovePath aims to remove cgroup path. It does so recursively, -// by removing any subdirectories (sub-cgroups) first. -func RemovePath(path string) error { - // try the fast path first - if err := rmdir(path); err == nil { - return nil - } - - infos, err := os.ReadDir(path) - if err != nil { - if os.IsNotExist(err) { - err = nil - } - return err - } - for _, info := range infos { - if info.IsDir() { - // We should remove subcgroups dir first - if err = RemovePath(filepath.Join(path, info.Name())); err != nil { - break - } - } - } - if err == nil { - err = rmdir(path) - } - return err -} - -// RemovePaths iterates over the provided paths removing them. -// We trying to remove all paths five times with increasing delay between tries. -// If after all there are not removed cgroups - appropriate error will be -// returned. -func RemovePaths(paths map[string]string) (err error) { - const retries = 5 - delay := 10 * time.Millisecond - for i := 0; i < retries; i++ { - if i != 0 { - time.Sleep(delay) - delay *= 2 - } - for s, p := range paths { - if err := RemovePath(p); err != nil { - // do not log intermediate iterations - switch i { - case 0: - logrus.WithError(err).Warnf("Failed to remove cgroup (will retry)") - case retries - 1: - logrus.WithError(err).Error("Failed to remove cgroup") - } - } - _, err := os.Stat(p) - // We need this strange way of checking cgroups existence because - // RemoveAll almost always returns error, even on already removed - // cgroups - if os.IsNotExist(err) { - delete(paths, s) - } - } - if len(paths) == 0 { - //nolint:ineffassign,staticcheck // done to help garbage collecting: opencontainers/runc#2506 - paths = make(map[string]string) - return nil - } - } - return fmt.Errorf("Failed to remove paths: %v", paths) -} - -var ( - hugePageSizes []string - initHPSOnce sync.Once -) - -func HugePageSizes() []string { - initHPSOnce.Do(func() { - dir, err := os.OpenFile("/sys/kernel/mm/hugepages", unix.O_DIRECTORY|unix.O_RDONLY, 0) - if err != nil { - return - } - files, err := dir.Readdirnames(0) - dir.Close() - if err != nil { - return - } - - hugePageSizes, err = getHugePageSizeFromFilenames(files) - if err != nil { - logrus.Warn("HugePageSizes: ", err) - } - }) - - return hugePageSizes -} - -func getHugePageSizeFromFilenames(fileNames []string) ([]string, error) { - pageSizes := make([]string, 0, len(fileNames)) - var warn error - - for _, file := range fileNames { - // example: hugepages-1048576kB - val := strings.TrimPrefix(file, "hugepages-") - if len(val) == len(file) { - // Unexpected file name: no prefix found, ignore it. - continue - } - // The suffix is always "kB" (as of Linux 5.13). If we find - // something else, produce an error but keep going. - eLen := len(val) - 2 - val = strings.TrimSuffix(val, "kB") - if len(val) != eLen { - // Highly unlikely. - if warn == nil { - warn = errors.New(file + `: invalid suffix (expected "kB")`) - } - continue - } - size, err := strconv.Atoi(val) - if err != nil { - // Highly unlikely. - if warn == nil { - warn = fmt.Errorf("%s: %w", file, err) - } - continue - } - // Model after https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/mm/hugetlb_cgroup.c?id=eff48ddeab782e35e58ccc8853f7386bbae9dec4#n574 - // but in our case the size is in KB already. - if size >= (1 << 20) { - val = strconv.Itoa(size>>20) + "GB" - } else if size >= (1 << 10) { - val = strconv.Itoa(size>>10) + "MB" - } else { - val += "KB" - } - pageSizes = append(pageSizes, val) - } - - return pageSizes, warn -} - -// GetPids returns all pids, that were added to cgroup at path. -func GetPids(dir string) ([]int, error) { - return readProcsFile(dir) -} - -// WriteCgroupProc writes the specified pid into the cgroup's cgroup.procs file -func WriteCgroupProc(dir string, pid int) error { - // Normally dir should not be empty, one case is that cgroup subsystem - // is not mounted, we will get empty dir, and we want it fail here. - if dir == "" { - return fmt.Errorf("no such directory for %s", CgroupProcesses) - } - - // Dont attach any pid to the cgroup if -1 is specified as a pid - if pid == -1 { - return nil - } - - file, err := OpenFile(dir, CgroupProcesses, os.O_WRONLY) - if err != nil { - return fmt.Errorf("failed to write %v: %w", pid, err) - } - defer file.Close() - - for i := 0; i < 5; i++ { - _, err = file.WriteString(strconv.Itoa(pid)) - if err == nil { - return nil - } - - // EINVAL might mean that the task being added to cgroup.procs is in state - // TASK_NEW. We should attempt to do so again. - if errors.Is(err, unix.EINVAL) { - time.Sleep(30 * time.Millisecond) - continue - } - - return fmt.Errorf("failed to write %v: %w", pid, err) - } - return err -} - -// Since the OCI spec is designed for cgroup v1, in some cases -// there is need to convert from the cgroup v1 configuration to cgroup v2 -// the formula for cpuShares is y = (1 + ((x - 2) * 9999) / 262142) -// convert from [2-262144] to [1-10000] -// 262144 comes from Linux kernel definition "#define MAX_SHARES (1UL << 18)" -func ConvertCPUSharesToCgroupV2Value(cpuShares uint64) uint64 { - if cpuShares == 0 { - return 0 - } - return (1 + ((cpuShares-2)*9999)/262142) -} - -// ConvertMemorySwapToCgroupV2Value converts MemorySwap value from OCI spec -// for use by cgroup v2 drivers. A conversion is needed since Resources.MemorySwap -// is defined as memory+swap combined, while in cgroup v2 swap is a separate value. -func ConvertMemorySwapToCgroupV2Value(memorySwap, memory int64) (int64, error) { - // for compatibility with cgroup1 controller, set swap to unlimited in - // case the memory is set to unlimited, and swap is not explicitly set, - // treating the request as "set both memory and swap to unlimited". - if memory == -1 && memorySwap == 0 { - return -1, nil - } - if memorySwap == -1 || memorySwap == 0 { - // -1 is "max", 0 is "unset", so treat as is - return memorySwap, nil - } - // sanity checks - if memory == 0 || memory == -1 { - return 0, errors.New("unable to set swap limit without memory limit") - } - if memory < 0 { - return 0, fmt.Errorf("invalid memory value: %d", memory) - } - if memorySwap < memory { - return 0, errors.New("memory+swap limit should be >= memory limit") - } - - return memorySwap - memory, nil -} - -// Since the OCI spec is designed for cgroup v1, in some cases -// there is need to convert from the cgroup v1 configuration to cgroup v2 -// the formula for BlkIOWeight to IOWeight is y = (1 + (x - 10) * 9999 / 990) -// convert linearly from [10-1000] to [1-10000] -func ConvertBlkIOToIOWeightValue(blkIoWeight uint16) uint64 { - if blkIoWeight == 0 { - return 0 - } - return 1 + (uint64(blkIoWeight)-10)*9999/990 -} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/v1_utils.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/v1_utils.go deleted file mode 100644 index 47c75f22b..000000000 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/v1_utils.go +++ /dev/null @@ -1,290 +0,0 @@ -package cgroups - -import ( - "errors" - "fmt" - "os" - "path/filepath" - "strings" - "sync" - "syscall" - - securejoin "github.com/cyphar/filepath-securejoin" - "github.com/moby/sys/mountinfo" - "golang.org/x/sys/unix" -) - -// Code in this source file are specific to cgroup v1, -// and must not be used from any cgroup v2 code. - -const ( - CgroupNamePrefix = "name=" - defaultPrefix = "/sys/fs/cgroup" -) - -var ( - errUnified = errors.New("not implemented for cgroup v2 unified hierarchy") - ErrV1NoUnified = errors.New("invalid configuration: cannot use unified on cgroup v1") - - readMountinfoOnce sync.Once - readMountinfoErr error - cgroupMountinfo []*mountinfo.Info -) - -type NotFoundError struct { - Subsystem string -} - -func (e *NotFoundError) Error() string { - return fmt.Sprintf("mountpoint for %s not found", e.Subsystem) -} - -func NewNotFoundError(sub string) error { - return &NotFoundError{ - Subsystem: sub, - } -} - -func IsNotFound(err error) bool { - var nfErr *NotFoundError - return errors.As(err, &nfErr) -} - -func tryDefaultPath(cgroupPath, subsystem string) string { - if !strings.HasPrefix(defaultPrefix, cgroupPath) { - return "" - } - - // remove possible prefix - subsystem = strings.TrimPrefix(subsystem, CgroupNamePrefix) - - // Make sure we're still under defaultPrefix, and resolve - // a possible symlink (like cpu -> cpu,cpuacct). - path, err := securejoin.SecureJoin(defaultPrefix, subsystem) - if err != nil { - return "" - } - - // (1) path should be a directory. - st, err := os.Lstat(path) - if err != nil || !st.IsDir() { - return "" - } - - // (2) path should be a mount point. - pst, err := os.Lstat(filepath.Dir(path)) - if err != nil { - return "" - } - - if st.Sys().(*syscall.Stat_t).Dev == pst.Sys().(*syscall.Stat_t).Dev { - // parent dir has the same dev -- path is not a mount point - return "" - } - - // (3) path should have 'cgroup' fs type. - fst := unix.Statfs_t{} - err = unix.Statfs(path, &fst) - if err != nil || fst.Type != unix.CGROUP_SUPER_MAGIC { - return "" - } - - return path -} - -// readCgroupMountinfo returns a list of cgroup v1 mounts (i.e. the ones -// with fstype of "cgroup") for the current running process. -// -// The results are cached (to avoid re-reading mountinfo which is relatively -// expensive), so it is assumed that cgroup mounts are not being changed. -func readCgroupMountinfo() ([]*mountinfo.Info, error) { - readMountinfoOnce.Do(func() { - cgroupMountinfo, readMountinfoErr = mountinfo.GetMounts( - mountinfo.FSTypeFilter("cgroup"), - ) - }) - - return cgroupMountinfo, readMountinfoErr -} - -// https://www.kernel.org/doc/Documentation/cgroup-v1/cgroups.txt -func FindCgroupMountpoint(cgroupPath, subsystem string) (string, error) { - if IsCgroup2UnifiedMode() { - return "", errUnified - } - - // If subsystem is empty, we look for the cgroupv2 hybrid path. - if len(subsystem) == 0 { - return hybridMountpoint, nil - } - - // Avoid parsing mountinfo by trying the default path first, if possible. - if path := tryDefaultPath(cgroupPath, subsystem); path != "" { - return path, nil - } - - mnt, _, err := FindCgroupMountpointAndRoot(cgroupPath, subsystem) - return mnt, err -} - -func FindCgroupMountpointAndRoot(cgroupPath, subsystem string) (string, string, error) { - if IsCgroup2UnifiedMode() { - return "", "", errUnified - } - - mi, err := readCgroupMountinfo() - if err != nil { - return "", "", err - } - - return findCgroupMountpointAndRootFromMI(mi, cgroupPath, subsystem) -} - -func findCgroupMountpointAndRootFromMI(mounts []*mountinfo.Info, cgroupPath, subsystem string) (string, string, error) { - for _, mi := range mounts { - if strings.HasPrefix(mi.Mountpoint, cgroupPath) { - for _, opt := range strings.Split(mi.VFSOptions, ",") { - if opt == subsystem { - return mi.Mountpoint, mi.Root, nil - } - } - } - } - - return "", "", NewNotFoundError(subsystem) -} - -func (m Mount) GetOwnCgroup(cgroups map[string]string) (string, error) { - if len(m.Subsystems) == 0 { - return "", errors.New("no subsystem for mount") - } - - return getControllerPath(m.Subsystems[0], cgroups) -} - -func getCgroupMountsHelper(ss map[string]bool, mounts []*mountinfo.Info, all bool) ([]Mount, error) { - res := make([]Mount, 0, len(ss)) - numFound := 0 - for _, mi := range mounts { - m := Mount{ - Mountpoint: mi.Mountpoint, - Root: mi.Root, - } - for _, opt := range strings.Split(mi.VFSOptions, ",") { - seen, known := ss[opt] - if !known || (!all && seen) { - continue - } - ss[opt] = true - opt = strings.TrimPrefix(opt, CgroupNamePrefix) - m.Subsystems = append(m.Subsystems, opt) - numFound++ - } - if len(m.Subsystems) > 0 || all { - res = append(res, m) - } - if !all && numFound >= len(ss) { - break - } - } - return res, nil -} - -func getCgroupMountsV1(all bool) ([]Mount, error) { - mi, err := readCgroupMountinfo() - if err != nil { - return nil, err - } - - allSubsystems, err := ParseCgroupFile("/proc/self/cgroup") - if err != nil { - return nil, err - } - - allMap := make(map[string]bool) - for s := range allSubsystems { - allMap[s] = false - } - - return getCgroupMountsHelper(allMap, mi, all) -} - -// GetOwnCgroup returns the relative path to the cgroup docker is running in. -func GetOwnCgroup(subsystem string) (string, error) { - if IsCgroup2UnifiedMode() { - return "", errUnified - } - cgroups, err := ParseCgroupFile("/proc/self/cgroup") - if err != nil { - return "", err - } - - return getControllerPath(subsystem, cgroups) -} - -func GetOwnCgroupPath(subsystem string) (string, error) { - cgroup, err := GetOwnCgroup(subsystem) - if err != nil { - return "", err - } - - // If subsystem is empty, we look for the cgroupv2 hybrid path. - if len(subsystem) == 0 { - return hybridMountpoint, nil - } - - return getCgroupPathHelper(subsystem, cgroup) -} - -func GetInitCgroup(subsystem string) (string, error) { - if IsCgroup2UnifiedMode() { - return "", errUnified - } - cgroups, err := ParseCgroupFile("/proc/1/cgroup") - if err != nil { - return "", err - } - - return getControllerPath(subsystem, cgroups) -} - -func GetInitCgroupPath(subsystem string) (string, error) { - cgroup, err := GetInitCgroup(subsystem) - if err != nil { - return "", err - } - - return getCgroupPathHelper(subsystem, cgroup) -} - -func getCgroupPathHelper(subsystem, cgroup string) (string, error) { - mnt, root, err := FindCgroupMountpointAndRoot("", subsystem) - if err != nil { - return "", err - } - - // This is needed for nested containers, because in /proc/self/cgroup we - // see paths from host, which don't exist in container. - relCgroup, err := filepath.Rel(root, cgroup) - if err != nil { - return "", err - } - - return filepath.Join(mnt, relCgroup), nil -} - -func getControllerPath(subsystem string, cgroups map[string]string) (string, error) { - if IsCgroup2UnifiedMode() { - return "", errUnified - } - - if p, ok := cgroups[subsystem]; ok { - return p, nil - } - - if p, ok := cgroups[CgroupNamePrefix+subsystem]; ok { - return p, nil - } - - return "", NewNotFoundError(subsystem) -} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/blkio_device.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/blkio_device.go deleted file mode 100644 index fa195bf90..000000000 --- a/vendor/github.com/opencontainers/runc/libcontainer/configs/blkio_device.go +++ /dev/null @@ -1,66 +0,0 @@ -package configs - -import "fmt" - -// blockIODevice holds major:minor format supported in blkio cgroup -type blockIODevice struct { - // Major is the device's major number - Major int64 `json:"major"` - // Minor is the device's minor number - Minor int64 `json:"minor"` -} - -// WeightDevice struct holds a `major:minor weight`|`major:minor leaf_weight` pair -type WeightDevice struct { - blockIODevice - // Weight is the bandwidth rate for the device, range is from 10 to 1000 - Weight uint16 `json:"weight"` - // LeafWeight is the bandwidth rate for the device while competing with the cgroup's child cgroups, range is from 10 to 1000, cfq scheduler only - LeafWeight uint16 `json:"leafWeight"` -} - -// NewWeightDevice returns a configured WeightDevice pointer -func NewWeightDevice(major, minor int64, weight, leafWeight uint16) *WeightDevice { - wd := &WeightDevice{} - wd.Major = major - wd.Minor = minor - wd.Weight = weight - wd.LeafWeight = leafWeight - return wd -} - -// WeightString formats the struct to be writable to the cgroup specific file -func (wd *WeightDevice) WeightString() string { - return fmt.Sprintf("%d:%d %d", wd.Major, wd.Minor, wd.Weight) -} - -// LeafWeightString formats the struct to be writable to the cgroup specific file -func (wd *WeightDevice) LeafWeightString() string { - return fmt.Sprintf("%d:%d %d", wd.Major, wd.Minor, wd.LeafWeight) -} - -// ThrottleDevice struct holds a `major:minor rate_per_second` pair -type ThrottleDevice struct { - blockIODevice - // Rate is the IO rate limit per cgroup per device - Rate uint64 `json:"rate"` -} - -// NewThrottleDevice returns a configured ThrottleDevice pointer -func NewThrottleDevice(major, minor int64, rate uint64) *ThrottleDevice { - td := &ThrottleDevice{} - td.Major = major - td.Minor = minor - td.Rate = rate - return td -} - -// String formats the struct to be writable to the cgroup specific file -func (td *ThrottleDevice) String() string { - return fmt.Sprintf("%d:%d %d", td.Major, td.Minor, td.Rate) -} - -// StringName formats the struct to be writable to the cgroup specific file -func (td *ThrottleDevice) StringName(name string) string { - return fmt.Sprintf("%d:%d %s=%d", td.Major, td.Minor, name, td.Rate) -} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_linux.go deleted file mode 100644 index 2d4a89871..000000000 --- a/vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_linux.go +++ /dev/null @@ -1,158 +0,0 @@ -package configs - -import ( - systemdDbus "github.com/coreos/go-systemd/v22/dbus" - "github.com/opencontainers/runc/libcontainer/devices" -) - -type FreezerState string - -const ( - Undefined FreezerState = "" - Frozen FreezerState = "FROZEN" - Thawed FreezerState = "THAWED" -) - -// Cgroup holds properties of a cgroup on Linux. -type Cgroup struct { - // Name specifies the name of the cgroup - Name string `json:"name,omitempty"` - - // Parent specifies the name of parent of cgroup or slice - Parent string `json:"parent,omitempty"` - - // Path specifies the path to cgroups that are created and/or joined by the container. - // The path is assumed to be relative to the host system cgroup mountpoint. - Path string `json:"path"` - - // ScopePrefix describes prefix for the scope name - ScopePrefix string `json:"scope_prefix"` - - // Resources contains various cgroups settings to apply - *Resources - - // Systemd tells if systemd should be used to manage cgroups. - Systemd bool - - // SystemdProps are any additional properties for systemd, - // derived from org.systemd.property.xxx annotations. - // Ignored unless systemd is used for managing cgroups. - SystemdProps []systemdDbus.Property `json:"-"` - - // Rootless tells if rootless cgroups should be used. - Rootless bool - - // The host UID that should own the cgroup, or nil to accept - // the default ownership. This should only be set when the - // cgroupfs is to be mounted read/write. - // Not all cgroup manager implementations support changing - // the ownership. - OwnerUID *int `json:"owner_uid,omitempty"` -} - -type Resources struct { - // Devices is the set of access rules for devices in the container. - Devices []*devices.Rule `json:"devices"` - - // Memory limit (in bytes) - Memory int64 `json:"memory"` - - // Memory reservation or soft_limit (in bytes) - MemoryReservation int64 `json:"memory_reservation"` - - // Total memory usage (memory + swap); set `-1` to enable unlimited swap - MemorySwap int64 `json:"memory_swap"` - - // CPU shares (relative weight vs. other containers) - CpuShares uint64 `json:"cpu_shares"` - - // CPU hardcap limit (in usecs). Allowed cpu time in a given period. - CpuQuota int64 `json:"cpu_quota"` - - // CPU period to be used for hardcapping (in usecs). 0 to use system default. - CpuPeriod uint64 `json:"cpu_period"` - - // How many time CPU will use in realtime scheduling (in usecs). - CpuRtRuntime int64 `json:"cpu_rt_quota"` - - // CPU period to be used for realtime scheduling (in usecs). - CpuRtPeriod uint64 `json:"cpu_rt_period"` - - // CPU to use - CpusetCpus string `json:"cpuset_cpus"` - - // MEM to use - CpusetMems string `json:"cpuset_mems"` - - // Process limit; set <= `0' to disable limit. - PidsLimit int64 `json:"pids_limit"` - - // Specifies per cgroup weight, range is from 10 to 1000. - BlkioWeight uint16 `json:"blkio_weight"` - - // Specifies tasks' weight in the given cgroup while competing with the cgroup's child cgroups, range is from 10 to 1000, cfq scheduler only - BlkioLeafWeight uint16 `json:"blkio_leaf_weight"` - - // Weight per cgroup per device, can override BlkioWeight. - BlkioWeightDevice []*WeightDevice `json:"blkio_weight_device"` - - // IO read rate limit per cgroup per device, bytes per second. - BlkioThrottleReadBpsDevice []*ThrottleDevice `json:"blkio_throttle_read_bps_device"` - - // IO write rate limit per cgroup per device, bytes per second. - BlkioThrottleWriteBpsDevice []*ThrottleDevice `json:"blkio_throttle_write_bps_device"` - - // IO read rate limit per cgroup per device, IO per second. - BlkioThrottleReadIOPSDevice []*ThrottleDevice `json:"blkio_throttle_read_iops_device"` - - // IO write rate limit per cgroup per device, IO per second. - BlkioThrottleWriteIOPSDevice []*ThrottleDevice `json:"blkio_throttle_write_iops_device"` - - // set the freeze value for the process - Freezer FreezerState `json:"freezer"` - - // Hugetlb limit (in bytes) - HugetlbLimit []*HugepageLimit `json:"hugetlb_limit"` - - // Whether to disable OOM Killer - OomKillDisable bool `json:"oom_kill_disable"` - - // Tuning swappiness behaviour per cgroup - MemorySwappiness *uint64 `json:"memory_swappiness"` - - // Set priority of network traffic for container - NetPrioIfpriomap []*IfPrioMap `json:"net_prio_ifpriomap"` - - // Set class identifier for container's network packets - NetClsClassid uint32 `json:"net_cls_classid_u"` - - // Rdma resource restriction configuration - Rdma map[string]LinuxRdma `json:"rdma"` - - // Used on cgroups v2: - - // CpuWeight sets a proportional bandwidth limit. - CpuWeight uint64 `json:"cpu_weight"` - - // Unified is cgroupv2-only key-value map. - Unified map[string]string `json:"unified"` - - // SkipDevices allows to skip configuring device permissions. - // Used by e.g. kubelet while creating a parent cgroup (kubepods) - // common for many containers, and by runc update. - // - // NOTE it is impossible to start a container which has this flag set. - SkipDevices bool `json:"-"` - - // SkipFreezeOnSet is a flag for cgroup manager to skip the cgroup - // freeze when setting resources. Only applicable to systemd legacy - // (i.e. cgroup v1) manager (which uses freeze by default to avoid - // spurious permission errors caused by systemd inability to update - // device rules in a non-disruptive manner). - // - // If not set, a few methods (such as looking into cgroup's - // devices.list and querying the systemd unit properties) are used - // during Set() to figure out whether the freeze is required. Those - // methods may be relatively slow, thus this flag. - SkipFreezeOnSet bool `json:"-"` -} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_unsupported.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_unsupported.go deleted file mode 100644 index 7e383020f..000000000 --- a/vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_unsupported.go +++ /dev/null @@ -1,9 +0,0 @@ -//go:build !linux -// +build !linux - -package configs - -// Cgroup holds properties of a cgroup on Linux -// TODO Windows: This can ultimately be entirely factored out on Windows as -// cgroups are a Unix-specific construct. -type Cgroup struct{} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/config.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/config.go deleted file mode 100644 index c1b4a0041..000000000 --- a/vendor/github.com/opencontainers/runc/libcontainer/configs/config.go +++ /dev/null @@ -1,414 +0,0 @@ -package configs - -import ( - "bytes" - "encoding/json" - "fmt" - "os/exec" - "time" - - "github.com/sirupsen/logrus" - - "github.com/opencontainers/runc/libcontainer/devices" - "github.com/opencontainers/runtime-spec/specs-go" -) - -type Rlimit struct { - Type int `json:"type"` - Hard uint64 `json:"hard"` - Soft uint64 `json:"soft"` -} - -// IDMap represents UID/GID Mappings for User Namespaces. -type IDMap struct { - ContainerID int `json:"container_id"` - HostID int `json:"host_id"` - Size int `json:"size"` -} - -// Seccomp represents syscall restrictions -// By default, only the native architecture of the kernel is allowed to be used -// for syscalls. Additional architectures can be added by specifying them in -// Architectures. -type Seccomp struct { - DefaultAction Action `json:"default_action"` - Architectures []string `json:"architectures"` - Syscalls []*Syscall `json:"syscalls"` - DefaultErrnoRet *uint `json:"default_errno_ret"` - ListenerPath string `json:"listener_path,omitempty"` - ListenerMetadata string `json:"listener_metadata,omitempty"` -} - -// Action is taken upon rule match in Seccomp -type Action int - -const ( - Kill Action = iota + 1 - Errno - Trap - Allow - Trace - Log - Notify - KillThread - KillProcess -) - -// Operator is a comparison operator to be used when matching syscall arguments in Seccomp -type Operator int - -const ( - EqualTo Operator = iota + 1 - NotEqualTo - GreaterThan - GreaterThanOrEqualTo - LessThan - LessThanOrEqualTo - MaskEqualTo -) - -// Arg is a rule to match a specific syscall argument in Seccomp -type Arg struct { - Index uint `json:"index"` - Value uint64 `json:"value"` - ValueTwo uint64 `json:"value_two"` - Op Operator `json:"op"` -} - -// Syscall is a rule to match a syscall in Seccomp -type Syscall struct { - Name string `json:"name"` - Action Action `json:"action"` - ErrnoRet *uint `json:"errnoRet"` - Args []*Arg `json:"args"` -} - -// TODO Windows. Many of these fields should be factored out into those parts -// which are common across platforms, and those which are platform specific. - -// Config defines configuration options for executing a process inside a contained environment. -type Config struct { - // NoPivotRoot will use MS_MOVE and a chroot to jail the process into the container's rootfs - // This is a common option when the container is running in ramdisk - NoPivotRoot bool `json:"no_pivot_root"` - - // ParentDeathSignal specifies the signal that is sent to the container's process in the case - // that the parent process dies. - ParentDeathSignal int `json:"parent_death_signal"` - - // Path to a directory containing the container's root filesystem. - Rootfs string `json:"rootfs"` - - // Umask is the umask to use inside of the container. - Umask *uint32 `json:"umask"` - - // Readonlyfs will remount the container's rootfs as readonly where only externally mounted - // bind mounts are writtable. - Readonlyfs bool `json:"readonlyfs"` - - // Specifies the mount propagation flags to be applied to /. - RootPropagation int `json:"rootPropagation"` - - // Mounts specify additional source and destination paths that will be mounted inside the container's - // rootfs and mount namespace if specified - Mounts []*Mount `json:"mounts"` - - // The device nodes that should be automatically created within the container upon container start. Note, make sure that the node is marked as allowed in the cgroup as well! - Devices []*devices.Device `json:"devices"` - - MountLabel string `json:"mount_label"` - - // Hostname optionally sets the container's hostname if provided - Hostname string `json:"hostname"` - - // Namespaces specifies the container's namespaces that it should setup when cloning the init process - // If a namespace is not provided that namespace is shared from the container's parent process - Namespaces Namespaces `json:"namespaces"` - - // Capabilities specify the capabilities to keep when executing the process inside the container - // All capabilities not specified will be dropped from the processes capability mask - Capabilities *Capabilities `json:"capabilities"` - - // Networks specifies the container's network setup to be created - Networks []*Network `json:"networks"` - - // Routes can be specified to create entries in the route table as the container is started - Routes []*Route `json:"routes"` - - // Cgroups specifies specific cgroup settings for the various subsystems that the container is - // placed into to limit the resources the container has available - Cgroups *Cgroup `json:"cgroups"` - - // AppArmorProfile specifies the profile to apply to the process running in the container and is - // change at the time the process is execed - AppArmorProfile string `json:"apparmor_profile,omitempty"` - - // ProcessLabel specifies the label to apply to the process running in the container. It is - // commonly used by selinux - ProcessLabel string `json:"process_label,omitempty"` - - // Rlimits specifies the resource limits, such as max open files, to set in the container - // If Rlimits are not set, the container will inherit rlimits from the parent process - Rlimits []Rlimit `json:"rlimits,omitempty"` - - // OomScoreAdj specifies the adjustment to be made by the kernel when calculating oom scores - // for a process. Valid values are between the range [-1000, '1000'], where processes with - // higher scores are preferred for being killed. If it is unset then we don't touch the current - // value. - // More information about kernel oom score calculation here: https://lwn.net/Articles/317814/ - OomScoreAdj *int `json:"oom_score_adj,omitempty"` - - // UidMappings is an array of User ID mappings for User Namespaces - UidMappings []IDMap `json:"uid_mappings"` - - // GidMappings is an array of Group ID mappings for User Namespaces - GidMappings []IDMap `json:"gid_mappings"` - - // MaskPaths specifies paths within the container's rootfs to mask over with a bind - // mount pointing to /dev/null as to prevent reads of the file. - MaskPaths []string `json:"mask_paths"` - - // ReadonlyPaths specifies paths within the container's rootfs to remount as read-only - // so that these files prevent any writes. - ReadonlyPaths []string `json:"readonly_paths"` - - // Sysctl is a map of properties and their values. It is the equivalent of using - // sysctl -w my.property.name value in Linux. - Sysctl map[string]string `json:"sysctl"` - - // Seccomp allows actions to be taken whenever a syscall is made within the container. - // A number of rules are given, each having an action to be taken if a syscall matches it. - // A default action to be taken if no rules match is also given. - Seccomp *Seccomp `json:"seccomp"` - - // NoNewPrivileges controls whether processes in the container can gain additional privileges. - NoNewPrivileges bool `json:"no_new_privileges,omitempty"` - - // Hooks are a collection of actions to perform at various container lifecycle events. - // CommandHooks are serialized to JSON, but other hooks are not. - Hooks Hooks - - // Version is the version of opencontainer specification that is supported. - Version string `json:"version"` - - // Labels are user defined metadata that is stored in the config and populated on the state - Labels []string `json:"labels"` - - // NoNewKeyring will not allocated a new session keyring for the container. It will use the - // callers keyring in this case. - NoNewKeyring bool `json:"no_new_keyring"` - - // IntelRdt specifies settings for Intel RDT group that the container is placed into - // to limit the resources (e.g., L3 cache, memory bandwidth) the container has available - IntelRdt *IntelRdt `json:"intel_rdt,omitempty"` - - // RootlessEUID is set when the runc was launched with non-zero EUID. - // Note that RootlessEUID is set to false when launched with EUID=0 in userns. - // When RootlessEUID is set, runc creates a new userns for the container. - // (config.json needs to contain userns settings) - RootlessEUID bool `json:"rootless_euid,omitempty"` - - // RootlessCgroups is set when unlikely to have the full access to cgroups. - // When RootlessCgroups is set, cgroups errors are ignored. - RootlessCgroups bool `json:"rootless_cgroups,omitempty"` -} - -type ( - HookName string - HookList []Hook - Hooks map[HookName]HookList -) - -const ( - // Prestart commands are executed after the container namespaces are created, - // but before the user supplied command is executed from init. - // Note: This hook is now deprecated - // Prestart commands are called in the Runtime namespace. - Prestart HookName = "prestart" - - // CreateRuntime commands MUST be called as part of the create operation after - // the runtime environment has been created but before the pivot_root has been executed. - // CreateRuntime is called immediately after the deprecated Prestart hook. - // CreateRuntime commands are called in the Runtime Namespace. - CreateRuntime HookName = "createRuntime" - - // CreateContainer commands MUST be called as part of the create operation after - // the runtime environment has been created but before the pivot_root has been executed. - // CreateContainer commands are called in the Container namespace. - CreateContainer HookName = "createContainer" - - // StartContainer commands MUST be called as part of the start operation and before - // the container process is started. - // StartContainer commands are called in the Container namespace. - StartContainer HookName = "startContainer" - - // Poststart commands are executed after the container init process starts. - // Poststart commands are called in the Runtime Namespace. - Poststart HookName = "poststart" - - // Poststop commands are executed after the container init process exits. - // Poststop commands are called in the Runtime Namespace. - Poststop HookName = "poststop" -) - -// KnownHookNames returns the known hook names. -// Used by `runc features`. -func KnownHookNames() []string { - return []string{ - string(Prestart), // deprecated - string(CreateRuntime), - string(CreateContainer), - string(StartContainer), - string(Poststart), - string(Poststop), - } -} - -type Capabilities struct { - // Bounding is the set of capabilities checked by the kernel. - Bounding []string - // Effective is the set of capabilities checked by the kernel. - Effective []string - // Inheritable is the capabilities preserved across execve. - Inheritable []string - // Permitted is the limiting superset for effective capabilities. - Permitted []string - // Ambient is the ambient set of capabilities that are kept. - Ambient []string -} - -func (hooks HookList) RunHooks(state *specs.State) error { - for i, h := range hooks { - if err := h.Run(state); err != nil { - return fmt.Errorf("error running hook #%d: %w", i, err) - } - } - - return nil -} - -func (hooks *Hooks) UnmarshalJSON(b []byte) error { - var state map[HookName][]CommandHook - - if err := json.Unmarshal(b, &state); err != nil { - return err - } - - *hooks = Hooks{} - for n, commandHooks := range state { - if len(commandHooks) == 0 { - continue - } - - (*hooks)[n] = HookList{} - for _, h := range commandHooks { - (*hooks)[n] = append((*hooks)[n], h) - } - } - - return nil -} - -func (hooks *Hooks) MarshalJSON() ([]byte, error) { - serialize := func(hooks []Hook) (serializableHooks []CommandHook) { - for _, hook := range hooks { - switch chook := hook.(type) { - case CommandHook: - serializableHooks = append(serializableHooks, chook) - default: - logrus.Warnf("cannot serialize hook of type %T, skipping", hook) - } - } - - return serializableHooks - } - - return json.Marshal(map[string]interface{}{ - "prestart": serialize((*hooks)[Prestart]), - "createRuntime": serialize((*hooks)[CreateRuntime]), - "createContainer": serialize((*hooks)[CreateContainer]), - "startContainer": serialize((*hooks)[StartContainer]), - "poststart": serialize((*hooks)[Poststart]), - "poststop": serialize((*hooks)[Poststop]), - }) -} - -type Hook interface { - // Run executes the hook with the provided state. - Run(*specs.State) error -} - -// NewFunctionHook will call the provided function when the hook is run. -func NewFunctionHook(f func(*specs.State) error) FuncHook { - return FuncHook{ - run: f, - } -} - -type FuncHook struct { - run func(*specs.State) error -} - -func (f FuncHook) Run(s *specs.State) error { - return f.run(s) -} - -type Command struct { - Path string `json:"path"` - Args []string `json:"args"` - Env []string `json:"env"` - Dir string `json:"dir"` - Timeout *time.Duration `json:"timeout"` -} - -// NewCommandHook will execute the provided command when the hook is run. -func NewCommandHook(cmd Command) CommandHook { - return CommandHook{ - Command: cmd, - } -} - -type CommandHook struct { - Command -} - -func (c Command) Run(s *specs.State) error { - b, err := json.Marshal(s) - if err != nil { - return err - } - var stdout, stderr bytes.Buffer - cmd := exec.Cmd{ - Path: c.Path, - Args: c.Args, - Env: c.Env, - Stdin: bytes.NewReader(b), - Stdout: &stdout, - Stderr: &stderr, - } - if err := cmd.Start(); err != nil { - return err - } - errC := make(chan error, 1) - go func() { - err := cmd.Wait() - if err != nil { - err = fmt.Errorf("error running hook: %w, stdout: %s, stderr: %s", err, stdout.String(), stderr.String()) - } - errC <- err - }() - var timerCh <-chan time.Time - if c.Timeout != nil { - timer := time.NewTimer(*c.Timeout) - defer timer.Stop() - timerCh = timer.C - } - select { - case err := <-errC: - return err - case <-timerCh: - _ = cmd.Process.Kill() - <-errC - return fmt.Errorf("hook ran past specified timeout of %.1fs", c.Timeout.Seconds()) - } -} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/config_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/config_linux.go deleted file mode 100644 index 8c02848b7..000000000 --- a/vendor/github.com/opencontainers/runc/libcontainer/configs/config_linux.go +++ /dev/null @@ -1,68 +0,0 @@ -package configs - -import "errors" - -var ( - errNoUIDMap = errors.New("User namespaces enabled, but no uid mappings found.") - errNoUserMap = errors.New("User namespaces enabled, but no user mapping found.") - errNoGIDMap = errors.New("User namespaces enabled, but no gid mappings found.") - errNoGroupMap = errors.New("User namespaces enabled, but no group mapping found.") -) - -// HostUID gets the translated uid for the process on host which could be -// different when user namespaces are enabled. -func (c Config) HostUID(containerId int) (int, error) { - if c.Namespaces.Contains(NEWUSER) { - if c.UidMappings == nil { - return -1, errNoUIDMap - } - id, found := c.hostIDFromMapping(containerId, c.UidMappings) - if !found { - return -1, errNoUserMap - } - return id, nil - } - // Return unchanged id. - return containerId, nil -} - -// HostRootUID gets the root uid for the process on host which could be non-zero -// when user namespaces are enabled. -func (c Config) HostRootUID() (int, error) { - return c.HostUID(0) -} - -// HostGID gets the translated gid for the process on host which could be -// different when user namespaces are enabled. -func (c Config) HostGID(containerId int) (int, error) { - if c.Namespaces.Contains(NEWUSER) { - if c.GidMappings == nil { - return -1, errNoGIDMap - } - id, found := c.hostIDFromMapping(containerId, c.GidMappings) - if !found { - return -1, errNoGroupMap - } - return id, nil - } - // Return unchanged id. - return containerId, nil -} - -// HostRootGID gets the root gid for the process on host which could be non-zero -// when user namespaces are enabled. -func (c Config) HostRootGID() (int, error) { - return c.HostGID(0) -} - -// Utility function that gets a host ID for a container ID from user namespace map -// if that ID is present in the map. -func (c Config) hostIDFromMapping(containerID int, uMap []IDMap) (int, bool) { - for _, m := range uMap { - if (containerID >= m.ContainerID) && (containerID <= (m.ContainerID + m.Size - 1)) { - hostID := m.HostID + (containerID - m.ContainerID) - return hostID, true - } - } - return -1, false -} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/configs_fuzzer.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/configs_fuzzer.go deleted file mode 100644 index bce829e29..000000000 --- a/vendor/github.com/opencontainers/runc/libcontainer/configs/configs_fuzzer.go +++ /dev/null @@ -1,10 +0,0 @@ -//go:build gofuzz -// +build gofuzz - -package configs - -func FuzzUnmarshalJSON(data []byte) int { - hooks := Hooks{} - _ = hooks.UnmarshalJSON(data) - return 1 -} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/hugepage_limit.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/hugepage_limit.go deleted file mode 100644 index d30216380..000000000 --- a/vendor/github.com/opencontainers/runc/libcontainer/configs/hugepage_limit.go +++ /dev/null @@ -1,9 +0,0 @@ -package configs - -type HugepageLimit struct { - // which type of hugepage to limit. - Pagesize string `json:"page_size"` - - // usage limit for hugepage. - Limit uint64 `json:"limit"` -} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/intelrdt.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/intelrdt.go deleted file mode 100644 index f8d951ab8..000000000 --- a/vendor/github.com/opencontainers/runc/libcontainer/configs/intelrdt.go +++ /dev/null @@ -1,16 +0,0 @@ -package configs - -type IntelRdt struct { - // The identity for RDT Class of Service - ClosID string `json:"closID,omitempty"` - - // The schema for L3 cache id and capacity bitmask (CBM) - // Format: "L3:=;=;..." - L3CacheSchema string `json:"l3_cache_schema,omitempty"` - - // The schema of memory bandwidth per L3 cache id - // Format: "MB:=bandwidth0;=bandwidth1;..." - // The unit of memory bandwidth is specified in "percentages" by - // default, and in "MBps" if MBA Software Controller is enabled. - MemBwSchema string `json:"memBwSchema,omitempty"` -} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/interface_priority_map.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/interface_priority_map.go deleted file mode 100644 index 9a0395eaf..000000000 --- a/vendor/github.com/opencontainers/runc/libcontainer/configs/interface_priority_map.go +++ /dev/null @@ -1,14 +0,0 @@ -package configs - -import ( - "fmt" -) - -type IfPrioMap struct { - Interface string `json:"interface"` - Priority int64 `json:"priority"` -} - -func (i *IfPrioMap) CgroupString() string { - return fmt.Sprintf("%s %d", i.Interface, i.Priority) -} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/mount.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/mount.go deleted file mode 100644 index 784c61820..000000000 --- a/vendor/github.com/opencontainers/runc/libcontainer/configs/mount.go +++ /dev/null @@ -1,48 +0,0 @@ -package configs - -import "golang.org/x/sys/unix" - -const ( - // EXT_COPYUP is a directive to copy up the contents of a directory when - // a tmpfs is mounted over it. - EXT_COPYUP = 1 << iota //nolint:golint // ignore "don't use ALL_CAPS" warning -) - -type Mount struct { - // Source path for the mount. - Source string `json:"source"` - - // Destination path for the mount inside the container. - Destination string `json:"destination"` - - // Device the mount is for. - Device string `json:"device"` - - // Mount flags. - Flags int `json:"flags"` - - // Propagation Flags - PropagationFlags []int `json:"propagation_flags"` - - // Mount data applied to the mount. - Data string `json:"data"` - - // Relabel source if set, "z" indicates shared, "Z" indicates unshared. - Relabel string `json:"relabel"` - - // RecAttr represents mount properties to be applied recursively (AT_RECURSIVE), see mount_setattr(2). - RecAttr *unix.MountAttr `json:"rec_attr"` - - // Extensions are additional flags that are specific to runc. - Extensions int `json:"extensions"` - - // Optional Command to be run before Source is mounted. - PremountCmds []Command `json:"premount_cmds"` - - // Optional Command to be run after Source is mounted. - PostmountCmds []Command `json:"postmount_cmds"` -} - -func (m *Mount) IsBind() bool { - return m.Flags&unix.MS_BIND != 0 -} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces.go deleted file mode 100644 index a3329a31a..000000000 --- a/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces.go +++ /dev/null @@ -1,5 +0,0 @@ -package configs - -type NamespaceType string - -type Namespaces []Namespace diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_linux.go deleted file mode 100644 index d52d6fcd1..000000000 --- a/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_linux.go +++ /dev/null @@ -1,126 +0,0 @@ -package configs - -import ( - "fmt" - "os" - "sync" -) - -const ( - NEWNET NamespaceType = "NEWNET" - NEWPID NamespaceType = "NEWPID" - NEWNS NamespaceType = "NEWNS" - NEWUTS NamespaceType = "NEWUTS" - NEWIPC NamespaceType = "NEWIPC" - NEWUSER NamespaceType = "NEWUSER" - NEWCGROUP NamespaceType = "NEWCGROUP" -) - -var ( - nsLock sync.Mutex - supportedNamespaces = make(map[NamespaceType]bool) -) - -// NsName converts the namespace type to its filename -func NsName(ns NamespaceType) string { - switch ns { - case NEWNET: - return "net" - case NEWNS: - return "mnt" - case NEWPID: - return "pid" - case NEWIPC: - return "ipc" - case NEWUSER: - return "user" - case NEWUTS: - return "uts" - case NEWCGROUP: - return "cgroup" - } - return "" -} - -// IsNamespaceSupported returns whether a namespace is available or -// not -func IsNamespaceSupported(ns NamespaceType) bool { - nsLock.Lock() - defer nsLock.Unlock() - supported, ok := supportedNamespaces[ns] - if ok { - return supported - } - nsFile := NsName(ns) - // if the namespace type is unknown, just return false - if nsFile == "" { - return false - } - _, err := os.Stat("/proc/self/ns/" + nsFile) - // a namespace is supported if it exists and we have permissions to read it - supported = err == nil - supportedNamespaces[ns] = supported - return supported -} - -func NamespaceTypes() []NamespaceType { - return []NamespaceType{ - NEWUSER, // Keep user NS always first, don't move it. - NEWIPC, - NEWUTS, - NEWNET, - NEWPID, - NEWNS, - NEWCGROUP, - } -} - -// Namespace defines configuration for each namespace. It specifies an -// alternate path that is able to be joined via setns. -type Namespace struct { - Type NamespaceType `json:"type"` - Path string `json:"path"` -} - -func (n *Namespace) GetPath(pid int) string { - return fmt.Sprintf("/proc/%d/ns/%s", pid, NsName(n.Type)) -} - -func (n *Namespaces) Remove(t NamespaceType) bool { - i := n.index(t) - if i == -1 { - return false - } - *n = append((*n)[:i], (*n)[i+1:]...) - return true -} - -func (n *Namespaces) Add(t NamespaceType, path string) { - i := n.index(t) - if i == -1 { - *n = append(*n, Namespace{Type: t, Path: path}) - return - } - (*n)[i].Path = path -} - -func (n *Namespaces) index(t NamespaceType) int { - for i, ns := range *n { - if ns.Type == t { - return i - } - } - return -1 -} - -func (n *Namespaces) Contains(t NamespaceType) bool { - return n.index(t) != -1 -} - -func (n *Namespaces) PathOf(t NamespaceType) string { - i := n.index(t) - if i == -1 { - return "" - } - return (*n)[i].Path -} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_syscall.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_syscall.go deleted file mode 100644 index 0516dba8d..000000000 --- a/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_syscall.go +++ /dev/null @@ -1,33 +0,0 @@ -//go:build linux -// +build linux - -package configs - -import "golang.org/x/sys/unix" - -func (n *Namespace) Syscall() int { - return namespaceInfo[n.Type] -} - -var namespaceInfo = map[NamespaceType]int{ - NEWNET: unix.CLONE_NEWNET, - NEWNS: unix.CLONE_NEWNS, - NEWUSER: unix.CLONE_NEWUSER, - NEWIPC: unix.CLONE_NEWIPC, - NEWUTS: unix.CLONE_NEWUTS, - NEWPID: unix.CLONE_NEWPID, - NEWCGROUP: unix.CLONE_NEWCGROUP, -} - -// CloneFlags parses the container's Namespaces options to set the correct -// flags on clone, unshare. This function returns flags only for new namespaces. -func (n *Namespaces) CloneFlags() uintptr { - var flag int - for _, v := range *n { - if v.Path != "" { - continue - } - flag |= namespaceInfo[v.Type] - } - return uintptr(flag) -} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_syscall_unsupported.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_syscall_unsupported.go deleted file mode 100644 index fbb0d4907..000000000 --- a/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_syscall_unsupported.go +++ /dev/null @@ -1,14 +0,0 @@ -//go:build !linux && !windows -// +build !linux,!windows - -package configs - -func (n *Namespace) Syscall() int { - panic("No namespace syscall support") -} - -// CloneFlags parses the container's Namespaces options to set the correct -// flags on clone, unshare. This function returns flags only for new namespaces. -func (n *Namespaces) CloneFlags() uintptr { - panic("No namespace syscall support") -} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_unsupported.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_unsupported.go deleted file mode 100644 index 946db30a5..000000000 --- a/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_unsupported.go +++ /dev/null @@ -1,8 +0,0 @@ -//go:build !linux -// +build !linux - -package configs - -// Namespace defines configuration for each namespace. It specifies an -// alternate path that is able to be joined via setns. -type Namespace struct{} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/network.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/network.go deleted file mode 100644 index c44c3ea71..000000000 --- a/vendor/github.com/opencontainers/runc/libcontainer/configs/network.go +++ /dev/null @@ -1,75 +0,0 @@ -package configs - -// Network defines configuration for a container's networking stack -// -// The network configuration can be omitted from a container causing the -// container to be setup with the host's networking stack -type Network struct { - // Type sets the networks type, commonly veth and loopback - Type string `json:"type"` - - // Name of the network interface - Name string `json:"name"` - - // The bridge to use. - Bridge string `json:"bridge"` - - // MacAddress contains the MAC address to set on the network interface - MacAddress string `json:"mac_address"` - - // Address contains the IPv4 and mask to set on the network interface - Address string `json:"address"` - - // Gateway sets the gateway address that is used as the default for the interface - Gateway string `json:"gateway"` - - // IPv6Address contains the IPv6 and mask to set on the network interface - IPv6Address string `json:"ipv6_address"` - - // IPv6Gateway sets the ipv6 gateway address that is used as the default for the interface - IPv6Gateway string `json:"ipv6_gateway"` - - // Mtu sets the mtu value for the interface and will be mirrored on both the host and - // container's interfaces if a pair is created, specifically in the case of type veth - // Note: This does not apply to loopback interfaces. - Mtu int `json:"mtu"` - - // TxQueueLen sets the tx_queuelen value for the interface and will be mirrored on both the host and - // container's interfaces if a pair is created, specifically in the case of type veth - // Note: This does not apply to loopback interfaces. - TxQueueLen int `json:"txqueuelen"` - - // HostInterfaceName is a unique name of a veth pair that resides on in the host interface of the - // container. - HostInterfaceName string `json:"host_interface_name"` - - // HairpinMode specifies if hairpin NAT should be enabled on the virtual interface - // bridge port in the case of type veth - // Note: This is unsupported on some systems. - // Note: This does not apply to loopback interfaces. - HairpinMode bool `json:"hairpin_mode"` -} - -// Route defines a routing table entry. -// -// Routes can be specified to create entries in the routing table as the container -// is started. -// -// All of destination, source, and gateway should be either IPv4 or IPv6. -// One of the three options must be present, and omitted entries will use their -// IP family default for the route table. For IPv4 for example, setting the -// gateway to 1.2.3.4 and the interface to eth0 will set up a standard -// destination of 0.0.0.0(or *) when viewed in the route table. -type Route struct { - // Destination specifies the destination IP address and mask in the CIDR form. - Destination string `json:"destination"` - - // Source specifies the source IP address and mask in the CIDR form. - Source string `json:"source"` - - // Gateway specifies the gateway IP address. - Gateway string `json:"gateway"` - - // InterfaceName specifies the device to set this route up for, for example eth0. - InterfaceName string `json:"interface_name"` -} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/rdma.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/rdma.go deleted file mode 100644 index c69f2c802..000000000 --- a/vendor/github.com/opencontainers/runc/libcontainer/configs/rdma.go +++ /dev/null @@ -1,9 +0,0 @@ -package configs - -// LinuxRdma for Linux cgroup 'rdma' resource management (Linux 4.11) -type LinuxRdma struct { - // Maximum number of HCA handles that can be opened. Default is "no limit". - HcaHandles *uint32 `json:"hca_handles,omitempty"` - // Maximum number of HCA objects that can be created. Default is "no limit". - HcaObjects *uint32 `json:"hca_objects,omitempty"` -} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/userns/userns.go b/vendor/github.com/opencontainers/runc/libcontainer/userns/userns.go deleted file mode 100644 index f6cb98e5e..000000000 --- a/vendor/github.com/opencontainers/runc/libcontainer/userns/userns.go +++ /dev/null @@ -1,5 +0,0 @@ -package userns - -// RunningInUserNS detects whether we are currently running in a user namespace. -// Originally copied from github.com/lxc/lxd/shared/util.go -var RunningInUserNS = runningInUserNS diff --git a/vendor/github.com/opencontainers/runc/libcontainer/userns/userns_fuzzer.go b/vendor/github.com/opencontainers/runc/libcontainer/userns/userns_fuzzer.go deleted file mode 100644 index 1e00ab8b5..000000000 --- a/vendor/github.com/opencontainers/runc/libcontainer/userns/userns_fuzzer.go +++ /dev/null @@ -1,16 +0,0 @@ -//go:build gofuzz -// +build gofuzz - -package userns - -import ( - "strings" - - "github.com/opencontainers/runc/libcontainer/user" -) - -func FuzzUIDMap(data []byte) int { - uidmap, _ := user.ParseIDMap(strings.NewReader(string(data))) - _ = uidMapInUserNS(uidmap) - return 1 -} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/userns/userns_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/userns/userns_linux.go deleted file mode 100644 index 724e6df01..000000000 --- a/vendor/github.com/opencontainers/runc/libcontainer/userns/userns_linux.go +++ /dev/null @@ -1,37 +0,0 @@ -package userns - -import ( - "sync" - - "github.com/opencontainers/runc/libcontainer/user" -) - -var ( - inUserNS bool - nsOnce sync.Once -) - -// runningInUserNS detects whether we are currently running in a user namespace. -// Originally copied from github.com/lxc/lxd/shared/util.go -func runningInUserNS() bool { - nsOnce.Do(func() { - uidmap, err := user.CurrentProcessUIDMap() - if err != nil { - // This kernel-provided file only exists if user namespaces are supported - return - } - inUserNS = uidMapInUserNS(uidmap) - }) - return inUserNS -} - -func uidMapInUserNS(uidmap []user.IDMap) bool { - /* - * We assume we are in the initial user namespace if we have a full - * range - 4294967295 uids starting at uid 0. - */ - if len(uidmap) == 1 && uidmap[0].ID == 0 && uidmap[0].ParentID == 0 && uidmap[0].Count == 4294967295 { - return false - } - return true -} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/userns/userns_unsupported.go b/vendor/github.com/opencontainers/runc/libcontainer/userns/userns_unsupported.go deleted file mode 100644 index f35c13a10..000000000 --- a/vendor/github.com/opencontainers/runc/libcontainer/userns/userns_unsupported.go +++ /dev/null @@ -1,18 +0,0 @@ -//go:build !linux -// +build !linux - -package userns - -import "github.com/opencontainers/runc/libcontainer/user" - -// runningInUserNS is a stub for non-Linux systems -// Always returns false -func runningInUserNS() bool { - return false -} - -// uidMapInUserNS is a stub for non-Linux systems -// Always returns false -func uidMapInUserNS(uidmap []user.IDMap) bool { - return false -} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/utils/cmsg.go b/vendor/github.com/opencontainers/runc/libcontainer/utils/cmsg.go deleted file mode 100644 index 7ef9da21f..000000000 --- a/vendor/github.com/opencontainers/runc/libcontainer/utils/cmsg.go +++ /dev/null @@ -1,96 +0,0 @@ -package utils - -/* - * Copyright 2016, 2017 SUSE LLC - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import ( - "fmt" - "os" - - "golang.org/x/sys/unix" -) - -// MaxSendfdLen is the maximum length of the name of a file descriptor being -// sent using SendFd. The name of the file handle returned by RecvFd will never -// be larger than this value. -const MaxNameLen = 4096 - -// oobSpace is the size of the oob slice required to store a single FD. Note -// that unix.UnixRights appears to make the assumption that fd is always int32, -// so sizeof(fd) = 4. -var oobSpace = unix.CmsgSpace(4) - -// RecvFd waits for a file descriptor to be sent over the given AF_UNIX -// socket. The file name of the remote file descriptor will be recreated -// locally (it is sent as non-auxiliary data in the same payload). -func RecvFd(socket *os.File) (*os.File, error) { - // For some reason, unix.Recvmsg uses the length rather than the capacity - // when passing the msg_controllen and other attributes to recvmsg. So we - // have to actually set the length. - name := make([]byte, MaxNameLen) - oob := make([]byte, oobSpace) - - sockfd := socket.Fd() - n, oobn, _, _, err := unix.Recvmsg(int(sockfd), name, oob, 0) - if err != nil { - return nil, err - } - - if n >= MaxNameLen || oobn != oobSpace { - return nil, fmt.Errorf("recvfd: incorrect number of bytes read (n=%d oobn=%d)", n, oobn) - } - - // Truncate. - name = name[:n] - oob = oob[:oobn] - - scms, err := unix.ParseSocketControlMessage(oob) - if err != nil { - return nil, err - } - if len(scms) != 1 { - return nil, fmt.Errorf("recvfd: number of SCMs is not 1: %d", len(scms)) - } - scm := scms[0] - - fds, err := unix.ParseUnixRights(&scm) - if err != nil { - return nil, err - } - if len(fds) != 1 { - return nil, fmt.Errorf("recvfd: number of fds is not 1: %d", len(fds)) - } - fd := uintptr(fds[0]) - - return os.NewFile(fd, string(name)), nil -} - -// SendFd sends a file descriptor over the given AF_UNIX socket. In -// addition, the file.Name() of the given file will also be sent as -// non-auxiliary data in the same payload (allowing to send contextual -// information for a file descriptor). -func SendFd(socket *os.File, name string, fd uintptr) error { - if len(name) >= MaxNameLen { - return fmt.Errorf("sendfd: filename too long: %s", name) - } - return SendFds(socket, []byte(name), int(fd)) -} - -// SendFds sends a list of files descriptor and msg over the given AF_UNIX socket. -func SendFds(socket *os.File, msg []byte, fds ...int) error { - oob := unix.UnixRights(fds...) - return unix.Sendmsg(int(socket.Fd()), msg, oob, nil, 0) -} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/utils/utils.go b/vendor/github.com/opencontainers/runc/libcontainer/utils/utils.go deleted file mode 100644 index 6b9fc3435..000000000 --- a/vendor/github.com/opencontainers/runc/libcontainer/utils/utils.go +++ /dev/null @@ -1,167 +0,0 @@ -package utils - -import ( - "encoding/binary" - "encoding/json" - "fmt" - "io" - "os" - "path/filepath" - "strconv" - "strings" - "unsafe" - - securejoin "github.com/cyphar/filepath-securejoin" - "golang.org/x/sys/unix" -) - -const ( - exitSignalOffset = 128 -) - -// NativeEndian is the native byte order of the host system. -var NativeEndian binary.ByteOrder - -func init() { - // Copied from . - i := uint32(1) - b := (*[4]byte)(unsafe.Pointer(&i)) - if b[0] == 1 { - NativeEndian = binary.LittleEndian - } else { - NativeEndian = binary.BigEndian - } -} - -// ExitStatus returns the correct exit status for a process based on if it -// was signaled or exited cleanly -func ExitStatus(status unix.WaitStatus) int { - if status.Signaled() { - return exitSignalOffset + int(status.Signal()) - } - return status.ExitStatus() -} - -// WriteJSON writes the provided struct v to w using standard json marshaling -func WriteJSON(w io.Writer, v interface{}) error { - data, err := json.Marshal(v) - if err != nil { - return err - } - _, err = w.Write(data) - return err -} - -// CleanPath makes a path safe for use with filepath.Join. This is done by not -// only cleaning the path, but also (if the path is relative) adding a leading -// '/' and cleaning it (then removing the leading '/'). This ensures that a -// path resulting from prepending another path will always resolve to lexically -// be a subdirectory of the prefixed path. This is all done lexically, so paths -// that include symlinks won't be safe as a result of using CleanPath. -func CleanPath(path string) string { - // Deal with empty strings nicely. - if path == "" { - return "" - } - - // Ensure that all paths are cleaned (especially problematic ones like - // "/../../../../../" which can cause lots of issues). - path = filepath.Clean(path) - - // If the path isn't absolute, we need to do more processing to fix paths - // such as "../../../..//some/path". We also shouldn't convert absolute - // paths to relative ones. - if !filepath.IsAbs(path) { - path = filepath.Clean(string(os.PathSeparator) + path) - // This can't fail, as (by definition) all paths are relative to root. - path, _ = filepath.Rel(string(os.PathSeparator), path) - } - - // Clean the path again for good measure. - return filepath.Clean(path) -} - -// stripRoot returns the passed path, stripping the root path if it was -// (lexicially) inside it. Note that both passed paths will always be treated -// as absolute, and the returned path will also always be absolute. In -// addition, the paths are cleaned before stripping the root. -func stripRoot(root, path string) string { - // Make the paths clean and absolute. - root, path = CleanPath("/"+root), CleanPath("/"+path) - switch { - case path == root: - path = "/" - case root == "/": - // do nothing - case strings.HasPrefix(path, root+"/"): - path = strings.TrimPrefix(path, root+"/") - } - return CleanPath("/" + path) -} - -// WithProcfd runs the passed closure with a procfd path (/proc/self/fd/...) -// corresponding to the unsafePath resolved within the root. Before passing the -// fd, this path is verified to have been inside the root -- so operating on it -// through the passed fdpath should be safe. Do not access this path through -// the original path strings, and do not attempt to use the pathname outside of -// the passed closure (the file handle will be freed once the closure returns). -func WithProcfd(root, unsafePath string, fn func(procfd string) error) error { - // Remove the root then forcefully resolve inside the root. - unsafePath = stripRoot(root, unsafePath) - path, err := securejoin.SecureJoin(root, unsafePath) - if err != nil { - return fmt.Errorf("resolving path inside rootfs failed: %w", err) - } - - // Open the target path. - fh, err := os.OpenFile(path, unix.O_PATH|unix.O_CLOEXEC, 0) - if err != nil { - return fmt.Errorf("open o_path procfd: %w", err) - } - defer fh.Close() - - // Double-check the path is the one we expected. - procfd := "/proc/self/fd/" + strconv.Itoa(int(fh.Fd())) - if realpath, err := os.Readlink(procfd); err != nil { - return fmt.Errorf("procfd verification failed: %w", err) - } else if realpath != path { - return fmt.Errorf("possibly malicious path detected -- refusing to operate on %s", realpath) - } - - // Run the closure. - return fn(procfd) -} - -// SearchLabels searches a list of key-value pairs for the provided key and -// returns the corresponding value. The pairs must be separated with '='. -func SearchLabels(labels []string, query string) string { - for _, l := range labels { - parts := strings.SplitN(l, "=", 2) - if len(parts) < 2 { - continue - } - if parts[0] == query { - return parts[1] - } - } - return "" -} - -// Annotations returns the bundle path and user defined annotations from the -// libcontainer state. We need to remove the bundle because that is a label -// added by libcontainer. -func Annotations(labels []string) (bundle string, userAnnotations map[string]string) { - userAnnotations = make(map[string]string) - for _, l := range labels { - parts := strings.SplitN(l, "=", 2) - if len(parts) < 2 { - continue - } - if parts[0] == "bundle" { - bundle = parts[1] - } else { - userAnnotations[parts[0]] = parts[1] - } - } - return -} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/utils/utils_unix.go b/vendor/github.com/opencontainers/runc/libcontainer/utils/utils_unix.go deleted file mode 100644 index 220d0b439..000000000 --- a/vendor/github.com/opencontainers/runc/libcontainer/utils/utils_unix.go +++ /dev/null @@ -1,69 +0,0 @@ -//go:build !windows -// +build !windows - -package utils - -import ( - "fmt" - "os" - "strconv" - - "golang.org/x/sys/unix" -) - -// EnsureProcHandle returns whether or not the given file handle is on procfs. -func EnsureProcHandle(fh *os.File) error { - var buf unix.Statfs_t - if err := unix.Fstatfs(int(fh.Fd()), &buf); err != nil { - return fmt.Errorf("ensure %s is on procfs: %w", fh.Name(), err) - } - if buf.Type != unix.PROC_SUPER_MAGIC { - return fmt.Errorf("%s is not on procfs", fh.Name()) - } - return nil -} - -// CloseExecFrom applies O_CLOEXEC to all file descriptors currently open for -// the process (except for those below the given fd value). -func CloseExecFrom(minFd int) error { - fdDir, err := os.Open("/proc/self/fd") - if err != nil { - return err - } - defer fdDir.Close() - - if err := EnsureProcHandle(fdDir); err != nil { - return err - } - - fdList, err := fdDir.Readdirnames(-1) - if err != nil { - return err - } - for _, fdStr := range fdList { - fd, err := strconv.Atoi(fdStr) - // Ignore non-numeric file names. - if err != nil { - continue - } - // Ignore descriptors lower than our specified minimum. - if fd < minFd { - continue - } - // Intentionally ignore errors from unix.CloseOnExec -- the cases where - // this might fail are basically file descriptors that have already - // been closed (including and especially the one that was created when - // os.ReadDir did the "opendir" syscall). - unix.CloseOnExec(fd) - } - return nil -} - -// NewSockPair returns a new unix socket pair -func NewSockPair(name string) (parent *os.File, child *os.File, err error) { - fds, err := unix.Socketpair(unix.AF_LOCAL, unix.SOCK_STREAM|unix.SOCK_CLOEXEC, 0) - if err != nil { - return nil, nil, err - } - return os.NewFile(uintptr(fds[1]), name+"-p"), os.NewFile(uintptr(fds[0]), name+"-c"), nil -} diff --git a/vendor/modules.txt b/vendor/modules.txt index cc8ea14b8..ecff62193 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -190,9 +190,6 @@ github.com/coreos/go-systemd/v22/dbus # github.com/cpuguy83/go-md2man/v2 v2.0.2 ## explicit; go 1.11 github.com/cpuguy83/go-md2man/v2/md2man -# github.com/cyphar/filepath-securejoin v0.2.3 -## explicit; go 1.13 -github.com/cyphar/filepath-securejoin # github.com/davecgh/go-spew v1.1.1 ## explicit github.com/davecgh/go-spew/spew @@ -342,19 +339,8 @@ github.com/opencontainers/image-spec/specs-go github.com/opencontainers/image-spec/specs-go/v1 # github.com/opencontainers/runc v1.1.9 ## explicit; go 1.17 -github.com/opencontainers/runc/libcontainer/cgroups -github.com/opencontainers/runc/libcontainer/cgroups/devices -github.com/opencontainers/runc/libcontainer/cgroups/ebpf -github.com/opencontainers/runc/libcontainer/cgroups/ebpf/devicefilter -github.com/opencontainers/runc/libcontainer/cgroups/fs -github.com/opencontainers/runc/libcontainer/cgroups/fs2 -github.com/opencontainers/runc/libcontainer/cgroups/fscommon -github.com/opencontainers/runc/libcontainer/cgroups/systemd -github.com/opencontainers/runc/libcontainer/configs github.com/opencontainers/runc/libcontainer/devices github.com/opencontainers/runc/libcontainer/user -github.com/opencontainers/runc/libcontainer/userns -github.com/opencontainers/runc/libcontainer/utils # github.com/opencontainers/runtime-spec v1.1.0 ## explicit github.com/opencontainers/runtime-spec/specs-go