vendor: update google/cadvisor and opencontainers/runc

Signed-off-by: Giuseppe Scrivano <gscrivan@redhat.com>
2020-06-24 10:56:34 +02:00
parent 78d295d168
commit a6a3bf2eb4
632 changed files with 36493 additions and 89280 deletions
--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/BUILD
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/BUILD
@@ -7,6 +7,7 @@ go_library(
        "cgroups_unsupported.go",
        "stats.go",
        "utils.go",
+        "v1_utils.go",
    ],
    importmap = "k8s.io/kubernetes/vendor/github.com/opencontainers/runc/libcontainer/cgroups",
    importpath = "github.com/opencontainers/runc/libcontainer/cgroups",
@@ -37,6 +38,7 @@ filegroup(
    name = "all-srcs",
    srcs = [
        ":package-srcs",
+        "//vendor/github.com/opencontainers/runc/libcontainer/cgroups/devices:all-srcs",
        "//vendor/github.com/opencontainers/runc/libcontainer/cgroups/ebpf:all-srcs",
        "//vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs:all-srcs",
        "//vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2:all-srcs",
--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/cgroups.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/cgroups.go
@@ -3,8 +3,6 @@
 package cgroups

 import (
-	"fmt"
-
 	"github.com/opencontainers/runc/libcontainer/configs"
 )

@@ -27,48 +25,27 @@ type Manager interface {
 	// Destroys the cgroup set
 	Destroy() error

-	// The option func SystemdCgroups() and Cgroupfs() require following attributes:
-	// 	Paths   map[string]string
-	// 	Cgroups *configs.Cgroup
-	// Paths maps cgroup subsystem to path at which it is mounted.
-	// Cgroups specifies specific cgroup settings for the various subsystems
-
-	// Returns cgroup paths to save in a state file and to be able to
-	// restore the object later.
-	GetPaths() map[string]string
-
-	// GetUnifiedPath returns the unified path when running in unified mode.
-	// The value corresponds to the all values of GetPaths() map.
-	//
-	// GetUnifiedPath returns error when running in hybrid mode as well as
-	// in legacy mode.
-	GetUnifiedPath() (string, error)
+	// Path returns a cgroup path to the specified controller/subsystem.
+	// For cgroupv2, the argument is unused and can be empty.
+	Path(string) string

 	// Sets the cgroup as configured.
 	Set(container *configs.Config) error

-	// Gets the cgroup as configured.
+	// GetPaths returns cgroup path(s) to save in a state file in order to restore later.
+	//
+	// For cgroup v1, a key is cgroup subsystem name, and the value is the path
+	// to the cgroup for this subsystem.
+	//
+	// For cgroup v2 unified hierarchy, a key is "", and the value is the unified path.
+	GetPaths() map[string]string
+
+	// GetCgroups returns the cgroup data as configured.
 	GetCgroups() (*configs.Cgroup, error)
-}

-type NotFoundError struct {
-	Subsystem string
-}
+	// GetFreezerState retrieves the current FreezerState of the cgroup.
+	GetFreezerState() (configs.FreezerState, error)

-func (e *NotFoundError) Error() string {
-	return fmt.Sprintf("mountpoint for %s not found", e.Subsystem)
-}
-
-func NewNotFoundError(sub string) error {
-	return &NotFoundError{
-		Subsystem: sub,
-	}
-}
-
-func IsNotFound(err error) bool {
-	if err == nil {
-		return false
-	}
-	_, ok := err.(*NotFoundError)
-	return ok
+	// Exists reports whether the cgroup path exists.
+	Exists() bool
 }
--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/devices/BUILD
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/devices/BUILD
@@ -0,0 +1,34 @@
+load("@io_bazel_rules_go//go:def.bzl", "go_library")
+
+go_library(
+    name = "go_default_library",
+    srcs = ["devices_emulator.go"],
+    importmap = "k8s.io/kubernetes/vendor/github.com/opencontainers/runc/libcontainer/cgroups/devices",
+    importpath = "github.com/opencontainers/runc/libcontainer/cgroups/devices",
+    visibility = ["//visibility:public"],
+    deps = select({
+        "@io_bazel_rules_go//go/platform:android": [
+            "//vendor/github.com/opencontainers/runc/libcontainer/configs:go_default_library",
+            "//vendor/github.com/pkg/errors:go_default_library",
+        ],
+        "@io_bazel_rules_go//go/platform:linux": [
+            "//vendor/github.com/opencontainers/runc/libcontainer/configs:go_default_library",
+            "//vendor/github.com/pkg/errors:go_default_library",
+        ],
+        "//conditions:default": [],
+    }),
+)
+
+filegroup(
+    name = "package-srcs",
+    srcs = glob(["**"]),
+    tags = ["automanaged"],
+    visibility = ["//visibility:private"],
+)
+
+filegroup(
+    name = "all-srcs",
+    srcs = [":package-srcs"],
+    tags = ["automanaged"],
+    visibility = ["//visibility:public"],
+)
--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/devices/devices_emulator.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/devices/devices_emulator.go
@@ -0,0 +1,373 @@
+// +build linux
+
+// SPDX-License-Identifier: Apache-2.0
+/*
+ * Copyright (C) 2020 Aleksa Sarai <cyphar@cyphar.com>
+ * Copyright (C) 2020 SUSE LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package devices
+
+import (
+	"bufio"
+	"io"
+	"regexp"
+	"sort"
+	"strconv"
+
+	"github.com/opencontainers/runc/libcontainer/configs"
+
+	"github.com/pkg/errors"
+)
+
+// deviceMeta is a DeviceRule without the Allow or Permissions fields, and no
+// wildcard-type support. It's effectively the "match" portion of a metadata
+// rule, for the purposes of our emulation.
+type deviceMeta struct {
+	node  configs.DeviceType
+	major int64
+	minor int64
+}
+
+// deviceRule is effectively the tuple (deviceMeta, DevicePermissions).
+type deviceRule struct {
+	meta  deviceMeta
+	perms configs.DevicePermissions
+}
+
+// deviceRules is a mapping of device metadata rules to the associated
+// permissions in the ruleset.
+type deviceRules map[deviceMeta]configs.DevicePermissions
+
+func (r deviceRules) orderedEntries() []deviceRule {
+	var rules []deviceRule
+	for meta, perms := range r {
+		rules = append(rules, deviceRule{meta: meta, perms: perms})
+	}
+	sort.Slice(rules, func(i, j int) bool {
+		// Sort by (major, minor, type).
+		a, b := rules[i].meta, rules[j].meta
+		return a.major < b.major ||
+			(a.major == b.major && a.minor < b.minor) ||
+			(a.major == b.major && a.minor == b.minor && a.node < b.node)
+	})
+	return rules
+}
+
+type Emulator struct {
+	defaultAllow bool
+	rules        deviceRules
+}
+
+func (e *Emulator) IsBlacklist() bool {
+	return e.defaultAllow
+}
+
+func (e *Emulator) IsAllowAll() bool {
+	return e.IsBlacklist() && len(e.rules) == 0
+}
+
+var devicesListRegexp = regexp.MustCompile(`^([abc])\s+(\d+|\*):(\d+|\*)\s+([rwm]+)$`)
+
+func parseLine(line string) (*deviceRule, error) {
+	matches := devicesListRegexp.FindStringSubmatch(line)
+	if matches == nil {
+		return nil, errors.Errorf("line doesn't match devices.list format")
+	}
+	var (
+		rule  deviceRule
+		node  = matches[1]
+		major = matches[2]
+		minor = matches[3]
+		perms = matches[4]
+	)
+
+	// Parse the node type.
+	switch node {
+	case "a":
+		// Super-special case -- "a" always means every device with every
+		// access mode. In fact, for devices.list this actually indicates that
+		// the cgroup is in black-list mode.
+		// TODO: Double-check that the entire file is "a *:* rwm".
+		return nil, nil
+	case "b":
+		rule.meta.node = configs.BlockDevice
+	case "c":
+		rule.meta.node = configs.CharDevice
+	default:
+		// Should never happen!
+		return nil, errors.Errorf("unknown device type %q", node)
+	}
+
+	// Parse the major number.
+	if major == "*" {
+		rule.meta.major = configs.Wildcard
+	} else {
+		val, err := strconv.ParseUint(major, 10, 32)
+		if err != nil {
+			return nil, errors.Wrap(err, "parse major number")
+		}
+		rule.meta.major = int64(val)
+	}
+
+	// Parse the minor number.
+	if minor == "*" {
+		rule.meta.minor = configs.Wildcard
+	} else {
+		val, err := strconv.ParseUint(minor, 10, 32)
+		if err != nil {
+			return nil, errors.Wrap(err, "parse minor number")
+		}
+		rule.meta.minor = int64(val)
+	}
+
+	// Parse the access permissions.
+	rule.perms = configs.DevicePermissions(perms)
+	if !rule.perms.IsValid() || rule.perms.IsEmpty() {
+		// Should never happen!
+		return nil, errors.Errorf("parse access mode: contained unknown modes or is empty: %q", perms)
+	}
+	return &rule, nil
+}
+
+func (e *Emulator) addRule(rule deviceRule) error {
+	if e.rules == nil {
+		e.rules = make(map[deviceMeta]configs.DevicePermissions)
+	}
+
+	// Merge with any pre-existing permissions.
+	oldPerms := e.rules[rule.meta]
+	newPerms := rule.perms.Union(oldPerms)
+	e.rules[rule.meta] = newPerms
+	return nil
+}
+
+func (e *Emulator) rmRule(rule deviceRule) error {
+	// Give an error if any of the permissions requested to be removed are
+	// present in a partially-matching wildcard rule, because such rules will
+	// be ignored by cgroupv1.
+	//
+	// This is a divergence from cgroupv1, but is necessary to avoid leading
+	// users into a false sense of security. cgroupv1 will silently(!) ignore
+	// requests to remove partial exceptions, but we really shouldn't do that.
+	//
+	// It may seem like we could just "split" wildcard rules which hit this
+	// issue, but unfortunately there are 2^32 possible major and minor
+	// numbers, which would exhaust kernel memory quickly if we did this. Not
+	// to mention it'd be really slow (the kernel side is implemented as a
+	// linked-list of exceptions).
+	for _, partialMeta := range []deviceMeta{
+		{node: rule.meta.node, major: configs.Wildcard, minor: rule.meta.minor},
+		{node: rule.meta.node, major: rule.meta.major, minor: configs.Wildcard},
+		{node: rule.meta.node, major: configs.Wildcard, minor: configs.Wildcard},
+	} {
+		// This wildcard rule is equivalent to the requested rule, so skip it.
+		if rule.meta == partialMeta {
+			continue
+		}
+		// Only give an error if the set of permissions overlap.
+		partialPerms := e.rules[partialMeta]
+		if !partialPerms.Intersection(rule.perms).IsEmpty() {
+			return errors.Errorf("requested rule [%v %v] not supported by devices cgroupv1 (cannot punch hole in existing wildcard rule [%v %v])", rule.meta, rule.perms, partialMeta, partialPerms)
+		}
+	}
+
+	// Subtract all of the permissions listed from the full match rule. If the
+	// rule didn't exist, all of this is a no-op.
+	newPerms := e.rules[rule.meta].Difference(rule.perms)
+	if newPerms.IsEmpty() {
+		delete(e.rules, rule.meta)
+	} else {
+		e.rules[rule.meta] = newPerms
+	}
+	// TODO: The actual cgroup code doesn't care if an exception didn't exist
+	//       during removal, so not erroring out here is /accurate/ but quite
+	//       worrying. Maybe we should do additional validation, but again we
+	//       have to worry about backwards-compatibility.
+	return nil
+}
+
+func (e *Emulator) allow(rule *deviceRule) error {
+	// This cgroup is configured as a black-list. Reset the entire emulator,
+	// and put it into black-list mode.
+	if rule == nil || rule.meta.node == configs.WildcardDevice {
+		*e = Emulator{
+			defaultAllow: true,
+			rules:        nil,
+		}
+		return nil
+	}
+
+	var err error
+	if e.defaultAllow {
+		err = errors.Wrap(e.rmRule(*rule), "remove 'deny' exception")
+	} else {
+		err = errors.Wrap(e.addRule(*rule), "add 'allow' exception")
+	}
+	return err
+}
+
+func (e *Emulator) deny(rule *deviceRule) error {
+	// This cgroup is configured as a white-list. Reset the entire emulator,
+	// and put it into white-list mode.
+	if rule == nil || rule.meta.node == configs.WildcardDevice {
+		*e = Emulator{
+			defaultAllow: false,
+			rules:        nil,
+		}
+		return nil
+	}
+
+	var err error
+	if e.defaultAllow {
+		err = errors.Wrap(e.addRule(*rule), "add 'deny' exception")
+	} else {
+		err = errors.Wrap(e.rmRule(*rule), "remove 'allow' exception")
+	}
+	return err
+}
+
+func (e *Emulator) Apply(rule configs.DeviceRule) error {
+	if !rule.Type.CanCgroup() {
+		return errors.Errorf("cannot add rule [%#v] with non-cgroup type %q", rule, rule.Type)
+	}
+
+	innerRule := &deviceRule{
+		meta: deviceMeta{
+			node:  rule.Type,
+			major: rule.Major,
+			minor: rule.Minor,
+		},
+		perms: rule.Permissions,
+	}
+	if innerRule.meta.node == configs.WildcardDevice {
+		innerRule = nil
+	}
+
+	if rule.Allow {
+		return e.allow(innerRule)
+	} else {
+		return e.deny(innerRule)
+	}
+}
+
+// EmulatorFromList takes a reader to a "devices.list"-like source, and returns
+// a new Emulator that represents the state of the devices cgroup. Note that
+// black-list devices cgroups cannot be fully reconstructed, due to limitations
+// in the devices cgroup API. Instead, such cgroups are always treated as
+// "allow all" cgroups.
+func EmulatorFromList(list io.Reader) (*Emulator, error) {
+	// Normally cgroups are in black-list mode by default, but the way we
+	// figure out the current mode is whether or not devices.list has an
+	// allow-all rule. So we default to a white-list, and the existence of an
+	// "a *:* rwm" entry will tell us otherwise.
+	e := &Emulator{
+		defaultAllow: false,
+	}
+
+	// Parse the "devices.list".
+	s := bufio.NewScanner(list)
+	for s.Scan() {
+		line := s.Text()
+		deviceRule, err := parseLine(line)
+		if err != nil {
+			return nil, errors.Wrapf(err, "parsing line %q", line)
+		}
+		// "devices.list" is an allow list. Note that this means that in
+		// black-list mode, we have no idea what rules are in play. As a
+		// result, we need to be very careful in Transition().
+		if err := e.allow(deviceRule); err != nil {
+			return nil, errors.Wrapf(err, "adding devices.list rule")
+		}
+	}
+	if err := s.Err(); err != nil {
+		return nil, errors.Wrap(err, "reading devices.list lines")
+	}
+	return e, nil
+}
+
+// Transition calculates the minimally-disruptive set of rules that need to
+// be applied to a devices cgroup in order to transition to the given target.
+// This means that any already-existing rules will not be applied, and
+// disruptive rules (like denying all device access) will only be applied if
+// necessary.
+//
+// This function is the sole reason for all of Emulator -- to allow us
+// to figure out how to update a container's cgroups without causing spurious
+// device errors (if possible).
+func (source *Emulator) Transition(target *Emulator) ([]*configs.DeviceRule, error) {
+	var transitionRules []*configs.DeviceRule
+	oldRules := source.rules
+
+	// If the default policy doesn't match, we need to include a "disruptive"
+	// rule (either allow-all or deny-all) in order to switch the cgroup to the
+	// correct default policy.
+	//
+	// However, due to a limitation in "devices.list" we cannot be sure what
+	// deny rules are in place in a black-list cgroup. Thus if the source is a
+	// black-list we also have to include a disruptive rule.
+	if source.IsBlacklist() || source.defaultAllow != target.defaultAllow {
+		transitionRules = append(transitionRules, &configs.DeviceRule{
+			Type:        'a',
+			Major:       -1,
+			Minor:       -1,
+			Permissions: configs.DevicePermissions("rwm"),
+			Allow:       target.defaultAllow,
+		})
+		// The old rules are only relevant if we aren't starting out with a
+		// disruptive rule.
+		oldRules = nil
+	}
+
+	// NOTE: We traverse through the rules in a sorted order so we always write
+	//       the same set of rules (this is to aid testing).
+
+	// First, we create inverse rules for any old rules not in the new set.
+	// This includes partial-inverse rules for specific permissions. This is a
+	// no-op if we added a disruptive rule, since oldRules will be empty.
+	for _, rule := range oldRules.orderedEntries() {
+		meta, oldPerms := rule.meta, rule.perms
+		newPerms := target.rules[meta]
+		droppedPerms := oldPerms.Difference(newPerms)
+		if !droppedPerms.IsEmpty() {
+			transitionRules = append(transitionRules, &configs.DeviceRule{
+				Type:        meta.node,
+				Major:       meta.major,
+				Minor:       meta.minor,
+				Permissions: droppedPerms,
+				Allow:       target.defaultAllow,
+			})
+		}
+	}
+
+	// Add any additional rules which weren't in the old set. We happen to
+	// filter out rules which are present in both sets, though this isn't
+	// strictly necessary.
+	for _, rule := range target.rules.orderedEntries() {
+		meta, newPerms := rule.meta, rule.perms
+		oldPerms := oldRules[meta]
+		gainedPerms := newPerms.Difference(oldPerms)
+		if !gainedPerms.IsEmpty() {
+			transitionRules = append(transitionRules, &configs.DeviceRule{
+				Type:        meta.node,
+				Major:       meta.major,
+				Minor:       meta.minor,
+				Permissions: gainedPerms,
+				Allow:       !target.defaultAllow,
+			})
+		}
+	}
+	return transitionRules, nil
+}
--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/ebpf/devicefilter/devicefilter.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/ebpf/devicefilter/devicefilter.go
@@ -22,7 +22,7 @@ const (
 )

 // DeviceFilter returns eBPF device filter program and its license string
-func DeviceFilter(devices []*configs.Device) (asm.Instructions, string, error) {
+func DeviceFilter(devices []*configs.DeviceRule) (asm.Instructions, string, error) {
 	p := &program{}
 	p.init()
 	for i := len(devices) - 1; i >= 0; i-- {
@@ -49,7 +49,8 @@ func (p *program) init() {
 	*/
 	// R2 <- type (lower 16 bit of u32 access_type at R1[0])
 	p.insts = append(p.insts,
-		asm.LoadMem(asm.R2, asm.R1, 0, asm.Half))
+		asm.LoadMem(asm.R2, asm.R1, 0, asm.Word),
+		asm.And.Imm32(asm.R2, 0xFFFF))

 	// R3 <- access (upper 16 bit of u32 access_type at R1[0])
 	p.insts = append(p.insts,
@@ -67,7 +68,7 @@ func (p *program) init() {
 }

 // appendDevice needs to be called from the last element of OCI linux.resources.devices to the head element.
-func (p *program) appendDevice(dev *configs.Device) error {
+func (p *program) appendDevice(dev *configs.DeviceRule) error {
 	if p.blockID < 0 {
 		return errors.New("the program is finalized")
 	}
--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/BUILD
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/BUILD
@@ -3,14 +3,13 @@ load("@io_bazel_rules_go//go:def.bzl", "go_library")
 go_library(
    name = "go_default_library",
    srcs = [
-        "apply_raw.go",
        "blkio.go",
        "cpu.go",
        "cpuacct.go",
        "cpuset.go",
        "devices.go",
        "freezer.go",
-        "fs_unsupported.go",
+        "fs.go",
        "hugetlb.go",
        "kmem.go",
        "memory.go",
@@ -19,13 +18,16 @@ go_library(
        "net_prio.go",
        "perf_event.go",
        "pids.go",
+        "unsupported.go",
    ],
    importmap = "k8s.io/kubernetes/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs",
    importpath = "github.com/opencontainers/runc/libcontainer/cgroups/fs",
    visibility = ["//visibility:public"],
    deps = select({
        "@io_bazel_rules_go//go/platform:android": [
+            "//vendor/github.com/moby/sys/mountinfo:go_default_library",
            "//vendor/github.com/opencontainers/runc/libcontainer/cgroups:go_default_library",
+            "//vendor/github.com/opencontainers/runc/libcontainer/cgroups/devices:go_default_library",
            "//vendor/github.com/opencontainers/runc/libcontainer/cgroups/fscommon:go_default_library",
            "//vendor/github.com/opencontainers/runc/libcontainer/configs:go_default_library",
            "//vendor/github.com/opencontainers/runc/libcontainer/system:go_default_library",
@@ -34,7 +36,9 @@ go_library(
            "//vendor/golang.org/x/sys/unix:go_default_library",
        ],
        "@io_bazel_rules_go//go/platform:linux": [
+            "//vendor/github.com/moby/sys/mountinfo:go_default_library",
            "//vendor/github.com/opencontainers/runc/libcontainer/cgroups:go_default_library",
+            "//vendor/github.com/opencontainers/runc/libcontainer/cgroups/devices:go_default_library",
            "//vendor/github.com/opencontainers/runc/libcontainer/cgroups/fscommon:go_default_library",
            "//vendor/github.com/opencontainers/runc/libcontainer/configs:go_default_library",
            "//vendor/github.com/opencontainers/runc/libcontainer/system:go_default_library",
--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpu.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpu.go
@@ -4,6 +4,7 @@ package fs

 import (
 	"bufio"
+	"fmt"
 	"os"
 	"path/filepath"
 	"strconv"
@@ -66,9 +67,21 @@ func (s *CpuGroup) SetRtSched(path string, cgroup *configs.Cgroup) error {

 func (s *CpuGroup) Set(path string, cgroup *configs.Cgroup) error {
 	if cgroup.Resources.CpuShares != 0 {
-		if err := fscommon.WriteFile(path, "cpu.shares", strconv.FormatUint(cgroup.Resources.CpuShares, 10)); err != nil {
+		shares := cgroup.Resources.CpuShares
+		if err := fscommon.WriteFile(path, "cpu.shares", strconv.FormatUint(shares, 10)); err != nil {
 			return err
 		}
+		// read it back
+		sharesRead, err := fscommon.GetCgroupParamUint(path, "cpu.shares")
+		if err != nil {
+			return err
+		}
+		// ... and check
+		if shares > sharesRead {
+			return fmt.Errorf("the maximum allowed cpu-shares is %d", sharesRead)
+		} else if shares < sharesRead {
+			return fmt.Errorf("the minimum allowed cpu-shares is %d", sharesRead)
+		}
 	}
 	if cgroup.Resources.CpuPeriod != 0 {
 		if err := fscommon.WriteFile(path, "cpu.cfs_period_us", strconv.FormatUint(cgroup.Resources.CpuPeriod, 10)); err != nil {
--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpuacct.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpuacct.go
@@ -3,8 +3,10 @@
 package fs

 import (
+	"bufio"
 	"fmt"
 	"io/ioutil"
+	"os"
 	"path/filepath"
 	"strconv"
 	"strings"
@@ -12,15 +14,24 @@ import (
 	"github.com/opencontainers/runc/libcontainer/cgroups"
 	"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
 	"github.com/opencontainers/runc/libcontainer/configs"
-	"github.com/opencontainers/runc/libcontainer/system"
 )

 const (
-	cgroupCpuacctStat   = "cpuacct.stat"
-	nanosecondsInSecond = 1000000000
-)
+	cgroupCpuacctStat     = "cpuacct.stat"
+	cgroupCpuacctUsageAll = "cpuacct.usage_all"

-var clockTicks = uint64(system.GetClockTicks())
+	nanosecondsInSecond = 1000000000
+
+	userModeColumn              = 1
+	kernelModeColumn            = 2
+	cuacctUsageAllColumnsNumber = 3
+
+	// The value comes from `C.sysconf(C._SC_CLK_TCK)`, and
+	// on Linux it's a constant which is safe to be hard coded,
+	// so we can avoid using cgo here. For details, see:
+	// https://github.com/containerd/cgroups/pull/12
+	clockTicks uint64 = 100
+)

 type CpuacctGroup struct {
 }
@@ -62,8 +73,15 @@ func (s *CpuacctGroup) GetStats(path string, stats *cgroups.Stats) error {
 		return err
 	}

+	percpuUsageInKernelmode, percpuUsageInUsermode, err := getPercpuUsageInModes(path)
+	if err != nil {
+		return err
+	}
+
 	stats.CpuStats.CpuUsage.TotalUsage = totalUsage
 	stats.CpuStats.CpuUsage.PercpuUsage = percpuUsage
+	stats.CpuStats.CpuUsage.PercpuUsageInKernelmode = percpuUsageInKernelmode
+	stats.CpuStats.CpuUsage.PercpuUsageInUsermode = percpuUsageInUsermode
 	stats.CpuStats.CpuUsage.UsageInUsermode = userModeUsage
 	stats.CpuStats.CpuUsage.UsageInKernelmode = kernelModeUsage
 	return nil
@@ -120,3 +138,44 @@ func getPercpuUsage(path string) ([]uint64, error) {
 	}
 	return percpuUsage, nil
 }
+
+func getPercpuUsageInModes(path string) ([]uint64, []uint64, error) {
+	usageKernelMode := []uint64{}
+	usageUserMode := []uint64{}
+
+	file, err := os.Open(filepath.Join(path, cgroupCpuacctUsageAll))
+	if os.IsNotExist(err) {
+		return usageKernelMode, usageUserMode, nil
+	} else if err != nil {
+		return nil, nil, err
+	}
+	defer file.Close()
+
+	scanner := bufio.NewScanner(file)
+	scanner.Scan() //skipping header line
+
+	for scanner.Scan() {
+		lineFields := strings.SplitN(scanner.Text(), " ", cuacctUsageAllColumnsNumber+1)
+		if len(lineFields) != cuacctUsageAllColumnsNumber {
+			continue
+		}
+
+		usageInKernelMode, err := strconv.ParseUint(lineFields[kernelModeColumn], 10, 64)
+		if err != nil {
+			return nil, nil, fmt.Errorf("Unable to convert CPU usage in kernel mode to uint64: %s", err)
+		}
+		usageKernelMode = append(usageKernelMode, usageInKernelMode)
+
+		usageInUserMode, err := strconv.ParseUint(lineFields[userModeColumn], 10, 64)
+		if err != nil {
+			return nil, nil, fmt.Errorf("Unable to convert CPU usage in user mode to uint64: %s", err)
+		}
+		usageUserMode = append(usageUserMode, usageInUserMode)
+	}
+
+	if err := scanner.Err(); err != nil {
+		return nil, nil, fmt.Errorf("Problem in reading %s line by line, %s", cgroupCpuacctUsageAll, err)
+	}
+
+	return usageKernelMode, usageUserMode, nil
+}
--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpuset.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/cpuset.go
@@ -4,15 +4,16 @@ package fs

 import (
 	"bytes"
-	"fmt"
 	"io/ioutil"
 	"os"
 	"path/filepath"

+	"github.com/moby/sys/mountinfo"
 	"github.com/opencontainers/runc/libcontainer/cgroups"
 	"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
 	"github.com/opencontainers/runc/libcontainer/configs"
 	libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils"
+	"github.com/pkg/errors"
 )

 type CpusetGroup struct {
@@ -52,17 +53,39 @@ func (s *CpusetGroup) GetStats(path string, stats *cgroups.Stats) error {
 	return nil
 }

+// getMount returns the source mount point of the directory passed in as argument.
+func getMount(dir string) (string, error) {
+	mi, err := mountinfo.GetMounts(mountinfo.ParentsFilter(dir))
+	if err != nil {
+		return "", err
+	}
+	if len(mi) < 1 {
+		return "", errors.Errorf("Can't find mount point of %s", dir)
+	}
+
+	// find the longest mount point
+	var idx, maxlen int
+	for i := range mi {
+		if len(mi[i].Mountpoint) > maxlen {
+			maxlen = len(mi[i].Mountpoint)
+			idx = i
+		}
+	}
+
+	return mi[idx].Mountpoint, nil
+}
+
 func (s *CpusetGroup) ApplyDir(dir string, cgroup *configs.Cgroup, pid int) error {
 	// This might happen if we have no cpuset cgroup mounted.
 	// Just do nothing and don't fail.
 	if dir == "" {
 		return nil
 	}
-	mountInfo, err := ioutil.ReadFile("/proc/self/mountinfo")
+	root, err := getMount(dir)
 	if err != nil {
 		return err
 	}
-	root := filepath.Dir(cgroups.GetClosestMountpointAncestor(dir, string(mountInfo)))
+	root = filepath.Dir(root)
 	// 'ensureParent' start with parent because we don't want to
 	// explicitly inherit from parent, it could conflict with
 	// 'cpuset.cpu_exclusive'.
@@ -108,7 +131,7 @@ func (s *CpusetGroup) ensureParent(current, root string) error {
 	}
 	// Avoid infinite recursion.
 	if parent == current {
-		return fmt.Errorf("cpuset: cgroup parent path outside cgroup root")
+		return errors.New("cpuset: cgroup parent path outside cgroup root")
 	}
 	if err := s.ensureParent(parent, root); err != nil {
 		return err
--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/devices.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/devices.go
@@ -3,13 +3,19 @@
 package fs

 import (
+	"bytes"
+	"errors"
+	"reflect"
+
 	"github.com/opencontainers/runc/libcontainer/cgroups"
+	"github.com/opencontainers/runc/libcontainer/cgroups/devices"
 	"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
 	"github.com/opencontainers/runc/libcontainer/configs"
 	"github.com/opencontainers/runc/libcontainer/system"
 )

 type DevicesGroup struct {
+	testingSkipFinalCheck bool
 }

 func (s *DevicesGroup) Name() string {
@@ -26,49 +32,74 @@ func (s *DevicesGroup) Apply(d *cgroupData) error {
 	return nil
 }

+func loadEmulator(path string) (*devices.Emulator, error) {
+	list, err := fscommon.ReadFile(path, "devices.list")
+	if err != nil {
+		return nil, err
+	}
+	return devices.EmulatorFromList(bytes.NewBufferString(list))
+}
+
+func buildEmulator(rules []*configs.DeviceRule) (*devices.Emulator, error) {
+	// This defaults to a white-list -- which is what we want!
+	emu := &devices.Emulator{}
+	for _, rule := range rules {
+		if err := emu.Apply(*rule); err != nil {
+			return nil, err
+		}
+	}
+	return emu, nil
+}
+
 func (s *DevicesGroup) Set(path string, cgroup *configs.Cgroup) error {
 	if system.RunningInUserNS() {
 		return nil
 	}

-	devices := cgroup.Resources.Devices
-	if len(devices) > 0 {
-		for _, dev := range devices {
-			file := "devices.deny"
-			if dev.Allow {
-				file = "devices.allow"
-			}
-			if err := fscommon.WriteFile(path, file, dev.CgroupString()); err != nil {
-				return err
-			}
-		}
-		return nil
+	// Generate two emulators, one for the current state of the cgroup and one
+	// for the requested state by the user.
+	current, err := loadEmulator(path)
+	if err != nil {
+		return err
+	}
+	target, err := buildEmulator(cgroup.Resources.Devices)
+	if err != nil {
+		return err
 	}
-	if cgroup.Resources.AllowAllDevices != nil {
-		if *cgroup.Resources.AllowAllDevices == false {
-			if err := fscommon.WriteFile(path, "devices.deny", "a"); err != nil {
-				return err
-			}

-			for _, dev := range cgroup.Resources.AllowedDevices {
-				if err := fscommon.WriteFile(path, "devices.allow", dev.CgroupString()); err != nil {
-					return err
-				}
-			}
-			return nil
+	// Compute the minimal set of transition rules needed to achieve the
+	// requested state.
+	transitionRules, err := current.Transition(target)
+	if err != nil {
+		return err
+	}
+	for _, rule := range transitionRules {
+		file := "devices.deny"
+		if rule.Allow {
+			file = "devices.allow"
 		}
-
-		if err := fscommon.WriteFile(path, "devices.allow", "a"); err != nil {
+		if err := fscommon.WriteFile(path, file, rule.CgroupString()); err != nil {
 			return err
 		}
 	}

-	for _, dev := range cgroup.Resources.DeniedDevices {
-		if err := fscommon.WriteFile(path, "devices.deny", dev.CgroupString()); err != nil {
+	// Final safety check -- ensure that the resulting state is what was
+	// requested. This is only really correct for white-lists, but for
+	// black-lists we can at least check that the cgroup is in the right mode.
+	//
+	// This safety-check is skipped for the unit tests because we cannot
+	// currently mock devices.list correctly.
+	if !s.testingSkipFinalCheck {
+		currentAfter, err := loadEmulator(path)
+		if err != nil {
 			return err
 		}
+		if !target.IsBlacklist() && !reflect.DeepEqual(currentAfter, target) {
+			return errors.New("resulting devices cgroup doesn't precisely match target")
+		} else if target.IsBlacklist() != currentAfter.IsBlacklist() {
+			return errors.New("resulting devices cgroup doesn't match target mode")
+		}
 	}
-
 	return nil
 }

--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/freezer.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/freezer.go
@@ -3,13 +3,16 @@
 package fs

 import (
+	"errors"
 	"fmt"
+	"os"
 	"strings"
 	"time"

 	"github.com/opencontainers/runc/libcontainer/cgroups"
 	"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
 	"github.com/opencontainers/runc/libcontainer/configs"
+	"golang.org/x/sys/unix"
 )

 type FreezerGroup struct {
@@ -39,11 +42,11 @@ func (s *FreezerGroup) Set(path string, cgroup *configs.Cgroup) error {
 				return err
 			}

-			state, err := fscommon.ReadFile(path, "freezer.state")
+			state, err := s.GetState(path)
 			if err != nil {
 				return err
 			}
-			if strings.TrimSpace(state) == string(cgroup.Resources.Freezer) {
+			if state == cgroup.Resources.Freezer {
 				break
 			}

@@ -65,3 +68,30 @@ func (s *FreezerGroup) Remove(d *cgroupData) error {
 func (s *FreezerGroup) GetStats(path string, stats *cgroups.Stats) error {
 	return nil
 }
+
+// GetState reports the freezer state recorded in "freezer.state" under path.
+//
+// The file is polled: while it reads "FREEZING" the call sleeps for one
+// millisecond and reads again, so only a settled state is ever returned.
+// "THAWED" and "FROZEN" map to configs.Thawed and configs.Frozen. If the file
+// does not exist, or reading it fails with ENODEV, the result is
+// configs.Undefined with a nil error. Any other read error, or any other file
+// content, is returned as an error together with configs.Undefined.
+func (s *FreezerGroup) GetState(path string) (configs.FreezerState, error) {
+	for {
+		raw, err := fscommon.ReadFile(path, "freezer.state")
+		switch {
+		case err == nil:
+			// Read succeeded; the contents are interpreted below.
+		case os.IsNotExist(err), errors.Is(err, unix.ENODEV):
+			// The kernel is too old to provide the freezer, so the state is
+			// reported as undefined rather than as a failure.
+			return configs.Undefined, nil
+		default:
+			return configs.Undefined, err
+		}
+
+		trimmed := strings.TrimSpace(raw)
+		if trimmed == "FREEZING" {
+			// The cgroup is still transitioning; wait briefly and re-read
+			// until it settles. This should be a temporary delay.
+			time.Sleep(1 * time.Millisecond)
+			continue
+		}
+		if trimmed == "THAWED" {
+			return configs.Thawed, nil
+		}
+		if trimmed == "FROZEN" {
+			return configs.Frozen, nil
+		}
+		return configs.Undefined, fmt.Errorf("unknown freezer.state %q", raw)
+	}
+}
--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/apply_raw.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/apply_raw.go
@@ -3,10 +3,11 @@
 package fs

 import (
+	"bufio"
 	"fmt"
-	"io"
 	"os"
 	"path/filepath"
+	"strings"
 	"sync"

 	"github.com/opencontainers/runc/libcontainer/cgroups"
@@ -35,7 +36,7 @@ var (
 	HugePageSizes, _ = cgroups.GetHugePageSize()
 )

-var errSubsystemDoesNotExist = fmt.Errorf("cgroup: subsystem does not exist")
+var errSubsystemDoesNotExist = errors.New("cgroup: subsystem does not exist")

 type subsystemSet []subsystem

@@ -61,11 +62,19 @@ type subsystem interface {
 	Set(path string, cgroup *configs.Cgroup) error
 }

-type Manager struct {
+type manager struct {
 	mu       sync.Mutex
-	Cgroups  *configs.Cgroup
-	Rootless bool // ignore permission-related errors
-	Paths    map[string]string
+	cgroups  *configs.Cgroup
+	rootless bool // ignore permission-related errors
+	paths    map[string]string
+}
+
+// NewManager returns a cgroups.Manager backed by the unexported manager type,
+// initialised with the given cgroup configuration, subsystem path map and
+// rootless flag. The arguments are stored as passed; nothing is copied.
+func NewManager(cg *configs.Cgroup, paths map[string]string, rootless bool) cgroups.Manager {
+	m := new(manager)
+	m.cgroups = cg
+	m.paths = paths
+	m.rootless = rootless
+	return m
 }

 // The absolute path to the root of the cgroup hierarchies.
@@ -81,10 +90,43 @@ func getCgroupRoot() (string, error) {
 		return cgroupRoot, nil
 	}

-	root, err := cgroups.FindCgroupMountpointDir()
+	f, err := os.Open("/proc/self/mountinfo")
 	if err != nil {
 		return "", err
 	}
+	defer f.Close()
+
+	var root string
+	scanner := bufio.NewScanner(f)
+	for scanner.Scan() {
+		text := scanner.Text()
+		fields := strings.Split(text, " ")
+		// Safe as mountinfo encodes mountpoints with spaces as \040.
+		index := strings.Index(text, " - ")
+		postSeparatorFields := strings.Fields(text[index+3:])
+		numPostFields := len(postSeparatorFields)
+
+		// This is an error as we can't detect if the mount is for "cgroup"
+		if numPostFields == 0 {
+			return "", fmt.Errorf("mountinfo: found no fields post '-' in %q", text)
+		}
+
+		if postSeparatorFields[0] == "cgroup" {
+			// Check that the mount is properly formatted.
+			if numPostFields < 3 {
+				return "", fmt.Errorf("Error found less than 3 fields post '-' in %q", text)
+			}
+
+			root = filepath.Dir(fields[4])
+			break
+		}
+	}
+	if err := scanner.Err(); err != nil {
+		return "", err
+	}
+	if root == "" {
+		return "", errors.New("no cgroup mount found in mountinfo")
+	}

 	if _, err := os.Stat(root); err != nil {
 		return "", err
@@ -110,43 +152,39 @@ func isIgnorableError(rootless bool, err error) bool {
 	if !rootless {
 		return false
 	}
+	// TODO: rm errors.Cause once we switch to %w everywhere
+	err = errors.Cause(err)
 	// Is it an ordinary EPERM?
-	if os.IsPermission(errors.Cause(err)) {
+	if errors.Is(err, os.ErrPermission) {
 		return true
 	}
-
-	// Try to handle other errnos.
-	var errno error
-	switch err := errors.Cause(err).(type) {
-	case *os.PathError:
-		errno = err.Err
-	case *os.LinkError:
-		errno = err.Err
-	case *os.SyscallError:
-		errno = err.Err
+	// Handle some specific syscall errors.
+	var errno unix.Errno
+	if errors.As(err, &errno) {
+		return errno == unix.EROFS || errno == unix.EPERM || errno == unix.EACCES
 	}
-	return errno == unix.EROFS || errno == unix.EPERM || errno == unix.EACCES
+	return false
 }

-func (m *Manager) getSubsystems() subsystemSet {
+func (m *manager) getSubsystems() subsystemSet {
 	return subsystemsLegacy
 }

-func (m *Manager) Apply(pid int) (err error) {
-	if m.Cgroups == nil {
+func (m *manager) Apply(pid int) (err error) {
+	if m.cgroups == nil {
 		return nil
 	}
 	m.mu.Lock()
 	defer m.mu.Unlock()

-	var c = m.Cgroups
+	var c = m.cgroups

-	d, err := getCgroupData(m.Cgroups, pid)
+	d, err := getCgroupData(m.cgroups, pid)
 	if err != nil {
 		return err
 	}

-	m.Paths = make(map[string]string)
+	m.paths = make(map[string]string)
 	if c.Paths != nil {
 		for name, path := range c.Paths {
 			_, err := d.path(name)
@@ -156,15 +194,12 @@ func (m *Manager) Apply(pid int) (err error) {
 				}
 				return err
 			}
-			m.Paths[name] = path
+			m.paths[name] = path
 		}
-		return cgroups.EnterPid(m.Paths, pid)
+		return cgroups.EnterPid(m.paths, pid)
 	}

 	for _, sys := range m.getSubsystems() {
-		// TODO: Apply should, ideally, be reentrant or be broken up into a separate
-		// create and join phase so that the cgroup hierarchy for a container can be
-		// created then join consists of writing the process pids to cgroup.procs
 		p, err := d.path(sys.Name())
 		if err != nil {
 			// The non-presence of the devices subsystem is
@@ -174,15 +209,15 @@ func (m *Manager) Apply(pid int) (err error) {
 			}
 			return err
 		}
-		m.Paths[sys.Name()] = p
+		m.paths[sys.Name()] = p

 		if err := sys.Apply(d); err != nil {
-			// In the case of rootless (including euid=0 in userns), where an explicit cgroup path hasn't
-			// been set, we don't bail on error in case of permission problems.
-			// Cases where limits have been set (and we couldn't create our own
-			// cgroup) are handled by Set.
-			if isIgnorableError(m.Rootless, err) && m.Cgroups.Path == "" {
-				delete(m.Paths, sys.Name())
+			// In the case of rootless (including euid=0 in userns), where an
+			// explicit cgroup path hasn't been set, we don't bail on error in
+			// case of permission problems. Cases where limits have been set
+			// (and we couldn't create our own cgroup) are handled by Set.
+			if isIgnorableError(m.rootless, err) && m.cgroups.Path == "" {
+				delete(m.paths, sys.Name())
 				continue
 			}
 			return err
@@ -192,35 +227,30 @@ func (m *Manager) Apply(pid int) (err error) {
 	return nil
 }

-func (m *Manager) Destroy() error {
-	if m.Cgroups == nil || m.Cgroups.Paths != nil {
+func (m *manager) Destroy() error {
+	if m.cgroups == nil || m.cgroups.Paths != nil {
 		return nil
 	}
 	m.mu.Lock()
 	defer m.mu.Unlock()
-	if err := cgroups.RemovePaths(m.Paths); err != nil {
+	if err := cgroups.RemovePaths(m.paths); err != nil {
 		return err
 	}
-	m.Paths = make(map[string]string)
+	m.paths = make(map[string]string)
 	return nil
 }

-func (m *Manager) GetPaths() map[string]string {
+func (m *manager) Path(subsys string) string {
 	m.mu.Lock()
-	paths := m.Paths
-	m.mu.Unlock()
-	return paths
+	defer m.mu.Unlock()
+	return m.paths[subsys]
 }

-func (m *Manager) GetUnifiedPath() (string, error) {
-	return "", errors.New("unified path is only supported when running in unified mode")
-}
-
-func (m *Manager) GetStats() (*cgroups.Stats, error) {
+func (m *manager) GetStats() (*cgroups.Stats, error) {
 	m.mu.Lock()
 	defer m.mu.Unlock()
 	stats := cgroups.NewStats()
-	for name, path := range m.Paths {
+	for name, path := range m.paths {
 		sys, err := m.getSubsystems().Get(name)
 		if err == errSubsystemDoesNotExist || !cgroups.PathExists(path) {
 			continue
@@ -232,22 +262,23 @@ func (m *Manager) GetStats() (*cgroups.Stats, error) {
 	return stats, nil
 }

-func (m *Manager) Set(container *configs.Config) error {
+func (m *manager) Set(container *configs.Config) error {
 	if container.Cgroups == nil {
 		return nil
 	}

 	// If Paths are set, then we are just joining cgroups paths
 	// and there is no need to set any values.
-	if m.Cgroups != nil && m.Cgroups.Paths != nil {
+	if m.cgroups != nil && m.cgroups.Paths != nil {
 		return nil
 	}

-	paths := m.GetPaths()
+	m.mu.Lock()
+	defer m.mu.Unlock()
 	for _, sys := range m.getSubsystems() {
-		path := paths[sys.Name()]
+		path := m.paths[sys.Name()]
 		if err := sys.Set(path, container.Cgroups); err != nil {
-			if m.Rootless && sys.Name() == "devices" {
+			if m.rootless && sys.Name() == "devices" {
 				continue
 			}
 			// When m.Rootless is true, errors from the device subsystem are ignored because it is really not expected to work.
@@ -262,45 +293,41 @@ func (m *Manager) Set(container *configs.Config) error {
 		}
 	}

-	if m.Paths["cpu"] != "" {
-		if err := CheckCpushares(m.Paths["cpu"], container.Cgroups.Resources.CpuShares); err != nil {
-			return err
-		}
-	}
 	return nil
 }

 // Freeze toggles the container's freezer cgroup depending on the state
 // provided
-func (m *Manager) Freeze(state configs.FreezerState) error {
-	if m.Cgroups == nil {
+func (m *manager) Freeze(state configs.FreezerState) (Err error) {
+	path := m.Path("freezer")
+	if m.cgroups == nil || path == "" {
 		return errors.New("cannot toggle freezer: cgroups not configured for container")
 	}

-	paths := m.GetPaths()
-	dir := paths["freezer"]
-	prevState := m.Cgroups.Resources.Freezer
-	m.Cgroups.Resources.Freezer = state
+	prevState := m.cgroups.Resources.Freezer
+	m.cgroups.Resources.Freezer = state
+	defer func() {
+		if Err != nil {
+			m.cgroups.Resources.Freezer = prevState
+		}
+	}()
+
 	freezer, err := m.getSubsystems().Get("freezer")
 	if err != nil {
 		return err
 	}
-	err = freezer.Set(dir, m.Cgroups)
-	if err != nil {
-		m.Cgroups.Resources.Freezer = prevState
+	if err := freezer.Set(path, m.cgroups); err != nil {
 		return err
 	}
 	return nil
 }

-func (m *Manager) GetPids() ([]int, error) {
-	paths := m.GetPaths()
-	return cgroups.GetPids(paths["devices"])
+func (m *manager) GetPids() ([]int, error) {
+	return cgroups.GetPids(m.Path("devices"))
 }

-func (m *Manager) GetAllPids() ([]int, error) {
-	paths := m.GetPaths()
-	return cgroups.GetAllPids(paths["devices"])
+func (m *manager) GetAllPids() ([]int, error) {
+	return cgroups.GetAllPids(m.Path("devices"))
 }

 func getCgroupData(c *configs.Cgroup, pid int) (*cgroupData, error) {
@@ -310,7 +337,7 @@ func getCgroupData(c *configs.Cgroup, pid int) (*cgroupData, error) {
 	}

 	if (c.Name != "" || c.Parent != "") && c.Path != "" {
-		return nil, fmt.Errorf("cgroup: either Path or Name and Parent should be used")
+		return nil, errors.New("cgroup: either Path or Name and Parent should be used")
 	}

 	// XXX: Do not remove this code. Path safety is important! -- cyphar
@@ -379,33 +406,27 @@ func removePath(p string, err error) error {
 	return nil
 }

-func CheckCpushares(path string, c uint64) error {
-	var cpuShares uint64
-
-	if c == 0 {
-		return nil
-	}
-
-	fd, err := os.Open(filepath.Join(path, "cpu.shares"))
-	if err != nil {
-		return err
-	}
-	defer fd.Close()
-
-	_, err = fmt.Fscanf(fd, "%d", &cpuShares)
-	if err != nil && err != io.EOF {
-		return err
-	}
-
-	if c > cpuShares {
-		return fmt.Errorf("The maximum allowed cpu-shares is %d", cpuShares)
-	} else if c < cpuShares {
-		return fmt.Errorf("The minimum allowed cpu-shares is %d", cpuShares)
-	}
-
-	return nil
+// GetPaths returns the manager's subsystem-to-path map. The field is read
+// while holding m.mu, but the map handed back is the manager's own map, not
+// a copy.
+func (m *manager) GetPaths() map[string]string {
+	m.mu.Lock()
+	current := m.paths
+	m.mu.Unlock()
+	return current
 }

-func (m *Manager) GetCgroups() (*configs.Cgroup, error) {
-	return m.Cgroups, nil
+// GetCgroups returns the cgroup configuration held by the manager. The
+// returned error is always nil.
+func (m *manager) GetCgroups() (*configs.Cgroup, error) {
+	cg := m.cgroups
+	return cg, nil
+}
+
+// GetFreezerState returns the current state of the container's freezer
+// cgroup. When the freezer subsystem cannot be looked up, or no "freezer"
+// path is recorded for this manager, it reports configs.Undefined with a nil
+// error; otherwise it defers to FreezerGroup.GetState on the recorded path.
+func (m *manager) GetFreezerState() (configs.FreezerState, error) {
+	freezerDir := m.Path("freezer")
+	sys, err := m.getSubsystems().Get("freezer")
+	if err != nil {
+		// No freezer subsystem: the state is simply undefined.
+		return configs.Undefined, nil
+	}
+	if freezerDir == "" {
+		// No freezer path recorded for this container.
+		return configs.Undefined, nil
+	}
+	return sys.(*FreezerGroup).GetState(freezerDir)
+}
+
+// Exists reports whether the path recorded for the "devices" subsystem
+// exists, as determined by cgroups.PathExists.
+func (m *manager) Exists() bool {
+	devicesDir := m.Path("devices")
+	return cgroups.PathExists(devicesDir)
 }
--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/kmem.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/kmem.go
@@ -6,10 +6,8 @@ import (
 	"errors"
 	"fmt"
 	"io/ioutil"
-	"os"
 	"path/filepath"
 	"strconv"
-	"syscall" // for Errno type only

 	"github.com/opencontainers/runc/libcontainer/cgroups"
 	"golang.org/x/sys/unix"
@@ -49,12 +47,8 @@ func setKernelMemory(path string, kernelMemoryLimit int64) error {
 		// The EBUSY signal is returned on attempts to write to the
 		// memory.kmem.limit_in_bytes file if the cgroup has children or
 		// once tasks have been attached to the cgroup
-		if pathErr, ok := err.(*os.PathError); ok {
-			if errNo, ok := pathErr.Err.(syscall.Errno); ok {
-				if errNo == unix.EBUSY {
-					return fmt.Errorf("failed to set %s, because either tasks have already joined this cgroup or it has children", cgroupKernelMemoryLimit)
-				}
-			}
+		if errors.Is(err, unix.EBUSY) {
+			return fmt.Errorf("failed to set %s, because either tasks have already joined this cgroup or it has children", cgroupKernelMemoryLimit)
 		}
 		return fmt.Errorf("failed to write %v to %v: %v", kernelMemoryLimit, cgroupKernelMemoryLimit, err)
 	}
--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/memory.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/memory.go
@@ -5,7 +5,9 @@ package fs
 import (
 	"bufio"
 	"fmt"
+	"math"
 	"os"
+	"path"
 	"path/filepath"
 	"strconv"
 	"strings"
@@ -16,8 +18,16 @@ import (
 )

 const (
-	cgroupMemorySwapLimit = "memory.memsw.limit_in_bytes"
-	cgroupMemoryLimit     = "memory.limit_in_bytes"
+	numaNodeSymbol            = "N"
+	numaStatColumnSeparator   = " "
+	numaStatKeyValueSeparator = "="
+	numaStatMaxColumns        = math.MaxUint8 + 1
+	numaStatValueIndex        = 1
+	numaStatTypeIndex         = 0
+	numaStatColumnSliceLength = 2
+	cgroupMemorySwapLimit     = "memory.memsw.limit_in_bytes"
+	cgroupMemoryLimit         = "memory.limit_in_bytes"
+	cgroupMemoryPagesByNuma   = "memory.numa_stat"
 )

 type MemoryGroup struct {
@@ -64,9 +74,9 @@ func (s *MemoryGroup) Apply(d *cgroupData) (err error) {
 }

 func setMemoryAndSwap(path string, cgroup *configs.Cgroup) error {
-	// If the memory update is set to -1 we should also
-	// set swap to -1, it means unlimited memory.
-	if cgroup.Resources.Memory == -1 {
+	// If the memory update is set to -1 and the swap is not explicitly
+	// set, we should also set swap to -1, it means unlimited memory.
+	if cgroup.Resources.Memory == -1 && cgroup.Resources.MemorySwap == 0 {
 		// Only set swap if it's enabled in kernel
 		if cgroups.PathExists(filepath.Join(path, cgroupMemorySwapLimit)) {
 			cgroup.Resources.MemorySwap = -1
@@ -209,6 +219,13 @@ func (s *MemoryGroup) GetStats(path string, stats *cgroups.Stats) error {
 	if value == 1 {
 		stats.MemoryStats.UseHierarchy = true
 	}
+
+	pagesByNUMA, err := getPageUsageByNUMA(path)
+	if err != nil {
+		return err
+	}
+	stats.MemoryStats.PageUsageByNUMA = pagesByNUMA
+
 	return nil
 }

@@ -269,3 +286,79 @@ func getMemoryData(path, name string) (cgroups.MemoryData, error) {

 	return memoryData, nil
 }
+
+// getPageUsageByNUMA parses the memory.numa_stat file found under cgroupPath
+// into a cgroups.PageUsageByNUMA.
+//
+// Each line is split on spaces into "key=value" columns. A column whose key
+// starts with "N" supplies the page count for one NUMA node (the remainder
+// of the key is the node ID, which must fit in a uint8); any other column
+// names the statistic type of the line and supplies its total.
+//
+// A missing file is not an error: empty statistics are returned. Any other
+// open failure, a read failure, an unparsable number, an unknown statistic
+// type, or a column that lacks the "=" separator is returned as an error
+// together with empty statistics.
+func getPageUsageByNUMA(cgroupPath string) (cgroups.PageUsageByNUMA, error) {
+	stats := cgroups.PageUsageByNUMA{}
+
+	file, err := os.Open(path.Join(cgroupPath, cgroupMemoryPagesByNuma))
+	if os.IsNotExist(err) {
+		return stats, nil
+	} else if err != nil {
+		return stats, err
+	}
+	// Release the descriptor on every return path so that repeated stats
+	// collection does not leak file descriptors.
+	defer file.Close()
+
+	scanner := bufio.NewScanner(file)
+	for scanner.Scan() {
+		var statsType string
+		statsByType := cgroups.PageStats{Nodes: map[uint8]uint64{}}
+		columns := strings.SplitN(scanner.Text(), numaStatColumnSeparator, numaStatMaxColumns)
+
+		for _, column := range columns {
+			pagesByNode := strings.SplitN(column, numaStatKeyValueSeparator, numaStatColumnSliceLength)
+			// A column without the key/value separator yields a single
+			// element; report it instead of indexing past the slice.
+			if len(pagesByNode) != numaStatColumnSliceLength {
+				return cgroups.PageUsageByNUMA{}, fmt.Errorf("malformed column %q in %s", column, cgroupMemoryPagesByNuma)
+			}
+
+			if strings.HasPrefix(pagesByNode[numaStatTypeIndex], numaNodeSymbol) {
+				// Per-node column: the key is the node symbol followed by the node ID.
+				nodeID, err := strconv.ParseUint(pagesByNode[numaStatTypeIndex][1:], 10, 8)
+				if err != nil {
+					return cgroups.PageUsageByNUMA{}, err
+				}
+
+				statsByType.Nodes[uint8(nodeID)], err = strconv.ParseUint(pagesByNode[numaStatValueIndex], 0, 64)
+				if err != nil {
+					return cgroups.PageUsageByNUMA{}, err
+				}
+			} else {
+				// Type column: remember the type and its total for this line.
+				statsByType.Total, err = strconv.ParseUint(pagesByNode[numaStatValueIndex], 0, 64)
+				if err != nil {
+					return cgroups.PageUsageByNUMA{}, err
+				}
+
+				statsType = pagesByNode[numaStatTypeIndex]
+			}
+
+			// Stored after every column; the Nodes map is shared between the
+			// copies, so the final store carries every node seen on the line.
+			err := addNUMAStatsByType(&stats, statsByType, statsType)
+			if err != nil {
+				return cgroups.PageUsageByNUMA{}, err
+			}
+		}
+	}
+	err = scanner.Err()
+	if err != nil {
+		return cgroups.PageUsageByNUMA{}, err
+	}
+
+	return stats, nil
+}
+
+// addNUMAStatsByType stores byTypeStats in the field of stats selected by
+// statsType: "total", "file", "anon" and "unevictable" select the top-level
+// fields, and the same names prefixed with "hierarchical_" select the
+// matching fields of stats.Hierarchical. Any other statsType leaves stats
+// untouched and yields an error.
+func addNUMAStatsByType(stats *cgroups.PageUsageByNUMA, byTypeStats cgroups.PageStats, statsType string) error {
+	var dst *cgroups.PageStats
+	switch statsType {
+	case "total":
+		dst = &stats.Total
+	case "file":
+		dst = &stats.File
+	case "anon":
+		dst = &stats.Anon
+	case "unevictable":
+		dst = &stats.Unevictable
+	case "hierarchical_total":
+		dst = &stats.Hierarchical.Total
+	case "hierarchical_file":
+		dst = &stats.Hierarchical.File
+	case "hierarchical_anon":
+		dst = &stats.Hierarchical.Anon
+	case "hierarchical_unevictable":
+		dst = &stats.Hierarchical.Unevictable
+	default:
+		return fmt.Errorf("unsupported NUMA page type found: %s", statsType)
+	}
+	*dst = byTypeStats
+	return nil
+}
--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/fs_unsupported.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs/fs_unsupported.go
--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/BUILD
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/BUILD
@@ -5,10 +5,12 @@ go_library(
    srcs = [
        "cpu.go",
        "cpuset.go",
+        "create.go",
        "defaultpath.go",
        "devices.go",
        "freezer.go",
        "fs2.go",
+        "hugetlb.go",
        "io.go",
        "memory.go",
        "pids.go",
@@ -22,7 +24,6 @@ go_library(
        "//vendor/github.com/pkg/errors:go_default_library",
    ] + select({
        "@io_bazel_rules_go//go/platform:android": [
-            "//vendor/github.com/cyphar/filepath-securejoin:go_default_library",
            "//vendor/github.com/opencontainers/runc/libcontainer/cgroups:go_default_library",
            "//vendor/github.com/opencontainers/runc/libcontainer/cgroups/ebpf:go_default_library",
            "//vendor/github.com/opencontainers/runc/libcontainer/cgroups/ebpf/devicefilter:go_default_library",
@@ -30,7 +31,6 @@ go_library(
            "//vendor/golang.org/x/sys/unix:go_default_library",
        ],
        "@io_bazel_rules_go//go/platform:linux": [
-            "//vendor/github.com/cyphar/filepath-securejoin:go_default_library",
            "//vendor/github.com/opencontainers/runc/libcontainer/cgroups:go_default_library",
            "//vendor/github.com/opencontainers/runc/libcontainer/cgroups/ebpf:go_default_library",
            "//vendor/github.com/opencontainers/runc/libcontainer/cgroups/ebpf/devicefilter:go_default_library",
--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/cpu.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/cpu.go
@@ -13,15 +13,36 @@ import (
 	"github.com/opencontainers/runc/libcontainer/configs"
 )

+// isCpuSet reports whether any of the CpuWeight, CpuQuota or CpuPeriod
+// resources of cgroup is non-zero.
+func isCpuSet(cgroup *configs.Cgroup) bool {
+	r := cgroup.Resources
+	switch {
+	case r.CpuWeight != 0, r.CpuQuota != 0, r.CpuPeriod != 0:
+		return true
+	}
+	return false
+}
+
 func setCpu(dirPath string, cgroup *configs.Cgroup) error {
-	if cgroup.Resources.CpuWeight != 0 {
-		if err := fscommon.WriteFile(dirPath, "cpu.weight", strconv.FormatUint(cgroup.Resources.CpuWeight, 10)); err != nil {
+	if !isCpuSet(cgroup) {
+		return nil
+	}
+	r := cgroup.Resources
+
+	// NOTE: .CpuShares is not used here. Conversion is the caller's responsibility.
+	if r.CpuWeight != 0 {
+		if err := fscommon.WriteFile(dirPath, "cpu.weight", strconv.FormatUint(r.CpuWeight, 10)); err != nil {
 			return err
 		}
 	}

-	if cgroup.Resources.CpuMax != "" {
-		if err := fscommon.WriteFile(dirPath, "cpu.max", cgroup.Resources.CpuMax); err != nil {
+	if r.CpuQuota != 0 || r.CpuPeriod != 0 {
+		str := "max"
+		if r.CpuQuota > 0 {
+			str = strconv.FormatInt(r.CpuQuota, 10)
+		}
+		period := r.CpuPeriod
+		if period == 0 {
+			// This default value is documented in
+			// https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html
+			period = 100000
+		}
+		str += " " + strconv.FormatUint(period, 10)
+		if err := fscommon.WriteFile(dirPath, "cpu.max", str); err != nil {
 			return err
 		}
 	}
--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/cpuset.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/cpuset.go
@@ -7,7 +7,15 @@ import (
 	"github.com/opencontainers/runc/libcontainer/configs"
 )

+// isCpusetSet reports whether cgroup specifies a CPU list (CpusetCpus) or a
+// memory-node list (CpusetMems).
+func isCpusetSet(cgroup *configs.Cgroup) bool {
+	r := cgroup.Resources
+	if r.CpusetCpus != "" {
+		return true
+	}
+	return r.CpusetMems != ""
+}
+
 func setCpuset(dirPath string, cgroup *configs.Cgroup) error {
+	if !isCpusetSet(cgroup) {
+		return nil
+	}
+
 	if cgroup.Resources.CpusetCpus != "" {
 		if err := fscommon.WriteFile(dirPath, "cpuset.cpus", cgroup.Resources.CpusetCpus); err != nil {
 			return err
--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/create.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/create.go
@@ -0,0 +1,151 @@
+package fs2
+
+import (
+	"bytes"
+	"fmt"
+	"io/ioutil"
+	"os"
+	"path/filepath"
+	"strings"
+
+	"github.com/opencontainers/runc/libcontainer/configs"
+)
+
+// supportedControllers returns the raw contents of the "cgroup.controllers"
+// file located directly under UnifiedMountpoint. The cgroup argument is
+// currently unused.
+func supportedControllers(cgroup *configs.Cgroup) ([]byte, error) {
+	return ioutil.ReadFile(UnifiedMountpoint + "/cgroup.controllers")
+}
+
+// needAnyControllers reports whether at least one controller has to be
+// enabled for cgroup: that is, whether some resource group configured in
+// cgroup (pids, memory, io, cpu, cpuset or hugetlb) has its controller listed
+// as available by supportedControllers.
+//
+// "Pseudo" controllers such as "freezer" and "devices" are not considered.
+// A nil cgroup needs no controllers. An error is returned only when the list
+// of available controllers cannot be read.
+func needAnyControllers(cgroup *configs.Cgroup) (bool, error) {
+	if cgroup == nil {
+		return false, nil
+	}
+
+	// Read the list of all available controllers.
+	content, err := supportedControllers(cgroup)
+	if err != nil {
+		return false, err
+	}
+
+	// Set of available controller names; a missing key reads as false.
+	available := make(map[string]bool)
+	for _, name := range strings.Fields(string(content)) {
+		available[name] = true
+	}
+
+	// A controller is needed when its resources are set and it is available.
+	switch {
+	case isPidsSet(cgroup) && available["pids"],
+		isMemorySet(cgroup) && available["memory"],
+		isIoSet(cgroup) && available["io"],
+		isCpuSet(cgroup) && available["cpu"],
+		isCpusetSet(cgroup) && available["cpuset"],
+		isHugeTlbSet(cgroup) && available["hugetlb"]:
+		return true, nil
+	}
+
+	return false, nil
+}
+
+// containsDomainController reports whether cg configures resources for any
+// controller that this package treats as a domain controller: memory, io,
+// cpu or hugetlb.
+// Refer to: http://man7.org/linux/man-pages/man7/cgroups.7.html
+// As at Linux 4.19, the following controllers are threaded: cpu, perf_event, and pids.
+// NOTE(review): cpu appears in the threaded list above yet is counted as a
+// domain controller below -- confirm this is intentional.
+func containsDomainController(cg *configs.Cgroup) bool {
+	if isMemorySet(cg) {
+		return true
+	}
+	if isIoSet(cg) {
+		return true
+	}
+	if isCpuSet(cg) {
+		return true
+	}
+	return isHugeTlbSet(cg)
+}
+
+// CreateCgroupPath creates the cgroupv2 directory at path, along with any
+// missing intermediate directories, and requests that every directory above
+// the final one enable all supported controllers through its
+// "cgroup.subtree_control" file.
+//
+// path must start with UnifiedMountpoint, otherwise an error is returned.
+// Directories created by this call are removed again if the call ends up
+// returning an error. While walking down, each directory's "cgroup.type" is
+// inspected: when c requests domain controllers (see
+// containsDomainController) and the directory is "domain invalid", "domain
+// threaded" or "threaded", an error is returned; a "domain invalid" directory
+// is otherwise switched to "threaded" on a best-effort basis.
+//
+// Failures to read "cgroup.type", to write "threaded" into it, or to write
+// "cgroup.subtree_control" are deliberately ignored.
+func CreateCgroupPath(path string, c *configs.Cgroup) (Err error) {
+	if !strings.HasPrefix(path, UnifiedMountpoint) {
+		return fmt.Errorf("invalid cgroup path %s", path)
+	}
+
+	content, err := supportedControllers(c)
+	if err != nil {
+		return err
+	}
+
+	// Build "+ctr1 +ctr2 ..." from the whitespace-separated controller list.
+	enableAll := append([]byte("+"), bytes.Join(bytes.Fields(content), []byte(" +"))...)
+
+	// Drop the first three components and restart from "/sys/fs".
+	// NOTE(review): this assumes UnifiedMountpoint is "/sys/fs/cgroup" -- confirm.
+	components := strings.Split(path, "/")[3:]
+	last := len(components) - 1
+	dir := "/sys/fs"
+	for idx, name := range components {
+		dir = filepath.Join(dir, name)
+		if idx > 0 {
+			mkErr := os.Mkdir(dir, 0755)
+			switch {
+			case mkErr == nil:
+				// If the directory was created, be sure it is not left around on errors.
+				created := dir
+				defer func() {
+					if Err != nil {
+						os.Remove(created)
+					}
+				}()
+			case !os.IsExist(mkErr):
+				return mkErr
+			}
+
+			typeFile := filepath.Join(dir, "cgroup.type")
+			// A read failure leaves the mode empty, which matches no case below.
+			rawType, _ := ioutil.ReadFile(typeFile)
+			mode := strings.TrimSpace(string(rawType))
+			switch mode {
+			// If the cgroup is in an invalid mode (usually this means there's an internal
+			// process in the cgroup tree, because we created a cgroup under an
+			// already-populated-by-other-processes cgroup), then we have to error out if
+			// the user requested controllers which are not thread-aware. However, if all
+			// the controllers requested are thread-aware we can simply put the cgroup into
+			// threaded mode.
+			case "domain invalid":
+				if containsDomainController(c) {
+					return fmt.Errorf("cannot enter cgroupv2 %q with domain controllers -- it is in an invalid state", dir)
+				}
+				// Not entirely correct (in theory we'd always want to be a domain --
+				// since that means we're a properly delegated cgroup subtree) but in
+				// this case there's not much we can do and it's better than giving an
+				// error.
+				_ = ioutil.WriteFile(typeFile, []byte("threaded"), 0644)
+			// If the cgroup is in (threaded) or (domain threaded) mode, we can only use thread-aware controllers
+			// (and you cannot usually take a cgroup out of threaded mode).
+			case "domain threaded", "threaded":
+				if containsDomainController(c) {
+					return fmt.Errorf("cannot enter cgroupv2 %q with domain controllers -- it is in %s mode", dir, mode)
+				}
+			}
+		}
+
+		// Enable all supported controllers on every directory but the last.
+		if idx < last {
+			controlFile := filepath.Join(dir, "cgroup.subtree_control")
+			if err := ioutil.WriteFile(controlFile, enableAll, 0644); err != nil {
+				// The combined write failed; try the controllers one by one.
+				for _, single := range bytes.Split(enableAll, []byte(" ")) {
+					_ = ioutil.WriteFile(controlFile, single, 0644)
+				}
+			}
+			// Some controllers might not be enabled when rootless or containerized,
+			// but we don't catch the error here. (Caught in setXXX() functions.)
+		}
+	}
+
+	return nil
+}
--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/defaultpath.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/defaultpath.go
@@ -44,14 +44,10 @@ func defaultDirPath(c *configs.Cgroup) (string, error) {
 	cgParent := libcontainerUtils.CleanPath(c.Parent)
 	cgName := libcontainerUtils.CleanPath(c.Name)

-	ownCgroup, err := parseCgroupFile("/proc/self/cgroup")
-	if err != nil {
-		return "", err
-	}
-	return _defaultDirPath(UnifiedMountpoint, cgPath, cgParent, cgName, ownCgroup)
+	return _defaultDirPath(UnifiedMountpoint, cgPath, cgParent, cgName)
 }

-func _defaultDirPath(root, cgPath, cgParent, cgName, ownCgroup string) (string, error) {
+func _defaultDirPath(root, cgPath, cgParent, cgName string) (string, error) {
 	if (cgName != "" || cgParent != "") && cgPath != "" {
 		return "", errors.New("cgroup: either Path or Name and Parent should be used")
 	}
@@ -62,6 +58,16 @@ func _defaultDirPath(root, cgPath, cgParent, cgName, ownCgroup string) (string,
 	if filepath.IsAbs(innerPath) {
 		return filepath.Join(root, innerPath), nil
 	}
+
+	ownCgroup, err := parseCgroupFile("/proc/self/cgroup")
+	if err != nil {
+		return "", err
+	}
+	// The current user scope most probably has tasks in it already,
+	// making it impossible to enable controllers for its sub-cgroup.
+	// A parent cgroup (with no tasks in it) is what we need.
+	ownCgroup = filepath.Dir(ownCgroup)
+
 	return filepath.Join(root, ownCgroup, innerPath), nil
 }

@@ -80,9 +86,6 @@ func parseCgroupFromReader(r io.Reader) (string, error) {
 		s = bufio.NewScanner(r)
 	)
 	for s.Scan() {
-		if err := s.Err(); err != nil {
-			return "", err
-		}
 		var (
 			text  = s.Text()
 			parts = strings.SplitN(text, ":", 3)
@@ -95,5 +98,8 @@ func parseCgroupFromReader(r io.Reader) (string, error) {
 			return parts[2], nil
 		}
 	}
+	if err := s.Err(); err != nil {
+		return "", err
+	}
 	return "", errors.New("cgroup path not found")
 }
--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/devices.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/devices.go
@@ -10,12 +10,10 @@ import (
 	"golang.org/x/sys/unix"
 )

-func isRWM(cgroupPermissions string) bool {
-	r := false
-	w := false
-	m := false
-	for _, rn := range cgroupPermissions {
-		switch rn {
+func isRWM(perms configs.DevicePermissions) bool {
+	var r, w, m bool
+	for _, perm := range perms {
+		switch perm {
 		case 'r':
 			r = true
 		case 'w':
@@ -39,22 +37,10 @@ func canSkipEBPFError(cgroup *configs.Cgroup) bool {
 }

 func setDevices(dirPath string, cgroup *configs.Cgroup) error {
+	// XXX: This is currently a white-list (but all callers pass a blacklist of
+	//      devices). This is bad for a whole variety of reasons, but will need
+	//      to be fixed with co-ordinated effort with downstreams.
 	devices := cgroup.Devices
-	if allowAllDevices := cgroup.Resources.AllowAllDevices; allowAllDevices != nil {
-		// never set by OCI specconv, but *allowAllDevices=false is still used by the integration test
-		if *allowAllDevices == true {
-			return errors.New("libcontainer AllowAllDevices is not supported, use Devices")
-		}
-		for _, ad := range cgroup.Resources.AllowedDevices {
-			d := *ad
-			d.Allow = true
-			devices = append(devices, &d)
-		}
-	}
-	if len(cgroup.Resources.DeniedDevices) != 0 {
-		// never set by OCI specconv
-		return errors.New("libcontainer DeniedDevices is not supported, use Devices")
-	}
 	insts, license, err := devicefilter.DeviceFilter(devices)
 	if err != nil {
 		return err
@@ -64,6 +50,17 @@ func setDevices(dirPath string, cgroup *configs.Cgroup) error {
 		return errors.Errorf("cannot get dir FD for %s", dirPath)
 	}
 	defer unix.Close(dirFD)
+	// XXX: This code is currently incorrect when it comes to updating an
+	//      existing cgroup with new rules (new rulesets are just appended to
+	//      the program list because this uses BPF_F_ALLOW_MULTI). If we didn't
+	//      use BPF_F_ALLOW_MULTI we could actually atomically swap the
+	//      programs.
+	//
+	//      The real issue is that BPF_F_ALLOW_MULTI makes it hard to have a
+	//      race-free blacklist because it acts as a whitelist by default, and
+	//      a deny-everything program cannot be overridden by other
+	//      programs. You could temporarily insert a deny-everything program
+	//      but that would result in spurious failures during updates.
 	if _, err := ebpf.LoadAttachCgroupDeviceFilter(insts, license, dirFD); err != nil {
 		if !canSkipEBPFError(cgroup) {
 			return err
--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/freezer.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/freezer.go
@@ -3,32 +3,49 @@
 package fs2

 import (
-	"strconv"
+	stdErrors "errors"
+	"os"
 	"strings"

 	"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
 	"github.com/opencontainers/runc/libcontainer/configs"
 	"github.com/pkg/errors"
+	"golang.org/x/sys/unix"
 )

 func setFreezer(dirPath string, state configs.FreezerState) error {
-	var desired int
+	if err := supportsFreezer(dirPath); err != nil {
+		// We can ignore this request as long as the user didn't ask us to
+		// freeze the container (since without the freezer cgroup, that's a
+		// no-op).
+		if state == configs.Undefined || state == configs.Thawed {
+			err = nil
+		}
+		return errors.Wrap(err, "freezer not supported")
+	}
+
+	var stateStr string
 	switch state {
 	case configs.Undefined:
 		return nil
 	case configs.Frozen:
-		desired = 1
+		stateStr = "1"
 	case configs.Thawed:
-		desired = 0
+		stateStr = "0"
 	default:
-		return errors.Errorf("unknown freezer state %+v", state)
+		return errors.Errorf("invalid freezer state %q requested", state)
 	}
-	supportedErr := supportsFreezer(dirPath)
-	if supportedErr != nil && desired != 0 {
-		// can ignore error if desired == 1
-		return errors.Wrap(supportedErr, "freezer not supported")
+
+	if err := fscommon.WriteFile(dirPath, "cgroup.freeze", stateStr); err != nil {
+		return err
 	}
-	return freezeWithInt(dirPath, desired)
+	// Confirm that the cgroup did actually change states.
+	if actualState, err := getFreezer(dirPath); err != nil {
+		return err
+	} else if actualState != state {
+		return errors.Errorf(`expected "cgroup.freeze" to be in state %q but was in %q`, state, actualState)
+	}
+	return nil
 }

 func supportsFreezer(dirPath string) error {
@@ -36,18 +53,22 @@ func supportsFreezer(dirPath string) error {
 	return err
 }

-// freeze writes desired int to "cgroup.freeze".
-func freezeWithInt(dirPath string, desired int) error {
-	desiredS := strconv.Itoa(desired)
-	if err := fscommon.WriteFile(dirPath, "cgroup.freeze", desiredS); err != nil {
-		return err
-	}
-	got, err := fscommon.ReadFile(dirPath, "cgroup.freeze")
+func getFreezer(dirPath string) (configs.FreezerState, error) {
+	state, err := fscommon.ReadFile(dirPath, "cgroup.freeze")
 	if err != nil {
-		return err
+		// If the kernel is too old, then we just treat the freezer as being in
+		// an "undefined" state.
+		if os.IsNotExist(err) || stdErrors.Is(err, unix.ENODEV) {
+			err = nil
+		}
+		return configs.Undefined, err
 	}
-	if gotS := strings.TrimSpace(string(got)); gotS != desiredS {
-		return errors.Errorf("expected \"cgroup.freeze\" in %q to be %q, got %q", dirPath, desiredS, gotS)
+	switch strings.TrimSpace(state) {
+	case "0":
+		return configs.Thawed, nil
+	case "1":
+		return configs.Frozen, nil
+	default:
+		return configs.Undefined, errors.Errorf(`unknown "cgroup.freeze" state: %q`, state)
 	}
-	return nil
 }
--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/fs2.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/fs2.go
@@ -8,64 +8,12 @@ import (
 	"path/filepath"
 	"strings"

-	securejoin "github.com/cyphar/filepath-securejoin"
 	"github.com/opencontainers/runc/libcontainer/cgroups"
 	"github.com/opencontainers/runc/libcontainer/configs"
 	"github.com/pkg/errors"
+	"golang.org/x/sys/unix"
 )

-// NewManager creates a manager for cgroup v2 unified hierarchy.
-// dirPath is like "/sys/fs/cgroup/user.slice/user-1001.slice/session-1.scope".
-// If dirPath is empty, it is automatically set using config.
-func NewManager(config *configs.Cgroup, dirPath string, rootless bool) (cgroups.Manager, error) {
-	if config == nil {
-		config = &configs.Cgroup{}
-	}
-	if dirPath != "" {
-		if filepath.Clean(dirPath) != dirPath || !filepath.IsAbs(dirPath) {
-			return nil, errors.Errorf("invalid dir path %q", dirPath)
-		}
-	} else {
-		var err error
-		dirPath, err = defaultDirPath(config)
-		if err != nil {
-			return nil, err
-		}
-	}
-	controllers, err := detectControllers(dirPath)
-	if err != nil && !rootless {
-		return nil, err
-	}
-
-	m := &manager{
-		config:      config,
-		dirPath:     dirPath,
-		controllers: controllers,
-		rootless:    rootless,
-	}
-	return m, nil
-}
-
-func detectControllers(dirPath string) (map[string]struct{}, error) {
-	if err := os.MkdirAll(dirPath, 0755); err != nil {
-		return nil, err
-	}
-	controllersPath, err := securejoin.SecureJoin(dirPath, "cgroup.controllers")
-	if err != nil {
-		return nil, err
-	}
-	controllersData, err := ioutil.ReadFile(controllersPath)
-	if err != nil {
-		return nil, err
-	}
-	controllersFields := strings.Fields(string(controllersData))
-	controllers := make(map[string]struct{}, len(controllersFields))
-	for _, c := range controllersFields {
-		controllers[c] = struct{}{}
-	}
-	return controllers, nil
-}
-
 type manager struct {
 	config *configs.Cgroup
 	// dirPath is like "/sys/fs/cgroup/user.slice/user-1001.slice/session-1.scope"
@@ -76,8 +24,68 @@ type manager struct {
 	rootless    bool
 }

+// NewManager creates a manager for cgroup v2 unified hierarchy.
+// dirPath is like "/sys/fs/cgroup/user.slice/user-1001.slice/session-1.scope".
+// If dirPath is empty, it is automatically set using config.
+func NewManager(config *configs.Cgroup, dirPath string, rootless bool) (cgroups.Manager, error) {
+	if config == nil {
+		config = &configs.Cgroup{}
+	}
+	if dirPath == "" {
+		var err error
+		dirPath, err = defaultDirPath(config)
+		if err != nil {
+			return nil, err
+		}
+	}
+
+	m := &manager{
+		config:   config,
+		dirPath:  dirPath,
+		rootless: rootless,
+	}
+	return m, nil
+}
+
+func (m *manager) getControllers() error {
+	if m.controllers != nil {
+		return nil
+	}
+
+	file := filepath.Join(m.dirPath, "cgroup.controllers")
+	data, err := ioutil.ReadFile(file)
+	if err != nil {
+		if m.rootless && m.config.Path == "" {
+			return nil
+		}
+		return err
+	}
+	fields := strings.Fields(string(data))
+	m.controllers = make(map[string]struct{}, len(fields))
+	for _, c := range fields {
+		m.controllers[c] = struct{}{}
+	}
+
+	return nil
+}
+
 func (m *manager) Apply(pid int) error {
-	if err := cgroups.WriteCgroupProc(m.dirPath, pid); err != nil && !m.rootless {
+	if err := CreateCgroupPath(m.dirPath, m.config); err != nil {
+		// Related tests:
+		// - "runc create (no limits + no cgrouppath + no permission) succeeds"
+		// - "runc create (rootless + no limits + cgrouppath + no permission) fails with permission error"
+		// - "runc create (rootless + limits + no cgrouppath + no permission) fails with informative error"
+		if m.rootless {
+			if m.config.Path == "" {
+				if blNeed, nErr := needAnyControllers(m.config); nErr == nil && !blNeed {
+					return nil
+				}
+				return errors.Wrap(err, "rootless needs no limits + no cgrouppath when no permission is granted for cgroups")
+			}
+		}
+		return err
+	}
+	if err := cgroups.WriteCgroupProc(m.dirPath, pid); err != nil {
 		return err
 	}
 	return nil
@@ -93,41 +101,52 @@ func (m *manager) GetAllPids() ([]int, error) {

 func (m *manager) GetStats() (*cgroups.Stats, error) {
 	var (
-		st   cgroups.Stats
 		errs []error
 	)
+
+	st := cgroups.NewStats()
+	if err := m.getControllers(); err != nil {
+		return st, err
+	}
+
 	// pids (since kernel 4.5)
 	if _, ok := m.controllers["pids"]; ok {
-		if err := statPids(m.dirPath, &st); err != nil {
+		if err := statPids(m.dirPath, st); err != nil {
 			errs = append(errs, err)
 		}
 	} else {
-		if err := statPidsWithoutController(m.dirPath, &st); err != nil {
+		if err := statPidsWithoutController(m.dirPath, st); err != nil {
 			errs = append(errs, err)
 		}
 	}
-	// memory (since kenrel 4.5)
+	// memory (since kernel 4.5)
 	if _, ok := m.controllers["memory"]; ok {
-		if err := statMemory(m.dirPath, &st); err != nil {
+		if err := statMemory(m.dirPath, st); err != nil {
 			errs = append(errs, err)
 		}
 	}
 	// io (since kernel 4.5)
 	if _, ok := m.controllers["io"]; ok {
-		if err := statIo(m.dirPath, &st); err != nil {
+		if err := statIo(m.dirPath, st); err != nil {
 			errs = append(errs, err)
 		}
 	}
 	// cpu (since kernel 4.15)
 	if _, ok := m.controllers["cpu"]; ok {
-		if err := statCpu(m.dirPath, &st); err != nil {
+		if err := statCpu(m.dirPath, st); err != nil {
+			errs = append(errs, err)
+		}
+	}
+	// hugetlb (since kernel 5.6)
+	if _, ok := m.controllers["hugetlb"]; ok {
+		if err := statHugeTlb(m.dirPath, st); err != nil {
 			errs = append(errs, err)
 		}
 	}
 	if len(errs) > 0 && !m.rootless {
-		return &st, errors.Errorf("error while statting cgroup v2: %+v", errs)
+		return st, errors.Errorf("error while statting cgroup v2: %+v", errs)
 	}
-	return &st, nil
+	return st, nil
 }

 func (m *manager) Freeze(state configs.FreezerState) error {
@@ -138,77 +157,112 @@ func (m *manager) Freeze(state configs.FreezerState) error {
 	return nil
 }

+func rmdir(path string) error {
+	err := unix.Rmdir(path)
+	if err == nil || err == unix.ENOENT {
+		return nil
+	}
+	return &os.PathError{Op: "rmdir", Path: path, Err: err}
+}
+
+// removeCgroupPath removes the cgroup at path recursively, because it may
+// contain sub-cgroups that have to be removed before path itself.
+func removeCgroupPath(path string) error {
+	// try the fast path first
+	if err := rmdir(path); err == nil {
+		return nil
+	}
+
+	infos, err := ioutil.ReadDir(path)
+	if err != nil {
+		if os.IsNotExist(err) {
+			err = nil
+		}
+		return err
+	}
+	for _, info := range infos {
+		if info.IsDir() {
+			// We should remove subcgroups dir first
+			if err = removeCgroupPath(filepath.Join(path, info.Name())); err != nil {
+				break
+			}
+		}
+	}
+	if err == nil {
+		err = rmdir(path)
+	}
+	return err
+}
+
 func (m *manager) Destroy() error {
-	return os.RemoveAll(m.dirPath)
+	return removeCgroupPath(m.dirPath)
 }

-// GetPaths is for compatibility purpose and should be removed in future
-func (m *manager) GetPaths() map[string]string {
-	paths := map[string]string{
-		// pseudo-controller for compatibility
-		"devices": m.dirPath,
-		"freezer": m.dirPath,
-	}
-	for c := range m.controllers {
-		paths[c] = m.dirPath
-	}
-	return paths
-}
-
-func (m *manager) GetUnifiedPath() (string, error) {
-	return m.dirPath, nil
+func (m *manager) Path(_ string) string {
+	return m.dirPath
 }

 func (m *manager) Set(container *configs.Config) error {
 	if container == nil || container.Cgroups == nil {
 		return nil
 	}
-	var errs []error
+	if err := m.getControllers(); err != nil {
+		return err
+	}
 	// pids (since kernel 4.5)
-	if _, ok := m.controllers["pids"]; ok {
-		if err := setPids(m.dirPath, container.Cgroups); err != nil {
-			errs = append(errs, err)
-		}
+	if err := setPids(m.dirPath, container.Cgroups); err != nil {
+		return err
 	}
 	// memory (since kernel 4.5)
-	if _, ok := m.controllers["memory"]; ok {
-		if err := setMemory(m.dirPath, container.Cgroups); err != nil {
-			errs = append(errs, err)
-		}
+	if err := setMemory(m.dirPath, container.Cgroups); err != nil {
+		return err
 	}
 	// io (since kernel 4.5)
-	if _, ok := m.controllers["io"]; ok {
-		if err := setIo(m.dirPath, container.Cgroups); err != nil {
-			errs = append(errs, err)
-		}
+	if err := setIo(m.dirPath, container.Cgroups); err != nil {
+		return err
 	}
 	// cpu (since kernel 4.15)
-	if _, ok := m.controllers["cpu"]; ok {
-		if err := setCpu(m.dirPath, container.Cgroups); err != nil {
-			errs = append(errs, err)
-		}
+	if err := setCpu(m.dirPath, container.Cgroups); err != nil {
+		return err
 	}
 	// devices (since kernel 4.15, pseudo-controller)
-	if err := setDevices(m.dirPath, container.Cgroups); err != nil {
-		errs = append(errs, err)
+	//
+	// When m.rootless is true, errors from the device subsystem are ignored because it is really not expected to work.
+	// However, errors from other subsystems are not ignored.
+	// see @test "runc create (rootless + limits + no cgrouppath + no permission) fails with informative error"
+	if err := setDevices(m.dirPath, container.Cgroups); err != nil && !m.rootless {
+		return err
 	}
 	// cpuset (since kernel 5.0)
-	if _, ok := m.controllers["cpuset"]; ok {
-		if err := setCpuset(m.dirPath, container.Cgroups); err != nil {
-			errs = append(errs, err)
-		}
+	if err := setCpuset(m.dirPath, container.Cgroups); err != nil {
+		return err
+	}
+	// hugetlb (since kernel 5.6)
+	if err := setHugeTlb(m.dirPath, container.Cgroups); err != nil {
+		return err
 	}
 	// freezer (since kernel 5.2, pseudo-controller)
 	if err := setFreezer(m.dirPath, container.Cgroups.Freezer); err != nil {
-		errs = append(errs, err)
-	}
-	if len(errs) > 0 && !m.rootless {
-		return errors.Errorf("error while setting cgroup v2: %+v", errs)
+		return err
 	}
 	m.config = container.Cgroups
 	return nil
 }

+func (m *manager) GetPaths() map[string]string {
+	paths := make(map[string]string, 1)
+	paths[""] = m.dirPath
+	return paths
+}
+
 func (m *manager) GetCgroups() (*configs.Cgroup, error) {
 	return m.config, nil
 }
+
+func (m *manager) GetFreezerState() (configs.FreezerState, error) {
+	return getFreezer(m.dirPath)
+}
+
+func (m *manager) Exists() bool {
+	return cgroups.PathExists(m.dirPath)
+}
--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/hugetlb.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/hugetlb.go
@@ -0,0 +1,66 @@
+// +build linux
+
+package fs2
+
+import (
+	"io/ioutil"
+	"path/filepath"
+	"strconv"
+	"strings"
+
+	"github.com/pkg/errors"
+
+	"github.com/opencontainers/runc/libcontainer/cgroups"
+	"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
+	"github.com/opencontainers/runc/libcontainer/configs"
+)
+
+func isHugeTlbSet(cgroup *configs.Cgroup) bool {
+	return len(cgroup.Resources.HugetlbLimit) > 0
+}
+
+func setHugeTlb(dirPath string, cgroup *configs.Cgroup) error {
+	if !isHugeTlbSet(cgroup) {
+		return nil
+	}
+	for _, hugetlb := range cgroup.Resources.HugetlbLimit {
+		if err := fscommon.WriteFile(dirPath, strings.Join([]string{"hugetlb", hugetlb.Pagesize, "max"}, "."), strconv.FormatUint(hugetlb.Limit, 10)); err != nil {
+			return err
+		}
+	}
+
+	return nil
+}
+
+func statHugeTlb(dirPath string, stats *cgroups.Stats) error {
+	hugePageSizes, err := cgroups.GetHugePageSize()
+	if err != nil {
+		return errors.Wrap(err, "failed to fetch hugetlb info")
+	}
+	hugetlbStats := cgroups.HugetlbStats{}
+
+	for _, pagesize := range hugePageSizes {
+		usage := strings.Join([]string{"hugetlb", pagesize, "current"}, ".")
+		value, err := fscommon.GetCgroupParamUint(dirPath, usage)
+		if err != nil {
+			return errors.Wrapf(err, "failed to parse hugetlb.%s.current file", pagesize)
+		}
+		hugetlbStats.Usage = value
+
+		fileName := strings.Join([]string{"hugetlb", pagesize, "events"}, ".")
+		filePath := filepath.Join(dirPath, fileName)
+		contents, err := ioutil.ReadFile(filePath)
+		if err != nil {
+			return errors.Wrapf(err, "failed to parse hugetlb.%s.events file", pagesize)
+		}
+		_, value, err = fscommon.GetCgroupParamKeyValue(string(contents))
+		if err != nil {
+			return errors.Wrapf(err, "failed to parse hugetlb.%s.events file", pagesize)
+		}
+		hugetlbStats.Failcnt = value
+
+		stats.HugetlbStats[pagesize] = hugetlbStats
+	}
+
+	return nil
+}
--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/io.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/io.go
@@ -14,14 +14,26 @@ import (
 	"github.com/opencontainers/runc/libcontainer/configs"
 )

+func isIoSet(cgroup *configs.Cgroup) bool {
+	return cgroup.Resources.BlkioWeight != 0 ||
+		len(cgroup.Resources.BlkioThrottleReadBpsDevice) > 0 ||
+		len(cgroup.Resources.BlkioThrottleWriteBpsDevice) > 0 ||
+		len(cgroup.Resources.BlkioThrottleReadIOPSDevice) > 0 ||
+		len(cgroup.Resources.BlkioThrottleWriteIOPSDevice) > 0
+}
+
 func setIo(dirPath string, cgroup *configs.Cgroup) error {
+	if !isIoSet(cgroup) {
+		return nil
+	}
+
 	if cgroup.Resources.BlkioWeight != 0 {
 		filename := "io.bfq.weight"
-		if err := fscommon.WriteFile(dirPath, filename, strconv.FormatUint(uint64(cgroup.Resources.BlkioWeight), 10)); err != nil {
+		if err := fscommon.WriteFile(dirPath, filename,
+			strconv.FormatUint(cgroups.ConvertBlkIOToCgroupV2Value(cgroup.Resources.BlkioWeight), 10)); err != nil {
 			return err
 		}
 	}
-
 	for _, td := range cgroup.Resources.BlkioThrottleReadBpsDevice {
 		if err := fscommon.WriteFile(dirPath, "io.max", td.StringName("rbps")); err != nil {
 			return err
@@ -81,11 +93,11 @@ func statIo(dirPath string, stats *cgroups.Stats) error {
 		if len(d) != 2 {
 			continue
 		}
-		minor, err := strconv.ParseUint(d[0], 10, 0)
+		major, err := strconv.ParseUint(d[0], 10, 0)
 		if err != nil {
 			return err
 		}
-		major, err := strconv.ParseUint(d[1], 10, 0)
+		minor, err := strconv.ParseUint(d[1], 10, 0)
 		if err != nil {
 			return err
 		}
--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/memory.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/memory.go
@@ -15,22 +15,58 @@ import (
 	"github.com/pkg/errors"
 )

+// numToStr converts an int64 value to a string for writing to a
+// cgroupv2 file with a .min, .max, .low, or .high suffix.
+// The value of -1 is converted to "max" for cgroupv1 compatibility
+// (which used to write -1 to remove the limit).
+func numToStr(value int64) (ret string) {
+	switch {
+	case value == 0:
+		ret = ""
+	case value == -1:
+		ret = "max"
+	default:
+		ret = strconv.FormatInt(value, 10)
+	}
+
+	return ret
+}
+
+func isMemorySet(cgroup *configs.Cgroup) bool {
+	return cgroup.Resources.MemoryReservation != 0 ||
+		cgroup.Resources.Memory != 0 || cgroup.Resources.MemorySwap != 0
+}
+
 func setMemory(dirPath string, cgroup *configs.Cgroup) error {
-	if cgroup.Resources.MemorySwap != 0 {
-		if err := fscommon.WriteFile(dirPath, "memory.swap.max", strconv.FormatInt(cgroup.Resources.MemorySwap, 10)); err != nil {
+	if !isMemorySet(cgroup) {
+		return nil
+	}
+	swap, err := cgroups.ConvertMemorySwapToCgroupV2Value(cgroup.Resources.MemorySwap, cgroup.Resources.Memory)
+	if err != nil {
+		return err
+	}
+	swapStr := numToStr(swap)
+	if swapStr == "" && swap == 0 && cgroup.Resources.MemorySwap > 0 {
+		// memory and memorySwap set to the same value -- disable swap
+		swapStr = "0"
+	}
+	// never write empty string to `memory.swap.max`, it means set to 0.
+	if swapStr != "" {
+		if err := fscommon.WriteFile(dirPath, "memory.swap.max", swapStr); err != nil {
 			return err
 		}
 	}
-	if cgroup.Resources.Memory != 0 {
-		if err := fscommon.WriteFile(dirPath, "memory.max", strconv.FormatInt(cgroup.Resources.Memory, 10)); err != nil {
+
+	if val := numToStr(cgroup.Resources.Memory); val != "" {
+		if err := fscommon.WriteFile(dirPath, "memory.max", val); err != nil {
 			return err
 		}
 	}

 	// cgroup.Resources.KernelMemory is ignored

-	if cgroup.Resources.MemoryReservation != 0 {
-		if err := fscommon.WriteFile(dirPath, "memory.low", strconv.FormatInt(cgroup.Resources.MemoryReservation, 10)); err != nil {
+	if val := numToStr(cgroup.Resources.MemoryReservation); val != "" {
+		if err := fscommon.WriteFile(dirPath, "memory.low", val); err != nil {
 			return err
 		}
 	}
--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/pids.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2/pids.go
@@ -4,9 +4,7 @@ package fs2

 import (
 	"io/ioutil"
-	"os"
 	"path/filepath"
-	"strconv"
 	"strings"

 	"github.com/opencontainers/runc/libcontainer/cgroups"
@@ -16,16 +14,16 @@ import (
 	"golang.org/x/sys/unix"
 )

+func isPidsSet(cgroup *configs.Cgroup) bool {
+	return cgroup.Resources.PidsLimit != 0
+}
+
 func setPids(dirPath string, cgroup *configs.Cgroup) error {
-	if cgroup.Resources.PidsLimit != 0 {
-		// "max" is the fallback value.
-		limit := "max"
-
-		if cgroup.Resources.PidsLimit > 0 {
-			limit = strconv.FormatInt(cgroup.Resources.PidsLimit, 10)
-		}
-
-		if err := fscommon.WriteFile(dirPath, "pids.max", limit); err != nil {
+	if !isPidsSet(cgroup) {
+		return nil
+	}
+	if val := numToStr(cgroup.Resources.PidsLimit); val != "" {
+		if err := fscommon.WriteFile(dirPath, "pids.max", val); err != nil {
 			return err
 		}
 	}
@@ -33,20 +31,11 @@ func setPids(dirPath string, cgroup *configs.Cgroup) error {
 	return nil
 }

-func isNOTSUP(err error) bool {
-	switch err := err.(type) {
-	case *os.PathError:
-		return err.Err == unix.ENOTSUP
-	default:
-		return false
-	}
-}
-
 func statPidsWithoutController(dirPath string, stats *cgroups.Stats) error {
 	// if the controller is not enabled, let's read PIDS from cgroups.procs
 	// (or threads if cgroup.threads is enabled)
 	contents, err := ioutil.ReadFile(filepath.Join(dirPath, "cgroup.procs"))
-	if err != nil && isNOTSUP(err) {
+	if errors.Is(err, unix.ENOTSUP) {
 		contents, err = ioutil.ReadFile(filepath.Join(dirPath, "cgroup.threads"))
 	}
 	if err != nil {
--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fscommon/BUILD
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fscommon/BUILD
@@ -13,10 +13,14 @@ go_library(
        "@io_bazel_rules_go//go/platform:android": [
            "//vendor/github.com/cyphar/filepath-securejoin:go_default_library",
            "//vendor/github.com/pkg/errors:go_default_library",
+            "//vendor/github.com/sirupsen/logrus:go_default_library",
+            "//vendor/golang.org/x/sys/unix:go_default_library",
        ],
        "@io_bazel_rules_go//go/platform:linux": [
            "//vendor/github.com/cyphar/filepath-securejoin:go_default_library",
            "//vendor/github.com/pkg/errors:go_default_library",
+            "//vendor/github.com/sirupsen/logrus:go_default_library",
+            "//vendor/golang.org/x/sys/unix:go_default_library",
        ],
        "//conditions:default": [],
    }),
--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fscommon/fscommon.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fscommon/fscommon.go
@@ -4,9 +4,12 @@ package fscommon

 import (
 	"io/ioutil"
+	"os"

 	securejoin "github.com/cyphar/filepath-securejoin"
 	"github.com/pkg/errors"
+	"github.com/sirupsen/logrus"
+	"golang.org/x/sys/unix"
 )

 func WriteFile(dir, file, data string) error {
@@ -17,7 +20,7 @@ func WriteFile(dir, file, data string) error {
 	if err != nil {
 		return err
 	}
-	if err := ioutil.WriteFile(path, []byte(data), 0700); err != nil {
+	if err := retryingWriteFile(path, []byte(data), 0700); err != nil {
 		return errors.Wrapf(err, "failed to write %q to %q", data, path)
 	}
 	return nil
@@ -34,3 +37,14 @@ func ReadFile(dir, file string) (string, error) {
 	data, err := ioutil.ReadFile(path)
 	return string(data), err
 }
+
+func retryingWriteFile(filename string, data []byte, perm os.FileMode) error {
+	for {
+		err := ioutil.WriteFile(filename, data, perm)
+		if errors.Is(err, unix.EINTR) {
+			logrus.Infof("interrupted while writing %s to %s", string(data), filename)
+			continue
+		}
+		return err
+	}
+}
--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/stats.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/stats.go
@@ -20,6 +20,12 @@ type CpuUsage struct {
 	// Total CPU time consumed per core.
 	// Units: nanoseconds.
 	PercpuUsage []uint64 `json:"percpu_usage,omitempty"`
+	// CPU time consumed per core in kernel mode
+	// Units: nanoseconds.
+	PercpuUsageInKernelmode []uint64 `json:"percpu_usage_in_kernelmode"`
+	// CPU time consumed per core in user mode
+	// Units: nanoseconds.
+	PercpuUsageInUsermode []uint64 `json:"percpu_usage_in_usermode"`
 	// Time spent by tasks of the cgroup in kernel mode.
 	// Units: nanoseconds.
 	UsageInKernelmode uint64 `json:"usage_in_kernelmode"`
@@ -51,12 +57,33 @@ type MemoryStats struct {
 	KernelUsage MemoryData `json:"kernel_usage,omitempty"`
 	// usage of kernel TCP memory
 	KernelTCPUsage MemoryData `json:"kernel_tcp_usage,omitempty"`
+	// usage of memory pages by NUMA node
+	// see chapter 5.6 of memory controller documentation
+	PageUsageByNUMA PageUsageByNUMA `json:"page_usage_by_numa,omitempty"`
 	// if true, memory usage is accounted for throughout a hierarchy of cgroups.
 	UseHierarchy bool `json:"use_hierarchy"`

 	Stats map[string]uint64 `json:"stats,omitempty"`
 }

+type PageUsageByNUMA struct {
+	// Embedding is used as types can't be recursive.
+	PageUsageByNUMAInner
+	Hierarchical PageUsageByNUMAInner `json:"hierarchical,omitempty"`
+}
+
+type PageUsageByNUMAInner struct {
+	Total       PageStats `json:"total,omitempty"`
+	File        PageStats `json:"file,omitempty"`
+	Anon        PageStats `json:"anon,omitempty"`
+	Unevictable PageStats `json:"unevictable,omitempty"`
+}
+
+type PageStats struct {
+	Total uint64           `json:"total,omitempty"`
+	Nodes map[uint8]uint64 `json:"nodes,omitempty"`
+}
+
 type PidsStats struct {
 	// number of pids in the cgroup
 	Current uint64 `json:"current,omitempty"`
--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/BUILD
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/BUILD
@@ -3,73 +3,66 @@ load("@io_bazel_rules_go//go:def.bzl", "go_library")
 go_library(
    name = "go_default_library",
    srcs = [
-        "apply_nosystemd.go",
-        "apply_systemd.go",
-        "unified_hierarchy.go",
+        "common.go",
+        "unsupported.go",
+        "user.go",
+        "v1.go",
+        "v2.go",
    ],
    importmap = "k8s.io/kubernetes/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd",
    importpath = "github.com/opencontainers/runc/libcontainer/cgroups/systemd",
    visibility = ["//visibility:public"],
-    deps = select({
+    deps = [
+        "//vendor/github.com/coreos/go-systemd/v22/dbus:go_default_library",
+        "//vendor/github.com/godbus/dbus/v5:go_default_library",
+        "//vendor/github.com/opencontainers/runc/libcontainer/cgroups/devices:go_default_library",
+        "//vendor/github.com/opencontainers/runc/libcontainer/configs:go_default_library",
+        "//vendor/github.com/pkg/errors:go_default_library",
+        "//vendor/github.com/sirupsen/logrus:go_default_library",
+    ] + select({
        "@io_bazel_rules_go//go/platform:android": [
-            "//vendor/github.com/coreos/go-systemd/dbus:go_default_library",
-            "//vendor/github.com/godbus/dbus:go_default_library",
+            "//vendor/github.com/cyphar/filepath-securejoin:go_default_library",
            "//vendor/github.com/opencontainers/runc/libcontainer/cgroups:go_default_library",
            "//vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs:go_default_library",
            "//vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2:go_default_library",
-            "//vendor/github.com/opencontainers/runc/libcontainer/configs:go_default_library",
-            "//vendor/github.com/pkg/errors:go_default_library",
-            "//vendor/github.com/sirupsen/logrus:go_default_library",
+            "//vendor/github.com/opencontainers/runc/libcontainer/system:go_default_library",
        ],
        "@io_bazel_rules_go//go/platform:darwin": [
            "//vendor/github.com/opencontainers/runc/libcontainer/cgroups:go_default_library",
-            "//vendor/github.com/opencontainers/runc/libcontainer/configs:go_default_library",
        ],
        "@io_bazel_rules_go//go/platform:dragonfly": [
            "//vendor/github.com/opencontainers/runc/libcontainer/cgroups:go_default_library",
-            "//vendor/github.com/opencontainers/runc/libcontainer/configs:go_default_library",
        ],
        "@io_bazel_rules_go//go/platform:freebsd": [
            "//vendor/github.com/opencontainers/runc/libcontainer/cgroups:go_default_library",
-            "//vendor/github.com/opencontainers/runc/libcontainer/configs:go_default_library",
        ],
        "@io_bazel_rules_go//go/platform:ios": [
            "//vendor/github.com/opencontainers/runc/libcontainer/cgroups:go_default_library",
-            "//vendor/github.com/opencontainers/runc/libcontainer/configs:go_default_library",
        ],
        "@io_bazel_rules_go//go/platform:linux": [
-            "//vendor/github.com/coreos/go-systemd/dbus:go_default_library",
-            "//vendor/github.com/godbus/dbus:go_default_library",
+            "//vendor/github.com/cyphar/filepath-securejoin:go_default_library",
            "//vendor/github.com/opencontainers/runc/libcontainer/cgroups:go_default_library",
            "//vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs:go_default_library",
            "//vendor/github.com/opencontainers/runc/libcontainer/cgroups/fs2:go_default_library",
-            "//vendor/github.com/opencontainers/runc/libcontainer/configs:go_default_library",
-            "//vendor/github.com/pkg/errors:go_default_library",
-            "//vendor/github.com/sirupsen/logrus:go_default_library",
+            "//vendor/github.com/opencontainers/runc/libcontainer/system:go_default_library",
        ],
        "@io_bazel_rules_go//go/platform:nacl": [
            "//vendor/github.com/opencontainers/runc/libcontainer/cgroups:go_default_library",
-            "//vendor/github.com/opencontainers/runc/libcontainer/configs:go_default_library",
        ],
        "@io_bazel_rules_go//go/platform:netbsd": [
            "//vendor/github.com/opencontainers/runc/libcontainer/cgroups:go_default_library",
-            "//vendor/github.com/opencontainers/runc/libcontainer/configs:go_default_library",
        ],
        "@io_bazel_rules_go//go/platform:openbsd": [
            "//vendor/github.com/opencontainers/runc/libcontainer/cgroups:go_default_library",
-            "//vendor/github.com/opencontainers/runc/libcontainer/configs:go_default_library",
        ],
        "@io_bazel_rules_go//go/platform:plan9": [
            "//vendor/github.com/opencontainers/runc/libcontainer/cgroups:go_default_library",
-            "//vendor/github.com/opencontainers/runc/libcontainer/configs:go_default_library",
        ],
        "@io_bazel_rules_go//go/platform:solaris": [
            "//vendor/github.com/opencontainers/runc/libcontainer/cgroups:go_default_library",
-            "//vendor/github.com/opencontainers/runc/libcontainer/configs:go_default_library",
        ],
        "@io_bazel_rules_go//go/platform:windows": [
            "//vendor/github.com/opencontainers/runc/libcontainer/cgroups:go_default_library",
-            "//vendor/github.com/opencontainers/runc/libcontainer/configs:go_default_library",
        ],
        "//conditions:default": [],
    }),
--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/common.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/common.go
@@ -0,0 +1,424 @@
+package systemd
+
+import (
+	"bufio"
+	"fmt"
+	"math"
+	"os"
+	"regexp"
+	"strconv"
+	"strings"
+	"sync"
+	"time"
+
+	systemdDbus "github.com/coreos/go-systemd/v22/dbus"
+	dbus "github.com/godbus/dbus/v5"
+	"github.com/opencontainers/runc/libcontainer/cgroups/devices"
+	"github.com/opencontainers/runc/libcontainer/configs"
+	"github.com/pkg/errors"
+	"github.com/sirupsen/logrus"
+)
+
+var (
+	connOnce sync.Once         // makes getDbusConnection connect at most once
+	connDbus *systemdDbus.Conn // connection cached by getDbusConnection
+	connErr  error             // connection error cached by getDbusConnection
+
+	versionOnce sync.Once // makes systemdVersion query systemd at most once
+	version     int       // version cached by systemdVersion
+	versionErr  error     // error cached by systemdVersion
+)
+
+// IsRunningSystemd reports whether the host was booted with systemd as its
+// init system. Like systemd's `sd_booted(3)`, it checks that
+// /run/systemd/system exists and is a directory (symlinks are not followed).
+// See http://www.freedesktop.org/software/systemd/man/sd_booted.html
+//
+// NOTE: This function comes from package github.com/coreos/go-systemd/util;
+// it was borrowed here to avoid a dependency on cgo.
+func IsRunningSystemd() bool {
+	info, statErr := os.Lstat("/run/systemd/system")
+	if statErr == nil {
+		return info.IsDir()
+	}
+	return false
+}
+
+// ExpandSlice converts a systemd slice unit name into its cgroup path.
+// systemd represents slice hierarchy using `-`, so we need to follow suit when
+// generating the path of slice. Essentially, test-a-b.slice becomes
+// /test.slice/test-a.slice/test-a-b.slice, while -.slice (the root slice)
+// becomes /. An error is returned for names that do not end in ".slice", are
+// just ".slice", contain "/", or have an empty component (test--a.slice).
+func ExpandSlice(slice string) (string, error) {
+	const suffix = ".slice"
+	// Name has to end with ".slice", but can't be just ".slice": the length
+	// check rejects the bare suffix up front rather than leaving it to the
+	// empty-component check in the loop below.
+	// Path-separators are not allowed either.
+	if len(slice) <= len(suffix) || !strings.HasSuffix(slice, suffix) || strings.Contains(slice, "/") {
+		return "", fmt.Errorf("invalid slice name: %s", slice)
+	}
+
+	sliceName := strings.TrimSuffix(slice, suffix)
+	// if input was -.slice, we should just return root now
+	if sliceName == "-" {
+		return "/", nil
+	}
+	var path, prefix string
+	for _, component := range strings.Split(sliceName, "-") {
+		// test--a.slice isn't permitted, nor is -test.slice.
+		if component == "" {
+			return "", fmt.Errorf("invalid slice name: %s", slice)
+		}
+		// Append the component to the path and to the prefix.
+		path += "/" + prefix + component + suffix
+		prefix += component + "-"
+	}
+	return path, nil
+}
+
+// groupPrefix returns the device group name prefix for a block or char device.
+func groupPrefix(ruleType configs.DeviceType) (string, error) {
+	if ruleType == configs.BlockDevice {
+		return "block-", nil
+	}
+	if ruleType == configs.CharDevice {
+		return "char-", nil
+	}
+	return "", errors.Errorf("device type %v has no group prefix", ruleType)
+}
+
+// findDeviceGroup tries to find the device group name (as listed in
+// /proc/devices) with the type prefixed as required for DeviceAllow, for a
+// given (type, major) combination. The first matching entry is returned; if
+// there is no match, it returns an empty string and a nil error.
+func findDeviceGroup(ruleType configs.DeviceType, ruleMajor int64) (string, error) {
+	fh, err := os.Open("/proc/devices")
+	if err != nil {
+		return "", err
+	}
+	defer fh.Close()
+
+	prefix, err := groupPrefix(ruleType)
+	if err != nil {
+		return "", err
+	}
+
+	scanner := bufio.NewScanner(fh)
+	var currentType configs.DeviceType
+	for scanner.Scan() {
+		// We need to strip spaces because the first number is column-aligned.
+		line := strings.TrimSpace(scanner.Text())
+
+		// Handle the "header" lines.
+		switch line {
+		case "Block devices:":
+			currentType = configs.BlockDevice
+			continue
+		case "Character devices:":
+			currentType = configs.CharDevice
+			continue
+		case "":
+			continue
+		}
+
+		// Skip lines unrelated to our type.
+		if currentType != ruleType {
+			continue
+		}
+
+		// Parse out the (major, name).
+		var (
+			currMajor int64
+			currName  string
+		)
+		if n, err := fmt.Sscanf(line, "%d %s", &currMajor, &currName); err != nil || n != 2 {
+			if err == nil {
+				err = errors.Errorf("wrong number of fields")
+			}
+			return "", errors.Wrapf(err, "scan /proc/devices line %q", line)
+		}
+
+		if currMajor == ruleMajor {
+			return prefix + currName, nil
+		}
+	}
+	if err := scanner.Err(); err != nil {
+		return "", errors.Wrap(err, "reading /proc/devices")
+	}
+	// Couldn't find the device group.
+	return "", nil
+}
+
+// generateDeviceProperties takes the configured device rules and generates a
+// corresponding set of systemd properties to configure the devices correctly.
+func generateDeviceProperties(rules []*configs.DeviceRule) ([]systemdDbus.Property, error) {
+	// DeviceAllow is the type "a(ss)" which means we need a temporary struct
+	// to represent it in Go.
+	type deviceAllowEntry struct {
+		Path  string
+		Perms string
+	}
+
+	properties := []systemdDbus.Property{
+		// Always run in the strictest white-list mode.
+		newProp("DevicePolicy", "strict"),
+		// Empty the DeviceAllow array before filling it.
+		newProp("DeviceAllow", []deviceAllowEntry{}),
+	}
+
+	// Figure out the set of rules.
+	configEmu := &devices.Emulator{}
+	for _, rule := range rules {
+		if err := configEmu.Apply(*rule); err != nil {
+			return nil, errors.Wrap(err, "apply rule for systemd")
+		}
+	}
+	// systemd doesn't support blacklists. So we log a warning, and tell
+	// systemd to act as a deny-all whitelist. This ruleset will be replaced
+	// with our normal fallback code. This may result in spurious errors, but
+	// the only other option is to error out here.
+	if configEmu.IsBlacklist() {
+		// However, if we're dealing with an allow-all rule then we can do it.
+		if configEmu.IsAllowAll() {
+			return []systemdDbus.Property{
+				// Run in white-list mode by setting to "auto" and removing all
+				// DeviceAllow rules.
+				newProp("DevicePolicy", "auto"),
+				newProp("DeviceAllow", []deviceAllowEntry{}),
+			}, nil
+		}
+		logrus.Warn("systemd doesn't support blacklist device rules -- applying temporary deny-all rule")
+		return properties, nil
+	}
+
+	// Now generate the set of rules we actually need to apply. Unlike the
+	// normal devices cgroup, in "strict" mode systemd defaults to a deny-all
+	// whitelist which is the default for devices.Emulator.
+	baseEmu := &devices.Emulator{}
+	finalRules, err := baseEmu.Transition(configEmu)
+	if err != nil {
+		return nil, errors.Wrap(err, "get simplified rules for systemd")
+	}
+	var deviceAllowList []deviceAllowEntry
+	for _, rule := range finalRules {
+		if !rule.Allow {
+			// Should never happen.
+			return nil, errors.Errorf("[internal error] cannot add deny rule to systemd DeviceAllow list: %v", *rule)
+		}
+		switch rule.Type {
+		case configs.BlockDevice, configs.CharDevice:
+		default:
+			// Should never happen.
+			return nil, errors.Errorf("invalid device type for DeviceAllow: %v", rule.Type)
+		}
+
+		entry := deviceAllowEntry{
+			Perms: string(rule.Permissions),
+		}
+
+		// systemd has a fairly odd (though understandable) syntax here, and
+		// because of the OCI configuration format we have to do quite a bit of
+		// trickery to convert things:
+		//
+		//  * Concrete rules with non-wildcard major/minor numbers have to use
+		//    /dev/{block,char} paths. This is slightly odd because it means
+		//    that we cannot add whitelist rules for devices that don't exist,
+		//    but there's not too much we can do about that.
+		//
+		//    However, path globbing is not supported for path-based rules so we
+		//    need to handle wildcards in some other manner.
+		//
+		//  * Wildcard-minor rules have to specify a "device group name" (the
+		//    second column in /proc/devices).
+		//
+		//  * Wildcard (major and minor) rules can just specify a glob with the
+		//    type ("char-*" or "block-*").
+		//
+		// The only type of rule we can't handle is wildcard-major rules, and
+		// so we'll give a warning in that case (note that the fallback code
+		// will insert any rules systemd couldn't handle). What amazing fun.
+
+		if rule.Major == configs.Wildcard {
+			// "_ *:n _" rules aren't supported by systemd.
+			if rule.Minor != configs.Wildcard {
+				logrus.Warnf("systemd doesn't support '*:n' device rules -- temporarily ignoring rule: %v", *rule)
+				continue
+			}
+
+			// "_ *:* _" rules just wildcard everything.
+			prefix, err := groupPrefix(rule.Type)
+			if err != nil {
+				return nil, err
+			}
+			entry.Path = prefix + "*"
+		} else if rule.Minor == configs.Wildcard {
+			// "_ n:* _" rules require a device group from /proc/devices.
+			group, err := findDeviceGroup(rule.Type, rule.Major)
+			if err != nil {
+				return nil, errors.Wrapf(err, "find device '%v/%d'", rule.Type, rule.Major)
+			}
+			if group == "" {
+				// Couldn't find a group.
+				logrus.Warnf("could not find device group for '%v/%d' in /proc/devices -- temporarily ignoring rule: %v", rule.Type, rule.Major, *rule)
+				continue
+			}
+			entry.Path = group
+		} else {
+			// "_ n:m _" rules are just a path in /dev/{block,char}/.
+			switch rule.Type {
+			case configs.BlockDevice:
+				entry.Path = fmt.Sprintf("/dev/block/%d:%d", rule.Major, rule.Minor)
+			case configs.CharDevice:
+				entry.Path = fmt.Sprintf("/dev/char/%d:%d", rule.Major, rule.Minor)
+			}
+		}
+		deviceAllowList = append(deviceAllowList, entry)
+	}
+
+	properties = append(properties, newProp("DeviceAllow", deviceAllowList))
+	return properties, nil
+}
+
+// getDbusConnection lazily dials systemd over dbus exactly once and caches the
+// result, error included; rootless is only honoured by the first call.
+func getDbusConnection(rootless bool) (*systemdDbus.Conn, error) {
+	connOnce.Do(func() {
+		dial := systemdDbus.New
+		if rootless {
+			dial = NewUserSystemdDbus
+		}
+		connDbus, connErr = dial()
+	})
+	return connDbus, connErr
+}
+
+// newProp builds a systemd dbus property called name whose value is units
+// wrapped in a dbus variant.
+func newProp(name string, units interface{}) systemdDbus.Property {
+	variant := dbus.MakeVariant(units)
+	return systemdDbus.Property{Name: name, Value: variant}
+}
+
+// getUnitName returns c.Name for an explicit slice, else a default scope name.
+func getUnitName(c *configs.Cgroup) string {
+	if strings.HasSuffix(c.Name, ".slice") {
+		return c.Name
+	}
+	return fmt.Sprintf("%s-%s.scope", c.ScopePrefix, c.Name)
+}
+
+// isUnitExists returns true if the error is that a systemd unit already exists.
+func isUnitExists(err error) bool {
+	dbusError, ok := err.(dbus.Error)
+	if !ok {
+		// Covers nil as well as errors that did not come from dbus.
+		return false
+	}
+	return strings.Contains(dbusError.Name, "org.freedesktop.systemd1.UnitExists")
+}
+
+// startUnit starts transient unit unitName; an already existing unit is not an error.
+func startUnit(dbusConnection *systemdDbus.Conn, unitName string, properties []systemdDbus.Property) error {
+	statusChan := make(chan string, 1)
+	if _, err := dbusConnection.StartTransientUnit(unitName, "replace", properties, statusChan); err == nil {
+		select {
+		case s := <-statusChan:
+			close(statusChan)
+			// Any result other than "done" is a failure; see https://godoc.org/github.com/coreos/go-systemd/dbus#Conn.StartUnit
+			if s != "done" {
+				dbusConnection.ResetFailedUnit(unitName)
+				return errors.Errorf("error creating systemd unit `%s`: got `%s`", unitName, s)
+			}
+		case <-time.After(time.Second):
+			logrus.Warnf("Timed out while waiting for StartTransientUnit(%s) completion signal from dbus. Continuing...", unitName)
+		}
+	} else if !isUnitExists(err) {
+		return err
+	}
+	return nil
+}
+
+func stopUnit(dbusConnection *systemdDbus.Conn, unitName string) error {
+	statusChan := make(chan string, 1)
+	if _, err := dbusConnection.StopUnit(unitName, "replace", statusChan); err == nil {
+		select {
+		case s := <-statusChan:
+			close(statusChan)
+			// A result other than "done" is only logged, never returned; see https://godoc.org/github.com/coreos/go-systemd/dbus#Conn.StartUnit
+			if s != "done" {
+				logrus.Warnf("error removing unit `%s`: got `%s`. Continuing...", unitName, s)
+			}
+		case <-time.After(time.Second):
+			logrus.Warnf("Timed out while waiting for StopUnit(%s) completion signal from dbus. Continuing...", unitName)
+		}
+	}
+	return nil
+}
+
+// systemdVersion queries the manager's "Version" property once and caches the
+// parsed result (or the error); later calls ignore conn.
+func systemdVersion(conn *systemdDbus.Conn) (int, error) {
+	versionOnce.Do(func() {
+		version = -1
+		verStr, err := conn.GetManagerProperty("Version")
+		if err != nil {
+			versionErr = err
+			return
+		}
+		version, versionErr = systemdVersionAtoi(verStr)
+	})
+
+	return version, versionErr
+}
+
+// systemdVersionAtoi extracts the major version number from verStr.
+//
+// verStr should be of the form:
+// "v245.4-1.fc32", "245", "v245-1.fc32", "245-1.fc32"
+// (all the input strings include quotes), and the output int should be 245:
+// an optional `"` and `v` are skipped and the first run of digits is parsed.
+func systemdVersionAtoi(verStr string) (int, error) {
+	matches := regexp.MustCompile(`"?v?([0-9]+)`).FindStringSubmatch(verStr)
+	if len(matches) < 2 {
+		return 0, errors.Errorf("can't parse version %s: incorrect number of matches %v", verStr, matches)
+	}
+	ver, err := strconv.Atoi(matches[1])
+	return ver, errors.Wrapf(err, "can't parse version %s", verStr)
+}
+
+// addCpuQuota appends to *properties the systemd equivalents of a CFS quota
+// and period; it adds nothing when both quota and period are zero.
+func addCpuQuota(conn *systemdDbus.Conn, properties *[]systemdDbus.Property, quota int64, period uint64) {
+	if quota == 0 && period == 0 {
+		return
+	}
+	if period != 0 {
+		// systemd only supports CPUQuotaPeriodUSec since v242
+		if sdVer, err := systemdVersion(conn); err != nil {
+			logrus.Warnf("systemdVersion: %s", err)
+		} else if sdVer >= 242 {
+			*properties = append(*properties, newProp("CPUQuotaPeriodUSec", period))
+		}
+	}
+	// corresponds to USEC_INFINITY in systemd
+	cpuQuotaPerSecUSec := uint64(math.MaxUint64)
+	if quota > 0 {
+		if period == 0 {
+			// assume the default kernel value of 100000 us (100 ms), same for v1 and v2.
+			// v1: https://www.kernel.org/doc/html/latest/scheduler/sched-bwc.html and
+			// v2: https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html
+			period = 100000
+		}
+		// systemd converts CPUQuotaPerSecUSec (microseconds per CPU second) to CPUQuota
+		// (integer percentage of CPU) internally.  This means that if a fractional percent of
+		// CPU is indicated by Resources.CpuQuota, we need to round up to the nearest
+		// 10ms (1% of a second) such that child cgroups can set the cpu.cfs_quota_us they expect.
+		cpuQuotaPerSecUSec = uint64(quota*1000000) / period
+		if remainder := cpuQuotaPerSecUSec % 10000; remainder != 0 {
+			cpuQuotaPerSecUSec += 10000 - remainder
+		}
+	}
+	*properties = append(*properties, newProp("CPUQuotaPerSecUSec", cpuQuotaPerSecUSec))
+}
--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/unified_hierarchy.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/unified_hierarchy.go
@@ -1,312 +0,0 @@
-// +build linux
-
-package systemd
-
-import (
-	"fmt"
-	"io/ioutil"
-	"math"
-	"os"
-	"path/filepath"
-	"strings"
-	"sync"
-	"time"
-
-	systemdDbus "github.com/coreos/go-systemd/dbus"
-	"github.com/opencontainers/runc/libcontainer/cgroups"
-	"github.com/opencontainers/runc/libcontainer/cgroups/fs2"
-	"github.com/opencontainers/runc/libcontainer/configs"
-	"github.com/pkg/errors"
-	"github.com/sirupsen/logrus"
-)
-
-type UnifiedManager struct {
-	mu      sync.Mutex
-	Cgroups *configs.Cgroup
-	Paths   map[string]string
-}
-
-func (m *UnifiedManager) Apply(pid int) error {
-	var (
-		c          = m.Cgroups
-		unitName   = getUnitName(c)
-		slice      = "system.slice"
-		properties []systemdDbus.Property
-	)
-
-	if c.Paths != nil {
-		paths := make(map[string]string)
-		for name, path := range c.Paths {
-			_, err := getSubsystemPath(m.Cgroups, name)
-			if err != nil {
-				// Don't fail if a cgroup hierarchy was not found, just skip this subsystem
-				if cgroups.IsNotFound(err) {
-					continue
-				}
-				return err
-			}
-			paths[name] = path
-		}
-		m.Paths = paths
-		return cgroups.EnterPid(m.Paths, pid)
-	}
-
-	if c.Parent != "" {
-		slice = c.Parent
-	}
-
-	properties = append(properties, systemdDbus.PropDescription("libcontainer container "+c.Name))
-
-	// if we create a slice, the parent is defined via a Wants=
-	if strings.HasSuffix(unitName, ".slice") {
-		properties = append(properties, systemdDbus.PropWants(slice))
-	} else {
-		// otherwise, we use Slice=
-		properties = append(properties, systemdDbus.PropSlice(slice))
-	}
-
-	// only add pid if its valid, -1 is used w/ general slice creation.
-	if pid != -1 {
-		properties = append(properties, newProp("PIDs", []uint32{uint32(pid)}))
-	}
-
-	// Check if we can delegate. This is only supported on systemd versions 218 and above.
-	if !strings.HasSuffix(unitName, ".slice") {
-		// Assume scopes always support delegation.
-		properties = append(properties, newProp("Delegate", true))
-	}
-
-	// Always enable accounting, this gets us the same behaviour as the fs implementation,
-	// plus the kernel has some problems with joining the memory cgroup at a later time.
-	properties = append(properties,
-		newProp("MemoryAccounting", true),
-		newProp("CPUAccounting", true),
-		newProp("BlockIOAccounting", true))
-
-	// Assume DefaultDependencies= will always work (the check for it was previously broken.)
-	properties = append(properties,
-		newProp("DefaultDependencies", false))
-
-	if c.Resources.Memory != 0 {
-		properties = append(properties,
-			newProp("MemoryLimit", uint64(c.Resources.Memory)))
-	}
-
-	if c.Resources.CpuShares != 0 {
-		properties = append(properties,
-			newProp("CPUShares", c.Resources.CpuShares))
-	}
-
-	// cpu.cfs_quota_us and cpu.cfs_period_us are controlled by systemd.
-	if c.Resources.CpuQuota != 0 && c.Resources.CpuPeriod != 0 {
-		// corresponds to USEC_INFINITY in systemd
-		// if USEC_INFINITY is provided, CPUQuota is left unbound by systemd
-		// always setting a property value ensures we can apply a quota and remove it later
-		cpuQuotaPerSecUSec := uint64(math.MaxUint64)
-		if c.Resources.CpuQuota > 0 {
-			// systemd converts CPUQuotaPerSecUSec (microseconds per CPU second) to CPUQuota
-			// (integer percentage of CPU) internally.  This means that if a fractional percent of
-			// CPU is indicated by Resources.CpuQuota, we need to round up to the nearest
-			// 10ms (1% of a second) such that child cgroups can set the cpu.cfs_quota_us they expect.
-			cpuQuotaPerSecUSec = uint64(c.Resources.CpuQuota*1000000) / c.Resources.CpuPeriod
-			if cpuQuotaPerSecUSec%10000 != 0 {
-				cpuQuotaPerSecUSec = ((cpuQuotaPerSecUSec / 10000) + 1) * 10000
-			}
-		}
-		properties = append(properties,
-			newProp("CPUQuotaPerSecUSec", cpuQuotaPerSecUSec))
-	}
-
-	if c.Resources.BlkioWeight != 0 {
-		properties = append(properties,
-			newProp("BlockIOWeight", uint64(c.Resources.BlkioWeight)))
-	}
-
-	if c.Resources.PidsLimit > 0 {
-		properties = append(properties,
-			newProp("TasksAccounting", true),
-			newProp("TasksMax", uint64(c.Resources.PidsLimit)))
-	}
-
-	// We have to set kernel memory here, as we can't change it once
-	// processes have been attached to the cgroup.
-	if c.Resources.KernelMemory != 0 {
-		if err := setKernelMemory(c); err != nil {
-			return err
-		}
-	}
-
-	statusChan := make(chan string, 1)
-	if _, err := theConn.StartTransientUnit(unitName, "replace", properties, statusChan); err == nil {
-		select {
-		case <-statusChan:
-		case <-time.After(time.Second):
-			logrus.Warnf("Timed out while waiting for StartTransientUnit(%s) completion signal from dbus. Continuing...", unitName)
-		}
-	} else if !isUnitExists(err) {
-		return err
-	}
-
-	if err := joinCgroupsV2(c, pid); err != nil {
-		return err
-	}
-
-	path, err := getSubsystemPath(m.Cgroups, "")
-	if err != nil {
-		return err
-	}
-	m.Paths = map[string]string{
-		"pids":    path,
-		"memory":  path,
-		"io":      path,
-		"cpu":     path,
-		"devices": path,
-		"cpuset":  path,
-		"freezer": path,
-	}
-	return nil
-}
-
-func (m *UnifiedManager) Destroy() error {
-	if m.Cgroups.Paths != nil {
-		return nil
-	}
-	m.mu.Lock()
-	defer m.mu.Unlock()
-	theConn.StopUnit(getUnitName(m.Cgroups), "replace", nil)
-	if err := cgroups.RemovePaths(m.Paths); err != nil {
-		return err
-	}
-	m.Paths = make(map[string]string)
-	return nil
-}
-
-func (m *UnifiedManager) GetPaths() map[string]string {
-	m.mu.Lock()
-	paths := m.Paths
-	m.mu.Unlock()
-	return paths
-}
-func (m *UnifiedManager) GetUnifiedPath() (string, error) {
-	unifiedPath := ""
-	m.mu.Lock()
-	defer m.mu.Unlock()
-	for k, v := range m.Paths {
-		if unifiedPath == "" {
-			unifiedPath = v
-		} else if v != unifiedPath {
-			return unifiedPath,
-				errors.Errorf("expected %q path to be unified path %q, got %q", k, unifiedPath, v)
-		}
-	}
-	if unifiedPath == "" {
-		// FIXME: unified path could be detected even when no controller is available
-		return unifiedPath, errors.New("cannot detect unified path")
-	}
-	return unifiedPath, nil
-}
-func createCgroupsv2Path(path string) (Err error) {
-	content, err := ioutil.ReadFile("/sys/fs/cgroup/cgroup.controllers")
-	if err != nil {
-		return err
-	}
-	if !filepath.HasPrefix(path, "/sys/fs/cgroup") {
-		return fmt.Errorf("invalid cgroup path %s", path)
-	}
-
-	res := ""
-	for i, c := range strings.Split(strings.TrimSpace(string(content)), " ") {
-		if i == 0 {
-			res = fmt.Sprintf("+%s", c)
-		} else {
-			res = res + fmt.Sprintf(" +%s", c)
-		}
-	}
-	resByte := []byte(res)
-
-	current := "/sys/fs"
-	elements := strings.Split(path, "/")
-	for i, e := range elements[3:] {
-		current = filepath.Join(current, e)
-		if i > 0 {
-			if err := os.Mkdir(current, 0755); err != nil {
-				if !os.IsExist(err) {
-					return err
-				}
-			} else {
-				// If the directory was created, be sure it is not left around on errors.
-				defer func() {
-					if Err != nil {
-						os.Remove(current)
-					}
-				}()
-			}
-		}
-		if i < len(elements[3:])-1 {
-			if err := ioutil.WriteFile(filepath.Join(current, "cgroup.subtree_control"), resByte, 0755); err != nil {
-				return err
-			}
-		}
-	}
-	return nil
-}
-
-func joinCgroupsV2(c *configs.Cgroup, pid int) error {
-	path, err := getSubsystemPath(c, "memory")
-	if err != nil {
-		return err
-	}
-	return createCgroupsv2Path(path)
-}
-
-func (m *UnifiedManager) fsManager() (cgroups.Manager, error) {
-	path, err := m.GetUnifiedPath()
-	if err != nil {
-		return nil, err
-	}
-	return fs2.NewManager(m.Cgroups, path, false)
-}
-
-func (m *UnifiedManager) Freeze(state configs.FreezerState) error {
-	fsMgr, err := m.fsManager()
-	if err != nil {
-		return err
-	}
-	return fsMgr.Freeze(state)
-}
-
-func (m *UnifiedManager) GetPids() ([]int, error) {
-	path, err := m.GetUnifiedPath()
-	if err != nil {
-		return nil, err
-	}
-	return cgroups.GetPids(path)
-}
-
-func (m *UnifiedManager) GetAllPids() ([]int, error) {
-	path, err := m.GetUnifiedPath()
-	if err != nil {
-		return nil, err
-	}
-	return cgroups.GetAllPids(path)
-}
-
-func (m *UnifiedManager) GetStats() (*cgroups.Stats, error) {
-	fsMgr, err := m.fsManager()
-	if err != nil {
-		return nil, err
-	}
-	return fsMgr.GetStats()
-}
-
-func (m *UnifiedManager) Set(container *configs.Config) error {
-	fsMgr, err := m.fsManager()
-	if err != nil {
-		return err
-	}
-	return fsMgr.Set(container)
-}
-
-func (m *UnifiedManager) GetCgroups() (*configs.Cgroup, error) {
-	return m.Cgroups, nil
-}
--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/apply_nosystemd.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/apply_nosystemd.go
@@ -3,7 +3,7 @@
 package systemd

 import (
-	"fmt"
+	"errors"

 	"github.com/opencontainers/runc/libcontainer/cgroups"
 	"github.com/opencontainers/runc/libcontainer/configs"
@@ -14,54 +14,58 @@ type Manager struct {
 	Paths   map[string]string
 }

-func UseSystemd() bool {
+func IsRunningSystemd() bool {
 	return false
 }

 func NewSystemdCgroupsManager() (func(config *configs.Cgroup, paths map[string]string) cgroups.Manager, error) {
-	return nil, fmt.Errorf("Systemd not supported")
+	return nil, errors.New("Systemd not supported")
 }

 func (m *Manager) Apply(pid int) error {
-	return fmt.Errorf("Systemd not supported")
+	return errors.New("Systemd not supported")
 }

 func (m *Manager) GetPids() ([]int, error) {
-	return nil, fmt.Errorf("Systemd not supported")
+	return nil, errors.New("Systemd not supported")
 }

 func (m *Manager) GetAllPids() ([]int, error) {
-	return nil, fmt.Errorf("Systemd not supported")
+	return nil, errors.New("Systemd not supported")
 }

 func (m *Manager) Destroy() error {
-	return fmt.Errorf("Systemd not supported")
+	return errors.New("Systemd not supported")
 }

 func (m *Manager) GetPaths() map[string]string {
 	return nil
 }

-func (m *Manager) GetUnifiedPath() (string, error) {
-	return "", fmt.Errorf("Systemd not supported")
+func (m *Manager) Path(_ string) string {
+	return ""
 }

 func (m *Manager) GetStats() (*cgroups.Stats, error) {
-	return nil, fmt.Errorf("Systemd not supported")
+	return nil, errors.New("Systemd not supported")
 }

 func (m *Manager) Set(container *configs.Config) error {
-	return fmt.Errorf("Systemd not supported")
+	return errors.New("Systemd not supported")
 }

 func (m *Manager) Freeze(state configs.FreezerState) error {
-	return fmt.Errorf("Systemd not supported")
+	return errors.New("Systemd not supported")
 }

 func Freeze(c *configs.Cgroup, state configs.FreezerState) error {
-	return fmt.Errorf("Systemd not supported")
+	return errors.New("Systemd not supported")
 }

 func (m *Manager) GetCgroups() (*configs.Cgroup, error) {
-	return nil, fmt.Errorf("Systemd not supported")
+	return nil, errors.New("Systemd not supported")
+}
+
+func (m *Manager) Exists() bool {
+	return false
 }
--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/user.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/user.go
@@ -0,0 +1,106 @@
+// +build linux
+
+package systemd
+
+import (
+	"bufio"
+	"bytes"
+	"os"
+	"os/exec"
+	"path/filepath"
+	"strconv"
+	"strings"
+
+	systemdDbus "github.com/coreos/go-systemd/v22/dbus"
+	dbus "github.com/godbus/dbus/v5"
+	"github.com/opencontainers/runc/libcontainer/system"
+	"github.com/pkg/errors"
+)
+
+// NewUserSystemdDbus creates a connection for systemd user-instance, using the
+// session bus address and UID found by the Detect* helpers in this package.
+func NewUserSystemdDbus() (*systemdDbus.Conn, error) {
+	addr, err := DetectUserDbusSessionBusAddress()
+	if err != nil {
+		return nil, err
+	}
+	uid, err := DetectUID()
+	if err != nil {
+		return nil, err
+	}
+
+	// Dial, authenticate as uid and say Hello; close the connection on failure.
+	return systemdDbus.NewConnection(func() (*dbus.Conn, error) {
+		conn, err := dbus.Dial(addr)
+		if err != nil {
+			return nil, errors.Wrapf(err, "error while dialing %q", addr)
+		}
+		if err = conn.Auth([]dbus.Auth{dbus.AuthExternal(strconv.Itoa(uid))}); err != nil {
+			conn.Close()
+			return nil, errors.Wrapf(err, "error while authenticating connection, address=%q, UID=%d", addr, uid)
+		}
+		if err = conn.Hello(); err != nil {
+			conn.Close()
+			return nil, errors.Wrapf(err, "error while sending Hello message, address=%q, UID=%d", addr, uid)
+		}
+		return conn, nil
+	})
+}
+
+// DetectUID detects UID from the OwnerUID field of `busctl --user status`
+// if running in userNS (the value corresponds to sd_bus_creds_get_owner_uid(3));
+// otherwise it returns os.Getuid(). On failure it returns -1 and an error that,
+// when busctl itself fails, carries the command's combined output.
+func DetectUID() (int, error) {
+	if !system.RunningInUserNS() {
+		return os.Getuid(), nil
+	}
+	b, err := exec.Command("busctl", "--user", "--no-pager", "status").CombinedOutput()
+	if err != nil {
+		return -1, errors.Wrapf(err, "could not execute `busctl --user --no-pager status`, output=%q", string(b))
+	}
+	scanner := bufio.NewScanner(bytes.NewReader(b))
+	for scanner.Scan() {
+		s := strings.TrimSpace(scanner.Text())
+		if strings.HasPrefix(s, "OwnerUID=") {
+			uidStr := strings.TrimPrefix(s, "OwnerUID=")
+			i, err := strconv.Atoi(uidStr)
+			if err != nil {
+				return -1, errors.Wrapf(err, "could not detect the OwnerUID: %s", s)
+			}
+			return i, nil
+		}
+	}
+	if err := scanner.Err(); err != nil {
+		return -1, err
+	}
+	return -1, errors.New("could not detect the OwnerUID")
+}
+
+// DetectUserDbusSessionBusAddress locates the user session bus. In order, it
+// tries $DBUS_SESSION_BUS_ADDRESS, then "unix:path=$XDG_RUNTIME_DIR/bus" (if
+// that path exists), then the output of `systemctl --user show-environment`.
+func DetectUserDbusSessionBusAddress() (string, error) {
+	const key = "DBUS_SESSION_BUS_ADDRESS"
+	if addr := os.Getenv(key); addr != "" {
+		return addr, nil
+	}
+	if runtimeDir := os.Getenv("XDG_RUNTIME_DIR"); runtimeDir != "" {
+		socket := filepath.Join(runtimeDir, "bus")
+		if _, err := os.Stat(socket); err == nil {
+			return "unix:path=" + socket, nil
+		}
+	}
+	out, err := exec.Command("systemctl", "--user", "--no-pager", "show-environment").CombinedOutput()
+	if err != nil {
+		return "", errors.Wrapf(err, "could not execute `systemctl --user --no-pager show-environment`, output=%q", string(out))
+	}
+	lines := bufio.NewScanner(bytes.NewReader(out))
+	for lines.Scan() {
+		line := strings.TrimSpace(lines.Text())
+		if strings.HasPrefix(line, key+"=") {
+			return strings.TrimPrefix(line, key+"="), nil
+		}
+	}
+	return "", errors.New("could not detect DBUS_SESSION_BUS_ADDRESS from `systemctl --user --no-pager show-environment`")
+}
--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/apply_systemd.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/apply_systemd.go
@@ -4,27 +4,30 @@ package systemd

 import (
 	"errors"
-	"fmt"
 	"io/ioutil"
-	"math"
 	"os"
 	"path/filepath"
 	"strings"
 	"sync"
-	"time"

-	systemdDbus "github.com/coreos/go-systemd/dbus"
-	"github.com/godbus/dbus"
+	systemdDbus "github.com/coreos/go-systemd/v22/dbus"
 	"github.com/opencontainers/runc/libcontainer/cgroups"
 	"github.com/opencontainers/runc/libcontainer/cgroups/fs"
 	"github.com/opencontainers/runc/libcontainer/configs"
 	"github.com/sirupsen/logrus"
 )

-type LegacyManager struct {
+type legacyManager struct {
 	mu      sync.Mutex
-	Cgroups *configs.Cgroup
-	Paths   map[string]string
+	cgroups *configs.Cgroup
+	paths   map[string]string
+}
+
+func NewLegacyManager(cg *configs.Cgroup, paths map[string]string) cgroups.Manager {
+	return &legacyManager{
+		cgroups: cg,
+		paths:   paths,
+	}
 }

 type subsystem interface {
@@ -65,88 +68,56 @@ var legacySubsystems = subsystemSet{
 	&fs.NameGroup{GroupName: "name=systemd"},
 }

-const (
-	testScopeWait = 4
-	testSliceWait = 4
-)
+func genV1ResourcesProperties(c *configs.Cgroup, conn *systemdDbus.Conn) ([]systemdDbus.Property, error) {
+	var properties []systemdDbus.Property
+	r := c.Resources

-var (
-	connLock sync.Mutex
-	theConn  *systemdDbus.Conn
-)
-
-func newProp(name string, units interface{}) systemdDbus.Property {
-	return systemdDbus.Property{
-		Name:  name,
-		Value: dbus.MakeVariant(units),
-	}
-}
-
-// NOTE: This function comes from package github.com/coreos/go-systemd/util
-// It was borrowed here to avoid a dependency on cgo.
-//
-// IsRunningSystemd checks whether the host was booted with systemd as its init
-// system. This functions similarly to systemd's `sd_booted(3)`: internally, it
-// checks whether /run/systemd/system/ exists and is a directory.
-// http://www.freedesktop.org/software/systemd/man/sd_booted.html
-func isRunningSystemd() bool {
-	fi, err := os.Lstat("/run/systemd/system")
+	deviceProperties, err := generateDeviceProperties(r.Devices)
 	if err != nil {
-		return false
+		return nil, err
 	}
-	return fi.IsDir()
+	properties = append(properties, deviceProperties...)
+
+	if r.Memory != 0 {
+		properties = append(properties,
+			newProp("MemoryLimit", uint64(r.Memory)))
+	}
+
+	if r.CpuShares != 0 {
+		properties = append(properties,
+			newProp("CPUShares", r.CpuShares))
+	}
+
+	addCpuQuota(conn, &properties, r.CpuQuota, r.CpuPeriod)
+
+	if r.BlkioWeight != 0 {
+		properties = append(properties,
+			newProp("BlockIOWeight", uint64(r.BlkioWeight)))
+	}
+
+	if r.PidsLimit > 0 || r.PidsLimit == -1 {
+		properties = append(properties,
+			newProp("TasksAccounting", true),
+			newProp("TasksMax", uint64(r.PidsLimit)))
+	}
+
+	return properties, nil
 }

-func UseSystemd() bool {
-	if !isRunningSystemd() {
-		return false
-	}
-
-	connLock.Lock()
-	defer connLock.Unlock()
-
-	if theConn == nil {
-		var err error
-		theConn, err = systemdDbus.New()
-		if err != nil {
-			return false
-		}
-	}
-	return true
-}
-
-func NewSystemdCgroupsManager() (func(config *configs.Cgroup, paths map[string]string) cgroups.Manager, error) {
-	if !isRunningSystemd() {
-		return nil, fmt.Errorf("systemd not running on this host, can't use systemd as a cgroups.Manager")
-	}
-	if cgroups.IsCgroup2UnifiedMode() {
-		return func(config *configs.Cgroup, paths map[string]string) cgroups.Manager {
-			return &UnifiedManager{
-				Cgroups: config,
-				Paths:   paths,
-			}
-		}, nil
-	}
-	return func(config *configs.Cgroup, paths map[string]string) cgroups.Manager {
-		return &LegacyManager{
-			Cgroups: config,
-			Paths:   paths,
-		}
-	}, nil
-}
-
-func (m *LegacyManager) Apply(pid int) error {
+func (m *legacyManager) Apply(pid int) error {
 	var (
-		c          = m.Cgroups
+		c          = m.cgroups
 		unitName   = getUnitName(c)
 		slice      = "system.slice"
 		properties []systemdDbus.Property
 	)

+	m.mu.Lock()
+	defer m.mu.Unlock()
 	if c.Paths != nil {
 		paths := make(map[string]string)
 		for name, path := range c.Paths {
-			_, err := getSubsystemPath(m.Cgroups, name)
+			_, err := getSubsystemPath(m.cgroups, name)
 			if err != nil {
 				// Don't fail if a cgroup hierarchy was not found, just skip this subsystem
 				if cgroups.IsNotFound(err) {
@@ -156,8 +127,8 @@ func (m *LegacyManager) Apply(pid int) error {
 			}
 			paths[name] = path
 		}
-		m.Paths = paths
-		return cgroups.EnterPid(m.Paths, pid)
+		m.paths = paths
+		return cgroups.EnterPid(m.paths, pid)
 	}

 	if c.Parent != "" {
@@ -196,63 +167,26 @@ func (m *LegacyManager) Apply(pid int) error {
 	properties = append(properties,
 		newProp("DefaultDependencies", false))

-	if c.Resources.Memory != 0 {
-		properties = append(properties,
-			newProp("MemoryLimit", uint64(c.Resources.Memory)))
+	dbusConnection, err := getDbusConnection(false)
+	if err != nil {
+		return err
 	}
-
-	if c.Resources.CpuShares != 0 {
-		properties = append(properties,
-			newProp("CPUShares", c.Resources.CpuShares))
-	}
-
-	// cpu.cfs_quota_us and cpu.cfs_period_us are controlled by systemd.
-	if c.Resources.CpuQuota != 0 && c.Resources.CpuPeriod != 0 {
-		// corresponds to USEC_INFINITY in systemd
-		// if USEC_INFINITY is provided, CPUQuota is left unbound by systemd
-		// always setting a property value ensures we can apply a quota and remove it later
-		cpuQuotaPerSecUSec := uint64(math.MaxUint64)
-		if c.Resources.CpuQuota > 0 {
-			// systemd converts CPUQuotaPerSecUSec (microseconds per CPU second) to CPUQuota
-			// (integer percentage of CPU) internally.  This means that if a fractional percent of
-			// CPU is indicated by Resources.CpuQuota, we need to round up to the nearest
-			// 10ms (1% of a second) such that child cgroups can set the cpu.cfs_quota_us they expect.
-			cpuQuotaPerSecUSec = uint64(c.Resources.CpuQuota*1000000) / c.Resources.CpuPeriod
-			if cpuQuotaPerSecUSec%10000 != 0 {
-				cpuQuotaPerSecUSec = ((cpuQuotaPerSecUSec / 10000) + 1) * 10000
-			}
-		}
-		properties = append(properties,
-			newProp("CPUQuotaPerSecUSec", cpuQuotaPerSecUSec))
-	}
-
-	if c.Resources.BlkioWeight != 0 {
-		properties = append(properties,
-			newProp("BlockIOWeight", uint64(c.Resources.BlkioWeight)))
-	}
-
-	if c.Resources.PidsLimit > 0 {
-		properties = append(properties,
-			newProp("TasksAccounting", true),
-			newProp("TasksMax", uint64(c.Resources.PidsLimit)))
+	resourcesProperties, err := genV1ResourcesProperties(c, dbusConnection)
+	if err != nil {
+		return err
 	}
+	properties = append(properties, resourcesProperties...)
+	properties = append(properties, c.SystemdProps...)

 	// We have to set kernel memory here, as we can't change it once
 	// processes have been attached to the cgroup.
 	if c.Resources.KernelMemory != 0 {
-		if err := setKernelMemory(c); err != nil {
+		if err := enableKmem(c); err != nil {
 			return err
 		}
 	}

-	statusChan := make(chan string, 1)
-	if _, err := theConn.StartTransientUnit(unitName, "replace", properties, statusChan); err == nil {
-		select {
-		case <-statusChan:
-		case <-time.After(time.Second):
-			logrus.Warnf("Timed out while waiting for StartTransientUnit(%s) completion signal from dbus. Continuing...", unitName)
-		}
-	} else if !isUnitExists(err) {
+	if err := startUnit(dbusConnection, unitName, properties); err != nil {
 		return err
 	}

@@ -262,7 +196,7 @@ func (m *LegacyManager) Apply(pid int) error {

 	paths := make(map[string]string)
 	for _, s := range legacySubsystems {
-		subsystemPath, err := getSubsystemPath(m.Cgroups, s.Name())
+		subsystemPath, err := getSubsystemPath(m.cgroups, s.Name())
 		if err != nil {
 			// Don't fail if a cgroup hierarchy was not found, just skip this subsystem
 			if cgroups.IsNotFound(err) {
@@ -272,33 +206,33 @@ func (m *LegacyManager) Apply(pid int) error {
 		}
 		paths[s.Name()] = subsystemPath
 	}
-	m.Paths = paths
+	m.paths = paths
 	return nil
 }

-func (m *LegacyManager) Destroy() error {
-	if m.Cgroups.Paths != nil {
+func (m *legacyManager) Destroy() error {
+	if m.cgroups.Paths != nil {
 		return nil
 	}
 	m.mu.Lock()
 	defer m.mu.Unlock()
-	theConn.StopUnit(getUnitName(m.Cgroups), "replace", nil)
-	if err := cgroups.RemovePaths(m.Paths); err != nil {
+
+	dbusConnection, err := getDbusConnection(false)
+	if err != nil {
 		return err
 	}
-	m.Paths = make(map[string]string)
+	unitName := getUnitName(m.cgroups)
+	if err := stopUnit(dbusConnection, unitName); err != nil {
+		return err
+	}
+	m.paths = make(map[string]string)
 	return nil
 }

-func (m *LegacyManager) GetPaths() map[string]string {
+func (m *legacyManager) Path(subsys string) string {
 	m.mu.Lock()
-	paths := m.Paths
-	m.mu.Unlock()
-	return paths
-}
-
-func (m *LegacyManager) GetUnifiedPath() (string, error) {
-	return "", errors.New("unified path is only supported when running in unified mode")
+	defer m.mu.Unlock()
+	return m.paths[subsys]
 }

 func join(c *configs.Cgroup, subsystem string, pid int) (string, error) {
@@ -352,40 +286,6 @@ func joinCgroups(c *configs.Cgroup, pid int) error {
 	return nil
 }

-// systemd represents slice hierarchy using `-`, so we need to follow suit when
-// generating the path of slice. Essentially, test-a-b.slice becomes
-// /test.slice/test-a.slice/test-a-b.slice.
-func ExpandSlice(slice string) (string, error) {
-	suffix := ".slice"
-	// Name has to end with ".slice", but can't be just ".slice".
-	if len(slice) < len(suffix) || !strings.HasSuffix(slice, suffix) {
-		return "", fmt.Errorf("invalid slice name: %s", slice)
-	}
-
-	// Path-separators are not allowed.
-	if strings.Contains(slice, "/") {
-		return "", fmt.Errorf("invalid slice name: %s", slice)
-	}
-
-	var path, prefix string
-	sliceName := strings.TrimSuffix(slice, suffix)
-	// if input was -.slice, we should just return root now
-	if sliceName == "-" {
-		return "/", nil
-	}
-	for _, component := range strings.Split(sliceName, "-") {
-		// test--a.slice isn't permitted, nor is -test.slice.
-		if component == "" {
-			return "", fmt.Errorf("invalid slice name: %s", slice)
-		}
-
-		// Append the component to the path and to the prefix.
-		path += "/" + prefix + component + suffix
-		prefix += component + "-"
-	}
-	return path, nil
-}
-
 func getSubsystemPath(c *configs.Cgroup, subsystem string) (string, error) {
 	mountpoint, err := cgroups.FindCgroupMountpoint(c.Path, subsystem)
 	if err != nil {
@@ -412,46 +312,46 @@ func getSubsystemPath(c *configs.Cgroup, subsystem string) (string, error) {
 	return filepath.Join(mountpoint, initPath, slice, getUnitName(c)), nil
 }

-func (m *LegacyManager) Freeze(state configs.FreezerState) error {
-	path, err := getSubsystemPath(m.Cgroups, "freezer")
+func (m *legacyManager) Freeze(state configs.FreezerState) error {
+	path, err := getSubsystemPath(m.cgroups, "freezer")
 	if err != nil {
 		return err
 	}
-	prevState := m.Cgroups.Resources.Freezer
-	m.Cgroups.Resources.Freezer = state
+	prevState := m.cgroups.Resources.Freezer
+	m.cgroups.Resources.Freezer = state
 	freezer, err := legacySubsystems.Get("freezer")
 	if err != nil {
 		return err
 	}
-	err = freezer.Set(path, m.Cgroups)
+	err = freezer.Set(path, m.cgroups)
 	if err != nil {
-		m.Cgroups.Resources.Freezer = prevState
+		m.cgroups.Resources.Freezer = prevState
 		return err
 	}
 	return nil
 }

-func (m *LegacyManager) GetPids() ([]int, error) {
-	path, err := getSubsystemPath(m.Cgroups, "devices")
+func (m *legacyManager) GetPids() ([]int, error) {
+	path, err := getSubsystemPath(m.cgroups, "devices")
 	if err != nil {
 		return nil, err
 	}
 	return cgroups.GetPids(path)
 }

-func (m *LegacyManager) GetAllPids() ([]int, error) {
-	path, err := getSubsystemPath(m.Cgroups, "devices")
+func (m *legacyManager) GetAllPids() ([]int, error) {
+	path, err := getSubsystemPath(m.cgroups, "devices")
 	if err != nil {
 		return nil, err
 	}
 	return cgroups.GetAllPids(path)
 }

-func (m *LegacyManager) GetStats() (*cgroups.Stats, error) {
+func (m *legacyManager) GetStats() (*cgroups.Stats, error) {
 	m.mu.Lock()
 	defer m.mu.Unlock()
 	stats := cgroups.NewStats()
-	for name, path := range m.Paths {
+	for name, path := range m.paths {
 		sys, err := legacySubsystems.Get(name)
 		if err == errSubsystemDoesNotExist || !cgroups.PathExists(path) {
 			continue
@@ -464,41 +364,65 @@ func (m *LegacyManager) GetStats() (*cgroups.Stats, error) {
 	return stats, nil
 }

-func (m *LegacyManager) Set(container *configs.Config) error {
+func (m *legacyManager) Set(container *configs.Config) error {
 	// If Paths are set, then we are just joining cgroups paths
 	// and there is no need to set any values.
-	if m.Cgroups.Paths != nil {
+	if m.cgroups.Paths != nil {
 		return nil
 	}
+	dbusConnection, err := getDbusConnection(false)
+	if err != nil {
+		return err
+	}
+	properties, err := genV1ResourcesProperties(container.Cgroups, dbusConnection)
+	if err != nil {
+		return err
+	}
+
+	// Figure out the current freezer state, so we can revert to it after we
+	// temporarily freeze the container.
+	targetFreezerState, err := m.GetFreezerState()
+	if err != nil {
+		return err
+	}
+	if targetFreezerState == configs.Undefined {
+		targetFreezerState = configs.Thawed
+	}
+
+	// We have to freeze the container while systemd sets the cgroup settings.
+	// The reason for this is that systemd's application of DeviceAllow rules
+	// is done disruptively, resulting in spurious errors to common devices
+	// (unlike our fs driver, they will happily write deny-all rules to running
+	// containers). So we freeze the container to avoid them hitting the cgroup
+	// error. But if the freezer cgroup isn't supported, we just warn about it.
+	if err := m.Freeze(configs.Frozen); err != nil {
+		logrus.Infof("freeze container before SetUnitProperties failed: %v", err)
+	}
+
+	if err := dbusConnection.SetUnitProperties(getUnitName(container.Cgroups), true, properties...); err != nil {
+		_ = m.Freeze(targetFreezerState)
+		return err
+	}
+
+	// Reset freezer state before we apply the configuration, to avoid clashing
+	// with the freezer setting in the configuration.
+	_ = m.Freeze(targetFreezerState)
+
 	for _, sys := range legacySubsystems {
 		// Get the subsystem path, but don't error out for not found cgroups.
 		path, err := getSubsystemPath(container.Cgroups, sys.Name())
 		if err != nil && !cgroups.IsNotFound(err) {
 			return err
 		}
-
 		if err := sys.Set(path, container.Cgroups); err != nil {
 			return err
 		}
 	}

-	if m.Paths["cpu"] != "" {
-		if err := fs.CheckCpushares(m.Paths["cpu"], container.Cgroups.Resources.CpuShares); err != nil {
-			return err
-		}
-	}
 	return nil
 }

-func getUnitName(c *configs.Cgroup) string {
-	// by default, we create a scope unless the user explicitly asks for a slice.
-	if !strings.HasSuffix(c.Name, ".slice") {
-		return fmt.Sprintf("%s-%s.scope", c.ScopePrefix, c.Name)
-	}
-	return c.Name
-}
-
-func setKernelMemory(c *configs.Cgroup) error {
+func enableKmem(c *configs.Cgroup) error {
 	path, err := getSubsystemPath(c, "memory")
 	if err != nil && !cgroups.IsNotFound(err) {
 		return err
@@ -519,16 +443,28 @@ func setKernelMemory(c *configs.Cgroup) error {
 	return fs.EnableKernelMemoryAccounting(path)
 }

-// isUnitExists returns true if the error is that a systemd unit already exists.
-func isUnitExists(err error) bool {
-	if err != nil {
-		if dbusError, ok := err.(dbus.Error); ok {
-			return strings.Contains(dbusError.Name, "org.freedesktop.systemd1.UnitExists")
-		}
-	}
-	return false
+func (m *legacyManager) GetPaths() map[string]string {
+	m.mu.Lock()
+	defer m.mu.Unlock()
+	return m.paths
 }

-func (m *LegacyManager) GetCgroups() (*configs.Cgroup, error) {
-	return m.Cgroups, nil
+func (m *legacyManager) GetCgroups() (*configs.Cgroup, error) {
+	return m.cgroups, nil
+}
+
+func (m *legacyManager) GetFreezerState() (configs.FreezerState, error) {
+	path, err := getSubsystemPath(m.cgroups, "freezer")
+	if err != nil && !cgroups.IsNotFound(err) {
+		return configs.Undefined, err
+	}
+	freezer, err := legacySubsystems.Get("freezer")
+	if err != nil {
+		return configs.Undefined, err
+	}
+	return freezer.(*fs.FreezerGroup).GetState(path)
+}
+
+func (m *legacyManager) Exists() bool {
+	return cgroups.PathExists(m.Path("devices"))
 }
--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/v2.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/v2.go
@@ -0,0 +1,357 @@
+// +build linux
+
+package systemd
+
+import (
+	"os"
+	"path/filepath"
+	"strconv"
+	"strings"
+	"sync"
+
+	systemdDbus "github.com/coreos/go-systemd/v22/dbus"
+	securejoin "github.com/cyphar/filepath-securejoin"
+	"github.com/opencontainers/runc/libcontainer/cgroups"
+	"github.com/opencontainers/runc/libcontainer/cgroups/fs2"
+	"github.com/opencontainers/runc/libcontainer/configs"
+	"github.com/pkg/errors"
+	"github.com/sirupsen/logrus"
+)
+
+type unifiedManager struct {
+	mu      sync.Mutex
+	cgroups *configs.Cgroup
+	// path is like "/sys/fs/cgroup/user.slice/user-1001.slice/session-1.scope"
+	path     string
+	rootless bool
+}
+
+func NewUnifiedManager(config *configs.Cgroup, path string, rootless bool) cgroups.Manager {
+	return &unifiedManager{
+		cgroups:  config,
+		path:     path,
+		rootless: rootless,
+	}
+}
+
+func genV2ResourcesProperties(c *configs.Cgroup, conn *systemdDbus.Conn) ([]systemdDbus.Property, error) {
+	var properties []systemdDbus.Property
+	r := c.Resources
+
+	// NOTE: This is of questionable correctness because we insert our own
+	//       devices eBPF program later. Two programs with identical rules
+	//       aren't the end of the world, but it is a bit concerning. However
+	//       it's unclear if systemd removes all eBPF programs attached when
+	//       doing SetUnitProperties...
+	deviceProperties, err := generateDeviceProperties(r.Devices)
+	if err != nil {
+		return nil, err
+	}
+	properties = append(properties, deviceProperties...)
+
+	if r.Memory != 0 {
+		properties = append(properties,
+			newProp("MemoryMax", uint64(r.Memory)))
+	}
+	if r.MemoryReservation != 0 {
+		properties = append(properties,
+			newProp("MemoryLow", uint64(r.MemoryReservation)))
+	}
+
+	swap, err := cgroups.ConvertMemorySwapToCgroupV2Value(r.MemorySwap, r.Memory)
+	if err != nil {
+		return nil, err
+	}
+	if swap != 0 {
+		properties = append(properties,
+			newProp("MemorySwapMax", uint64(swap)))
+	}
+
+	if r.CpuWeight != 0 {
+		properties = append(properties,
+			newProp("CPUWeight", r.CpuWeight))
+	}
+
+	addCpuQuota(conn, &properties, r.CpuQuota, r.CpuPeriod)
+
+	if r.PidsLimit > 0 || r.PidsLimit == -1 {
+		properties = append(properties,
+			newProp("TasksAccounting", true),
+			newProp("TasksMax", uint64(r.PidsLimit)))
+	}
+
+	// ignore r.KernelMemory
+
+	return properties, nil
+}
+
+func (m *unifiedManager) Apply(pid int) error {
+	var (
+		c          = m.cgroups
+		unitName   = getUnitName(c)
+		properties []systemdDbus.Property
+	)
+
+	if c.Paths != nil {
+		return cgroups.WriteCgroupProc(m.path, pid)
+	}
+
+	slice := "system.slice"
+	if m.rootless {
+		slice = "user.slice"
+	}
+	if c.Parent != "" {
+		slice = c.Parent
+	}
+
+	properties = append(properties, systemdDbus.PropDescription("libcontainer container "+c.Name))
+
+	// if we create a slice, the parent is defined via a Wants=
+	if strings.HasSuffix(unitName, ".slice") {
+		properties = append(properties, systemdDbus.PropWants(slice))
+	} else {
+		// otherwise, we use Slice=
+		properties = append(properties, systemdDbus.PropSlice(slice))
+	}
+
+	// only add pid if it's valid, -1 is used w/ general slice creation.
+	if pid != -1 {
+		properties = append(properties, newProp("PIDs", []uint32{uint32(pid)}))
+	}
+
+	// Check if we can delegate. This is only supported on systemd versions 218 and above.
+	if !strings.HasSuffix(unitName, ".slice") {
+		// Assume scopes always support delegation.
+		properties = append(properties, newProp("Delegate", true))
+	}
+
+	// Always enable accounting, this gets us the same behaviour as the fs implementation,
+	// plus the kernel has some problems with joining the memory cgroup at a later time.
+	properties = append(properties,
+		newProp("MemoryAccounting", true),
+		newProp("CPUAccounting", true),
+		newProp("IOAccounting", true))
+
+	// Assume DefaultDependencies= will always work (the check for it was previously broken.)
+	properties = append(properties,
+		newProp("DefaultDependencies", false))
+
+	dbusConnection, err := getDbusConnection(m.rootless)
+	if err != nil {
+		return err
+	}
+	resourcesProperties, err := genV2ResourcesProperties(c, dbusConnection)
+	if err != nil {
+		return err
+	}
+	properties = append(properties, resourcesProperties...)
+	properties = append(properties, c.SystemdProps...)
+
+	if err := startUnit(dbusConnection, unitName, properties); err != nil {
+		return errors.Wrapf(err, "error while starting unit %q with properties %+v", unitName, properties)
+	}
+
+	if err = m.initPath(); err != nil {
+		return err
+	}
+	if err := fs2.CreateCgroupPath(m.path, m.cgroups); err != nil {
+		return err
+	}
+	return nil
+}
+
+func (m *unifiedManager) Destroy() error {
+	if m.cgroups.Paths != nil {
+		return nil
+	}
+	m.mu.Lock()
+	defer m.mu.Unlock()
+
+	dbusConnection, err := getDbusConnection(m.rootless)
+	if err != nil {
+		return err
+	}
+	unitName := getUnitName(m.cgroups)
+	if err := stopUnit(dbusConnection, unitName); err != nil {
+		return err
+	}
+
+	// XXX this is probably not needed, systemd should handle it
+	err = os.Remove(m.path)
+	if err != nil && !os.IsNotExist(err) {
+		return err
+	}
+
+	return nil
+}
+
+func (m *unifiedManager) Path(_ string) string {
+	return m.path
+}
+
+// getSliceFull value is used in initPath.
+// The value is incompatible with systemdDbus.PropSlice.
+func (m *unifiedManager) getSliceFull() (string, error) {
+	c := m.cgroups
+	slice := "system.slice"
+	if m.rootless {
+		slice = "user.slice"
+	}
+	if c.Parent != "" {
+		var err error
+		slice, err = ExpandSlice(c.Parent)
+		if err != nil {
+			return "", err
+		}
+	}
+
+	if m.rootless {
+		dbusConnection, err := getDbusConnection(m.rootless)
+		if err != nil {
+			return "", err
+		}
+		// managerCGQuoted is typically "/user.slice/user-${uid}.slice/user@${uid}.service" including the quote symbols
+		managerCGQuoted, err := dbusConnection.GetManagerProperty("ControlGroup")
+		if err != nil {
+			return "", err
+		}
+		managerCG, err := strconv.Unquote(managerCGQuoted)
+		if err != nil {
+			return "", err
+		}
+		slice = filepath.Join(managerCG, slice)
+	}
+
+	// an example of the final slice in rootless: "/user.slice/user-1001.slice/user@1001.service/user.slice"
+	// NOTE: systemdDbus.PropSlice requires the "/user.slice/user-1001.slice/user@1001.service/" prefix NOT to be specified.
+	return slice, nil
+}
+
+func (m *unifiedManager) initPath() error {
+	if m.path != "" {
+		return nil
+	}
+
+	sliceFull, err := m.getSliceFull()
+	if err != nil {
+		return err
+	}
+
+	c := m.cgroups
+	path := filepath.Join(sliceFull, getUnitName(c))
+	path, err = securejoin.SecureJoin(fs2.UnifiedMountpoint, path)
+	if err != nil {
+		return err
+	}
+
+	// an example of the final path in rootless:
+	// "/sys/fs/cgroup/user.slice/user-1001.slice/user@1001.service/user.slice/libpod-132ff0d72245e6f13a3bbc6cdc5376886897b60ac59eaa8dea1df7ab959cbf1c.scope"
+	m.path = path
+
+	return nil
+}
+
+func (m *unifiedManager) fsManager() (cgroups.Manager, error) {
+	if err := m.initPath(); err != nil {
+		return nil, err
+	}
+	return fs2.NewManager(m.cgroups, m.path, m.rootless)
+}
+
+func (m *unifiedManager) Freeze(state configs.FreezerState) error {
+	fsMgr, err := m.fsManager()
+	if err != nil {
+		return err
+	}
+	return fsMgr.Freeze(state)
+}
+
+func (m *unifiedManager) GetPids() ([]int, error) {
+	if err := m.initPath(); err != nil {
+		return nil, err
+	}
+	return cgroups.GetPids(m.path)
+}
+
+func (m *unifiedManager) GetAllPids() ([]int, error) {
+	if err := m.initPath(); err != nil {
+		return nil, err
+	}
+	return cgroups.GetAllPids(m.path)
+}
+
+func (m *unifiedManager) GetStats() (*cgroups.Stats, error) {
+	fsMgr, err := m.fsManager()
+	if err != nil {
+		return nil, err
+	}
+	return fsMgr.GetStats()
+}
+
+func (m *unifiedManager) Set(container *configs.Config) error {
+	dbusConnection, err := getDbusConnection(m.rootless)
+	if err != nil {
+		return err
+	}
+	properties, err := genV2ResourcesProperties(m.cgroups, dbusConnection)
+	if err != nil {
+		return err
+	}
+
+	// Figure out the current freezer state, so we can revert to it after we
+	// temporarily freeze the container.
+	targetFreezerState, err := m.GetFreezerState()
+	if err != nil {
+		return err
+	}
+	if targetFreezerState == configs.Undefined {
+		targetFreezerState = configs.Thawed
+	}
+
+	// We have to freeze the container while systemd sets the cgroup settings.
+	// The reason for this is that systemd's application of DeviceAllow rules
+	// is done disruptively, resulting in spurious errors to common devices
+	// (unlike our fs driver, they will happily write deny-all rules to running
+	// containers). So we freeze the container to avoid them hitting the cgroup
+	// error. But if the freezer cgroup isn't supported, we just warn about it.
+	if err := m.Freeze(configs.Frozen); err != nil {
+		logrus.Infof("freeze container before SetUnitProperties failed: %v", err)
+	}
+
+	if err := dbusConnection.SetUnitProperties(getUnitName(m.cgroups), true, properties...); err != nil {
+		_ = m.Freeze(targetFreezerState)
+		return errors.Wrap(err, "error while setting unit properties")
+	}
+
+	// Reset freezer state before we apply the configuration, to avoid clashing
+	// with the freezer setting in the configuration.
+	_ = m.Freeze(targetFreezerState)
+
+	fsMgr, err := m.fsManager()
+	if err != nil {
+		return err
+	}
+	return fsMgr.Set(container)
+}
+
+func (m *unifiedManager) GetPaths() map[string]string {
+	paths := make(map[string]string, 1)
+	paths[""] = m.path
+	return paths
+}
+
+func (m *unifiedManager) GetCgroups() (*configs.Cgroup, error) {
+	return m.cgroups, nil
+}
+
+func (m *unifiedManager) GetFreezerState() (configs.FreezerState, error) {
+	fsMgr, err := m.fsManager()
+	if err != nil {
+		return configs.Undefined, err
+	}
+	return fsMgr.GetFreezerState()
+}
+
+func (m *unifiedManager) Exists() bool {
+	return cgroups.PathExists(m.path)
+}
--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/utils.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/utils.go
@@ -4,6 +4,7 @@ package cgroups

 import (
 	"bufio"
+	"errors"
 	"fmt"
 	"io"
 	"io/ioutil"
@@ -12,7 +13,6 @@ import (
 	"strconv"
 	"strings"
 	"sync"
-	"syscall"
 	"time"

 	units "github.com/docker/go-units"
@@ -20,7 +20,6 @@ import (
 )

 const (
-	CgroupNamePrefix  = "name="
 	CgroupProcesses   = "cgroup.procs"
 	unifiedMountpoint = "/sys/fs/cgroup"
 )
@@ -40,8 +39,8 @@ var HugePageSizeUnitList = []string{"B", "KB", "MB", "GB", "TB", "PB"}
 // IsCgroup2UnifiedMode returns whether we are running in cgroup v2 unified mode.
 func IsCgroup2UnifiedMode() bool {
 	isUnifiedOnce.Do(func() {
-		var st syscall.Statfs_t
-		if err := syscall.Statfs(unifiedMountpoint, &st); err != nil {
+		var st unix.Statfs_t
+		if err := unix.Statfs(unifiedMountpoint, &st); err != nil {
 			panic("cannot statfs cgroup root")
 		}
 		isUnified = st.Type == unix.CGROUP2_SUPER_MAGIC
@@ -49,191 +48,19 @@ func IsCgroup2UnifiedMode() bool {
 	return isUnified
 }

-// https://www.kernel.org/doc/Documentation/cgroup-v1/cgroups.txt
-func FindCgroupMountpoint(cgroupPath, subsystem string) (string, error) {
-	if IsCgroup2UnifiedMode() {
-		return unifiedMountpoint, nil
-	}
-	mnt, _, err := FindCgroupMountpointAndRoot(cgroupPath, subsystem)
-	return mnt, err
-}
-
-func FindCgroupMountpointAndRoot(cgroupPath, subsystem string) (string, string, error) {
-	// We are not using mount.GetMounts() because it's super-inefficient,
-	// parsing it directly sped up x10 times because of not using Sscanf.
-	// It was one of two major performance drawbacks in container start.
-	if !isSubsystemAvailable(subsystem) {
-		return "", "", NewNotFoundError(subsystem)
-	}
-
-	f, err := os.Open("/proc/self/mountinfo")
-	if err != nil {
-		return "", "", err
-	}
-	defer f.Close()
-
-	if IsCgroup2UnifiedMode() {
-		subsystem = ""
-	}
-
-	return findCgroupMountpointAndRootFromReader(f, cgroupPath, subsystem)
-}
-
-func findCgroupMountpointAndRootFromReader(reader io.Reader, cgroupPath, subsystem string) (string, string, error) {
-	scanner := bufio.NewScanner(reader)
-	for scanner.Scan() {
-		txt := scanner.Text()
-		fields := strings.Fields(txt)
-		if len(fields) < 9 {
-			continue
-		}
-		if strings.HasPrefix(fields[4], cgroupPath) {
-			for _, opt := range strings.Split(fields[len(fields)-1], ",") {
-				if (subsystem == "" && fields[9] == "cgroup2") || opt == subsystem {
-					return fields[4], fields[3], nil
-				}
-			}
-		}
-	}
-	if err := scanner.Err(); err != nil {
-		return "", "", err
-	}
-
-	return "", "", NewNotFoundError(subsystem)
-}
-
-func isSubsystemAvailable(subsystem string) bool {
-	if IsCgroup2UnifiedMode() {
-		controllers, err := GetAllSubsystems()
-		if err != nil {
-			return false
-		}
-		for _, c := range controllers {
-			if c == subsystem {
-				return true
-			}
-		}
-		return false
-	}
-
-	cgroups, err := ParseCgroupFile("/proc/self/cgroup")
-	if err != nil {
-		return false
-	}
-	_, avail := cgroups[subsystem]
-	return avail
-}
-
-func GetClosestMountpointAncestor(dir, mountinfo string) string {
-	deepestMountPoint := ""
-	for _, mountInfoEntry := range strings.Split(mountinfo, "\n") {
-		mountInfoParts := strings.Fields(mountInfoEntry)
-		if len(mountInfoParts) < 5 {
-			continue
-		}
-		mountPoint := mountInfoParts[4]
-		if strings.HasPrefix(mountPoint, deepestMountPoint) && strings.HasPrefix(dir, mountPoint) {
-			deepestMountPoint = mountPoint
-		}
-	}
-	return deepestMountPoint
-}
-
-func FindCgroupMountpointDir() (string, error) {
-	f, err := os.Open("/proc/self/mountinfo")
-	if err != nil {
-		return "", err
-	}
-	defer f.Close()
-
-	scanner := bufio.NewScanner(f)
-	for scanner.Scan() {
-		text := scanner.Text()
-		fields := strings.Split(text, " ")
-		// Safe as mountinfo encodes mountpoints with spaces as \040.
-		index := strings.Index(text, " - ")
-		postSeparatorFields := strings.Fields(text[index+3:])
-		numPostFields := len(postSeparatorFields)
-
-		// This is an error as we can't detect if the mount is for "cgroup"
-		if numPostFields == 0 {
-			return "", fmt.Errorf("Found no fields post '-' in %q", text)
-		}
-
-		if postSeparatorFields[0] == "cgroup" || postSeparatorFields[0] == "cgroup2" {
-			// Check that the mount is properly formatted.
-			if numPostFields < 3 {
-				return "", fmt.Errorf("Error found less than 3 fields post '-' in %q", text)
-			}
-
-			return filepath.Dir(fields[4]), nil
-		}
-	}
-	if err := scanner.Err(); err != nil {
-		return "", err
-	}
-
-	return "", NewNotFoundError("cgroup")
-}
-
 type Mount struct {
 	Mountpoint string
 	Root       string
 	Subsystems []string
 }

-func (m Mount) GetOwnCgroup(cgroups map[string]string) (string, error) {
-	if len(m.Subsystems) == 0 {
-		return "", fmt.Errorf("no subsystem for mount")
-	}
-
-	return getControllerPath(m.Subsystems[0], cgroups)
-}
-
-func getCgroupMountsHelper(ss map[string]bool, mi io.Reader, all bool) ([]Mount, error) {
-	res := make([]Mount, 0, len(ss))
-	scanner := bufio.NewScanner(mi)
-	numFound := 0
-	for scanner.Scan() && numFound < len(ss) {
-		txt := scanner.Text()
-		sepIdx := strings.Index(txt, " - ")
-		if sepIdx == -1 {
-			return nil, fmt.Errorf("invalid mountinfo format")
-		}
-		if txt[sepIdx+3:sepIdx+10] == "cgroup2" || txt[sepIdx+3:sepIdx+9] != "cgroup" {
-			continue
-		}
-		fields := strings.Split(txt, " ")
-		m := Mount{
-			Mountpoint: fields[4],
-			Root:       fields[3],
-		}
-		for _, opt := range strings.Split(fields[len(fields)-1], ",") {
-			seen, known := ss[opt]
-			if !known || (!all && seen) {
-				continue
-			}
-			ss[opt] = true
-			if strings.HasPrefix(opt, CgroupNamePrefix) {
-				opt = opt[len(CgroupNamePrefix):]
-			}
-			m.Subsystems = append(m.Subsystems, opt)
-			numFound++
-		}
-		if len(m.Subsystems) > 0 || all {
-			res = append(res, m)
-		}
-	}
-	if err := scanner.Err(); err != nil {
-		return nil, err
-	}
-	return res, nil
-}
-
 // GetCgroupMounts returns the mounts for the cgroup subsystems.
 // all indicates whether to return just the first instance or all the mounts.
+// This function should not be used from cgroupv2 code, as in this case
+// all the controllers are available under the constant unifiedMountpoint.
 func GetCgroupMounts(all bool) ([]Mount, error) {
 	if IsCgroup2UnifiedMode() {
+		// TODO: remove cgroupv2 case once all external users are converted
 		availableControllers, err := GetAllSubsystems()
 		if err != nil {
 			return nil, err
@@ -246,22 +73,7 @@ func GetCgroupMounts(all bool) ([]Mount, error) {
 		return []Mount{m}, nil
 	}

-	f, err := os.Open("/proc/self/mountinfo")
-	if err != nil {
-		return nil, err
-	}
-	defer f.Close()
-
-	allSubsystems, err := ParseCgroupFile("/proc/self/cgroup")
-	if err != nil {
-		return nil, err
-	}
-
-	allMap := make(map[string]bool)
-	for s := range allSubsystems {
-		allMap[s] = false
-	}
-	return getCgroupMountsHelper(allMap, f, all)
+	return getCgroupMountsV1(all)
 }

 // GetAllSubsystems returns all the cgroup subsystems supported by the kernel
@@ -305,61 +117,8 @@ func GetAllSubsystems() ([]string, error) {
 	return subsystems, nil
 }

-// GetOwnCgroup returns the relative path to the cgroup docker is running in.
-func GetOwnCgroup(subsystem string) (string, error) {
-	cgroups, err := ParseCgroupFile("/proc/self/cgroup")
-	if err != nil {
-		return "", err
-	}
-
-	return getControllerPath(subsystem, cgroups)
-}
-
-func GetOwnCgroupPath(subsystem string) (string, error) {
-	cgroup, err := GetOwnCgroup(subsystem)
-	if err != nil {
-		return "", err
-	}
-
-	return getCgroupPathHelper(subsystem, cgroup)
-}
-
-func GetInitCgroup(subsystem string) (string, error) {
-	cgroups, err := ParseCgroupFile("/proc/1/cgroup")
-	if err != nil {
-		return "", err
-	}
-
-	return getControllerPath(subsystem, cgroups)
-}
-
-func GetInitCgroupPath(subsystem string) (string, error) {
-	cgroup, err := GetInitCgroup(subsystem)
-	if err != nil {
-		return "", err
-	}
-
-	return getCgroupPathHelper(subsystem, cgroup)
-}
-
-func getCgroupPathHelper(subsystem, cgroup string) (string, error) {
-	mnt, root, err := FindCgroupMountpointAndRoot("", subsystem)
-	if err != nil {
-		return "", err
-	}
-
-	// This is needed for nested containers, because in /proc/self/cgroup we
-	// see paths from host, which don't exist in container.
-	relCgroup, err := filepath.Rel(root, cgroup)
-	if err != nil {
-		return "", err
-	}
-
-	return filepath.Join(mnt, relCgroup), nil
-}
-
-func readProcsFile(dir string) ([]int, error) {
-	f, err := os.Open(filepath.Join(dir, CgroupProcesses))
+func readProcsFile(file string) ([]int, error) {
+	f, err := os.Open(file)
 	if err != nil {
 		return nil, err
 	}
@@ -379,11 +138,18 @@ func readProcsFile(dir string) ([]int, error) {
 			out = append(out, pid)
 		}
 	}
-	return out, nil
+	return out, s.Err()
 }

-// ParseCgroupFile parses the given cgroup file, typically from
-// /proc/<pid>/cgroup, into a map of subgroups to cgroup names.
+// ParseCgroupFile parses the given cgroup file, typically /proc/self/cgroup
+// or /proc/<pid>/cgroup, into a map of subsystems to cgroup paths, e.g.
+//   "cpu": "/user.slice/user-1000.slice"
+//   "pids": "/user.slice/user-1000.slice"
+// etc.
+//
+// Note that for cgroup v2 unified hierarchy, there are no per-controller
+// cgroup paths, so the resulting map will have a single element where the key
+// is empty string ("") and the value is the cgroup path the <pid> is in.
 func ParseCgroupFile(path string) (map[string]string, error) {
 	f, err := os.Open(path)
 	if err != nil {
@@ -423,22 +189,6 @@ func parseCgroupFromReader(r io.Reader) (map[string]string, error) {
 	return cgroups, nil
 }

-func getControllerPath(subsystem string, cgroups map[string]string) (string, error) {
-	if IsCgroup2UnifiedMode() {
-		return "/", nil
-	}
-
-	if p, ok := cgroups[subsystem]; ok {
-		return p, nil
-	}
-
-	if p, ok := cgroups[CgroupNamePrefix+subsystem]; ok {
-		return p, nil
-	}
-
-	return "", NewNotFoundError(subsystem)
-}
-
 func PathExists(path string) bool {
 	if _, err := os.Stat(path); err != nil {
 		return false
@@ -514,8 +264,8 @@ func getHugePageSizeFromFilenames(fileNames []string) ([]string, error) {
 }

 // GetPids returns all pids, that were added to cgroup at path.
-func GetPids(path string) ([]int, error) {
-	return readProcsFile(path)
+func GetPids(dir string) ([]int, error) {
+	return readProcsFile(filepath.Join(dir, CgroupProcesses))
 }

 // GetAllPids returns all pids, that were added to cgroup at path and to all its
@@ -524,14 +274,13 @@ func GetAllPids(path string) ([]int, error) {
 	var pids []int
 	// collect pids from all sub-cgroups
 	err := filepath.Walk(path, func(p string, info os.FileInfo, iErr error) error {
-		dir, file := filepath.Split(p)
-		if file != CgroupProcesses {
-			return nil
-		}
 		if iErr != nil {
 			return iErr
 		}
-		cPids, err := readProcsFile(dir)
+		if info.IsDir() || info.Name() != CgroupProcesses {
+			return nil
+		}
+		cPids, err := readProcsFile(p)
 		if err != nil {
 			return err
 		}
@@ -568,7 +317,7 @@ func WriteCgroupProc(dir string, pid int) error {

 		// EINVAL might mean that the task being added to cgroup.procs is in state
 		// TASK_NEW. We should attempt to do so again.
-		if isEINVAL(err) {
+		if errors.Is(err, unix.EINVAL) {
 			time.Sleep(30 * time.Millisecond)
 			continue
 		}
@@ -578,11 +327,53 @@ func WriteCgroupProc(dir string, pid int) error {
 	return err
 }

-func isEINVAL(err error) bool {
-	switch err := err.(type) {
-	case *os.PathError:
-		return err.Err == unix.EINVAL
-	default:
-		return false
+// ConvertBlkIOToCgroupV2Value maps a cgroup v1 blkio weight (the OCI spec is
+// designed for cgroup v1) to the cgroup v2 scale: y = (1 + (x - 10) * 9999 / 990),
+// i.e. linearly from [10-1000] to [1-10000]. A weight of 0 is returned as 0.
+// NOTE(review): weights 1..9 wrap around in the unsigned subtraction below.
+func ConvertBlkIOToCgroupV2Value(blkIoWeight uint16) uint64 {
+	if blkIoWeight == 0 {
+		return 0
 	}
+	return uint64(1 + (uint64(blkIoWeight)-10)*9999/990)
+}
+
+// ConvertCPUSharesToCgroupV2Value maps cgroup v1 cpu shares (the OCI spec is
+// designed for cgroup v1) to the cgroup v2 scale using the formula
+// y = (1 + ((x - 2) * 9999) / 262142), i.e. from [2-262144] to [1-10000].
+// Shares of 0 are returned as 0. NOTE(review): cpuShares == 1 wraps around below.
+// 262144 comes from Linux kernel definition "#define MAX_SHARES (1UL << 18)"
+func ConvertCPUSharesToCgroupV2Value(cpuShares uint64) uint64 {
+	if cpuShares == 0 {
+		return 0
+	}
+	return (1 + ((cpuShares-2)*9999)/262142)
+}
+
+// ConvertMemorySwapToCgroupV2Value converts the MemorySwap value from the OCI
+// spec for use by cgroup v2 drivers: Resources.MemorySwap is memory+swap combined,
+// while in cgroup v2 swap is a separate value, so the result is memorySwap - memory.
+func ConvertMemorySwapToCgroupV2Value(memorySwap, memory int64) (int64, error) {
+	// for compatibility with cgroup1 controller, set swap to unlimited in
+	// case the memory is set to unlimited, and swap is not explicitly set,
+	// treating the request as "set both memory and swap to unlimited".
+	if memory == -1 && memorySwap == 0 {
+		return -1, nil
+	}
+	if memorySwap == -1 || memorySwap == 0 {
+		// -1 is "max", 0 is "unset", so treat as is
+		return memorySwap, nil
+	}
+	// sanity checks: from here on memorySwap is neither -1 nor 0
+	if memory == 0 || memory == -1 {
+		return 0, errors.New("unable to set swap limit without memory limit")
+	}
+	if memory < 0 { // any remaining negative value (-1 was handled above)
+		return 0, fmt.Errorf("invalid memory value: %d", memory)
+	}
+	if memorySwap < memory { // the difference returned below would be negative
+		return 0, errors.New("memory+swap limit should be >= memory limit")
+	}
+
+	return memorySwap - memory, nil
 }
--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/v1_utils.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/v1_utils.go
@@ -0,0 +1,250 @@
+package cgroups
+
+import (
+	"bufio"
+	"errors"
+	"fmt"
+	"io"
+	"os"
+	"path/filepath"
+	"strings"
+)
+
+// The code in this source file is specific to cgroup v1,
+// and must not be used from any cgroup v2 code.
+
+const (
+	CgroupNamePrefix = "name=" // prepended as a fallback key in getControllerPath; trimmed from mount options in getCgroupMountsHelper
+)
+
+var (
+	errUnified = errors.New("not implemented for cgroup v2 unified hierarchy") // returned by functions in this file when IsCgroup2UnifiedMode() is true
+)
+
+type NotFoundError struct { // error value produced by NewNotFoundError
+	Subsystem string // subsystem name interpolated into the Error() message
+}
+
+func (e *NotFoundError) Error() string { // Error implements the error interface
+	return fmt.Sprintf("mountpoint for %s not found", e.Subsystem)
+}
+
+func NewNotFoundError(sub string) error { // returns a *NotFoundError for subsystem sub
+	return &NotFoundError{
+		Subsystem: sub,
+	}
+}
+
+func IsNotFound(err error) bool { // reports whether err is a *NotFoundError
+	if err == nil {
+		return false
+	}
+	_, ok := err.(*NotFoundError) // plain type assertion: err itself must be the *NotFoundError, wrapped errors are not unwrapped
+	return ok
+}
+
+// FindCgroupMountpoint returns the cgroup v1 mountpoint for subsystem; see https://www.kernel.org/doc/Documentation/cgroup-v1/cgroups.txt
+func FindCgroupMountpoint(cgroupPath, subsystem string) (string, error) {
+	if IsCgroup2UnifiedMode() {
+		return "", errUnified
+	}
+	mnt, _, err := FindCgroupMountpointAndRoot(cgroupPath, subsystem) // the root is discarded
+	return mnt, err
+}
+
+func FindCgroupMountpointAndRoot(cgroupPath, subsystem string) (string, string, error) { // returns (mountpoint, root, error)
+	if IsCgroup2UnifiedMode() {
+		return "", "", errUnified
+	}
+
+	// We are not using mount.GetMounts() because it's super-inefficient,
+	// parsing it directly sped up x10 times because of not using Sscanf.
+	// It was one of two major performance drawbacks in container start.
+	if !isSubsystemAvailable(subsystem) { // subsystem must be a key in /proc/self/cgroup
+		return "", "", NewNotFoundError(subsystem)
+	}
+
+	f, err := os.Open("/proc/self/mountinfo")
+	if err != nil {
+		return "", "", err
+	}
+	defer f.Close()
+
+	return findCgroupMountpointAndRootFromReader(f, cgroupPath, subsystem)
+}
+
+func findCgroupMountpointAndRootFromReader(reader io.Reader, cgroupPath, subsystem string) (string, string, error) { // scans mountinfo-formatted lines from reader
+	scanner := bufio.NewScanner(reader)
+	for scanner.Scan() {
+		txt := scanner.Text()
+		fields := strings.Fields(txt)
+		if len(fields) < 9 { // skip lines with fewer than 9 whitespace-separated fields
+			continue
+		}
+		if strings.HasPrefix(fields[4], cgroupPath) { // an empty cgroupPath matches every line
+			for _, opt := range strings.Split(fields[len(fields)-1], ",") { // last field is a comma-separated option list
+				if opt == subsystem {
+					return fields[4], fields[3], nil // first match wins: (mountpoint, root)
+				}
+			}
+		}
+	}
+	if err := scanner.Err(); err != nil {
+		return "", "", err
+	}
+
+	return "", "", NewNotFoundError(subsystem)
+}
+
+func isSubsystemAvailable(subsystem string) bool { // reports whether subsystem is a key in the parsed /proc/self/cgroup
+	if IsCgroup2UnifiedMode() {
+		panic("don't call isSubsystemAvailable from cgroupv2 code")
+	}
+
+	cgroups, err := ParseCgroupFile("/proc/self/cgroup")
+	if err != nil {
+		return false // a parse failure is reported as "not available"
+	}
+	_, avail := cgroups[subsystem]
+	return avail
+}
+
+func (m Mount) GetOwnCgroup(cgroups map[string]string) (string, error) { // looks up the mount's cgroup path in the given subsystem->path map
+	if len(m.Subsystems) == 0 {
+		return "", fmt.Errorf("no subsystem for mount")
+	}
+
+	return getControllerPath(m.Subsystems[0], cgroups) // only the first subsystem is consulted
+}
+
+func getCgroupMountsHelper(ss map[string]bool, mi io.Reader, all bool) ([]Mount, error) { // ss: known subsystem -> already seen; mi: mountinfo-formatted lines
+	res := make([]Mount, 0, len(ss))
+	scanner := bufio.NewScanner(mi)
+	numFound := 0
+	for scanner.Scan() && numFound < len(ss) { // stops early once numFound reaches len(ss)
+		txt := scanner.Text()
+		sepIdx := strings.Index(txt, " - ") // the text right after " - " is compared against "cgroup"/"cgroup2" below
+		if sepIdx == -1 {
+			return nil, fmt.Errorf("invalid mountinfo format")
+		}
+		if txt[sepIdx+3:sepIdx+10] == "cgroup2" || txt[sepIdx+3:sepIdx+9] != "cgroup" { // skip cgroup2 and non-cgroup lines; NOTE(review): slicing assumes >= 7 bytes follow " - "
+			continue
+		}
+		fields := strings.Split(txt, " ")
+		m := Mount{
+			Mountpoint: fields[4],
+			Root:       fields[3],
+		}
+		for _, opt := range strings.Split(fields[len(fields)-1], ",") {
+			seen, known := ss[opt]
+			if !known || (!all && seen) { // ignore unknown options; ignore already-seen subsystems unless all is set
+				continue
+			}
+			ss[opt] = true // mark as seen (mutates the caller's map)
+			opt = strings.TrimPrefix(opt, CgroupNamePrefix) // recorded without the "name=" prefix
+			m.Subsystems = append(m.Subsystems, opt)
+			numFound++
+		}
+		if len(m.Subsystems) > 0 || all { // with all set, cgroup mounts without matching subsystems are kept too
+			res = append(res, m)
+		}
+	}
+	if err := scanner.Err(); err != nil {
+		return nil, err
+	}
+	return res, nil
+}
+
+func getCgroupMountsV1(all bool) ([]Mount, error) { // lists cgroup mounts from /proc/self/mountinfo for the subsystems in /proc/self/cgroup
+	f, err := os.Open("/proc/self/mountinfo")
+	if err != nil {
+		return nil, err
+	}
+	defer f.Close()
+
+	allSubsystems, err := ParseCgroupFile("/proc/self/cgroup")
+	if err != nil {
+		return nil, err
+	}
+
+	allMap := make(map[string]bool)
+	for s := range allSubsystems {
+		allMap[s] = false // known, but not seen in mountinfo yet
+	}
+	return getCgroupMountsHelper(allMap, f, all)
+}
+
+// GetOwnCgroup returns the cgroup path listed for subsystem in /proc/self/cgroup, i.e. the cgroup of the calling process.
+func GetOwnCgroup(subsystem string) (string, error) {
+	if IsCgroup2UnifiedMode() {
+		return "", errUnified
+	}
+	cgroups, err := ParseCgroupFile("/proc/self/cgroup")
+	if err != nil {
+		return "", err
+	}
+
+	return getControllerPath(subsystem, cgroups)
+}
+
+func GetOwnCgroupPath(subsystem string) (string, error) { // GetOwnCgroup result resolved against the subsystem's mountpoint
+	cgroup, err := GetOwnCgroup(subsystem)
+	if err != nil {
+		return "", err
+	}
+
+	return getCgroupPathHelper(subsystem, cgroup)
+}
+
+func GetInitCgroup(subsystem string) (string, error) { // cgroup path listed for subsystem in /proc/1/cgroup
+	if IsCgroup2UnifiedMode() {
+		return "", errUnified
+	}
+	cgroups, err := ParseCgroupFile("/proc/1/cgroup")
+	if err != nil {
+		return "", err
+	}
+
+	return getControllerPath(subsystem, cgroups)
+}
+
+func GetInitCgroupPath(subsystem string) (string, error) { // GetInitCgroup result resolved against the subsystem's mountpoint
+	cgroup, err := GetInitCgroup(subsystem)
+	if err != nil {
+		return "", err
+	}
+
+	return getCgroupPathHelper(subsystem, cgroup)
+}
+
+func getCgroupPathHelper(subsystem, cgroup string) (string, error) { // joins the subsystem's mountpoint with cgroup taken relative to the mount root
+	mnt, root, err := FindCgroupMountpointAndRoot("", subsystem) // empty cgroupPath: no mountpoint prefix filter
+	if err != nil {
+		return "", err
+	}
+
+	// This is needed for nested containers, because in /proc/self/cgroup we
+	// see paths from host, which don't exist in container.
+	relCgroup, err := filepath.Rel(root, cgroup)
+	if err != nil {
+		return "", err
+	}
+
+	return filepath.Join(mnt, relCgroup), nil
+}
+
+func getControllerPath(subsystem string, cgroups map[string]string) (string, error) { // looks up subsystem in a subsystem->path map
+	if IsCgroup2UnifiedMode() {
+		return "", errUnified
+	}
+
+	if p, ok := cgroups[subsystem]; ok { // exact key first
+		return p, nil
+	}
+
+	if p, ok := cgroups[CgroupNamePrefix+subsystem]; ok { // then the "name="-prefixed key
+		return p, nil
+	}
+
+	return "", NewNotFoundError(subsystem)
+}