cri: implement RuntimeConfig rpc

The RPC reports only one field, the cgroup driver, to the kubelet. Containerd determines the effective cgroup driver by looking at all runtime handlers, starting from the default runtime handler (the rest in alphabetical order), and returning the cgroup driver setting of the first runtime handler that supports one. If no runtime handler supports a cgroup driver setting (i.e. has a config option for it), containerd falls back to auto-detection, returning systemd if systemd is running and cgroupfs otherwise. This patch implements the CRI server side of Kubernetes KEP-4033: https://github.com/kubernetes/enhancements/tree/master/keps/sig-node/4033-group-driver-detection-over-cri Signed-off-by: Markus Lehtonen <markus.lehtonen@intel.com>
2023-06-19 16:14:22 +03:00
parent 850b2e1bf3
commit ed47d6ba76
80 changed files with 9669 additions and 0 deletions
--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/common.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/common.go
@@ -0,0 +1,564 @@
+package systemd
+
+import (
+	"bufio"
+	"context"
+	"errors"
+	"fmt"
+	"math"
+	"os"
+	"regexp"
+	"strconv"
+	"strings"
+	"sync"
+	"time"
+
+	systemdDbus "github.com/coreos/go-systemd/v22/dbus"
+	dbus "github.com/godbus/dbus/v5"
+	"github.com/sirupsen/logrus"
+
+	cgroupdevices "github.com/opencontainers/runc/libcontainer/cgroups/devices"
+	"github.com/opencontainers/runc/libcontainer/configs"
+	"github.com/opencontainers/runc/libcontainer/devices"
+)
+
+const (
+	// Default kernel value for cpu quota period is 100000 us (100 ms), same for v1 and v2.
+	// v1: https://www.kernel.org/doc/html/latest/scheduler/sched-bwc.html and
+	// v2: https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html
+	defCPUQuotaPeriod = uint64(100000)
+)
+
+var (
+	// versionOnce guards the one-time detection performed by systemdVersion.
+	versionOnce sync.Once
+	// version caches the result of that detection; systemdVersion sets it
+	// to -1 before querying systemd.
+	version     int
+
+	// isRunningSystemdOnce guards the one-time check done by IsRunningSystemd.
+	isRunningSystemdOnce sync.Once
+	// isRunningSystemd caches the result of that check.
+	isRunningSystemd     bool
+)
+
+// IsRunningSystemd reports whether the host was booted with systemd as its
+// init system. Like systemd's `sd_booted(3)`, it checks that
+// /run/systemd/system/ exists and is a directory (see
+// http://www.freedesktop.org/software/systemd/man/sd_booted.html).
+// The check runs once; later calls return the cached answer.
+//
+// NOTE: This function comes from package github.com/coreos/go-systemd/util
+// It was borrowed here to avoid a dependency on cgo.
+func IsRunningSystemd() bool {
+	isRunningSystemdOnce.Do(func() {
+		isRunningSystemd = false
+		// Lstat (not Stat): a symlink at this path does not count.
+		if info, statErr := os.Lstat("/run/systemd/system"); statErr == nil {
+			isRunningSystemd = info.IsDir()
+		}
+	})
+	return isRunningSystemd
+}
+
+// ExpandSlice turns a systemd slice name into the path of that slice.
+// systemd encodes the slice hierarchy with `-`, so test-a-b.slice expands to
+// /test.slice/test-a.slice/test-a-b.slice. The special name -.slice maps to
+// the root, "/".
+//
+// An error is returned when the name does not end in ".slice", contains a
+// path separator, or has an empty dash-separated component (for example
+// ".slice", "test--a.slice" or "-test.slice").
+func ExpandSlice(slice string) (string, error) {
+	const sliceSuffix = ".slice"
+	invalid := func() (string, error) {
+		return "", fmt.Errorf("invalid slice name: %s", slice)
+	}
+
+	// The suffix is mandatory and path separators are not allowed.
+	if !strings.HasSuffix(slice, sliceSuffix) || strings.Contains(slice, "/") {
+		return invalid()
+	}
+
+	base := strings.TrimSuffix(slice, sliceSuffix)
+	// "-.slice" is the root slice.
+	if base == "-" {
+		return "/", nil
+	}
+
+	var expanded, parents string
+	for _, part := range strings.Split(base, "-") {
+		// An empty part means a leading, trailing or doubled dash
+		// (or a bare ".slice"), none of which is permitted.
+		if part == "" {
+			return invalid()
+		}
+		// Each level repeats all of its ancestors' names as a prefix.
+		expanded += "/" + parents + part + sliceSuffix
+		parents += part + "-"
+	}
+	return expanded, nil
+}
+
+// groupPrefix returns the prefix used in front of a device group name for
+// the given device type: "block-" for block devices and "char-" for
+// character devices. Any other type yields an error.
+func groupPrefix(ruleType devices.Type) (string, error) {
+	if ruleType == devices.BlockDevice {
+		return "block-", nil
+	}
+	if ruleType == devices.CharDevice {
+		return "char-", nil
+	}
+	return "", fmt.Errorf("device type %v has no group prefix", ruleType)
+}
+
+// findDeviceGroup tries to find the device group name (as listed in
+// /proc/devices) with the type prefixed as required for DeviceAllow, for a
+// given (type, major) combination. If more than one device group exists, the
+// first one listed in /proc/devices is returned.
+//
+// It returns ("", nil) when no matching group is listed, and an error when
+// /proc/devices cannot be opened, read or parsed, or when ruleType has no
+// group prefix.
+func findDeviceGroup(ruleType devices.Type, ruleMajor int64) (string, error) {
+	fh, err := os.Open("/proc/devices")
+	if err != nil {
+		return "", err
+	}
+	defer fh.Close()
+
+	prefix, err := groupPrefix(ruleType)
+	if err != nil {
+		return "", err
+	}
+
+	scanner := bufio.NewScanner(fh)
+	// currentType tracks which section ("Block devices:" or
+	// "Character devices:") the scanner is currently in.
+	var currentType devices.Type
+	for scanner.Scan() {
+		// We need to strip spaces because the first number is column-aligned.
+		line := strings.TrimSpace(scanner.Text())
+
+		// Handle the "header" lines.
+		switch line {
+		case "Block devices:":
+			currentType = devices.BlockDevice
+			continue
+		case "Character devices:":
+			currentType = devices.CharDevice
+			continue
+		case "":
+			continue
+		}
+
+		// Skip lines unrelated to our type.
+		if currentType != ruleType {
+			continue
+		}
+
+		// Parse out the (major, name).
+		var (
+			currMajor int64
+			currName  string
+		)
+		if n, err := fmt.Sscanf(line, "%d %s", &currMajor, &currName); err != nil || n != 2 {
+			if err == nil {
+				err = errors.New("wrong number of fields")
+			}
+			return "", fmt.Errorf("scan /proc/devices line %q: %w", line, err)
+		}
+
+		if currMajor == ruleMajor {
+			return prefix + currName, nil
+		}
+	}
+	if err := scanner.Err(); err != nil {
+		return "", fmt.Errorf("reading /proc/devices: %w", err)
+	}
+	// Couldn't find the device group.
+	return "", nil
+}
+
+// deviceAllowEntry is one element of the DeviceAllow property. DeviceAllow
+// is the dbus type "a(ss)" which means we need a struct to represent it in Go.
+type deviceAllowEntry struct {
+	// Path is the device specifier: a /dev/{block,char}/MAJOR:minor path,
+	// a device group name, or a "block-*"/"char-*" glob.
+	Path  string
+	// Perms is the permission string of the rule.
+	Perms string
+}
+
+// allowAllDevices returns the systemd properties that grant access to every
+// device.
+func allowAllDevices() []systemdDbus.Property {
+	// Setting mode to auto and removing all DeviceAllow rules
+	// results in allowing access to all devices.
+	return []systemdDbus.Property{
+		newProp("DevicePolicy", "auto"),
+		newProp("DeviceAllow", []deviceAllowEntry{}),
+	}
+}
+
+// generateDeviceProperties takes the configured device rules and generates a
+// corresponding set of systemd properties to configure the devices correctly.
+//
+// r supplies the device rules (r.Devices); when r.SkipDevices is set no
+// properties are generated at all. sdVer is the systemd version, used to
+// decide whether device paths must exist before they are added (< 240).
+//
+// Rules systemd cannot express are skipped with a warning rather than
+// treated as errors. An error is returned when a rule cannot be applied to
+// the emulator, the simplified rule set cannot be computed, a deny rule or
+// an unknown device type shows up in the final rules, or the device group
+// lookup fails.
+func generateDeviceProperties(r *configs.Resources, sdVer int) ([]systemdDbus.Property, error) {
+	if r.SkipDevices {
+		return nil, nil
+	}
+
+	properties := []systemdDbus.Property{
+		// Always run in the strictest white-list mode.
+		newProp("DevicePolicy", "strict"),
+		// Empty the DeviceAllow array before filling it.
+		newProp("DeviceAllow", []deviceAllowEntry{}),
+	}
+
+	// Figure out the set of rules.
+	configEmu := &cgroupdevices.Emulator{}
+	for _, rule := range r.Devices {
+		if err := configEmu.Apply(*rule); err != nil {
+			return nil, fmt.Errorf("unable to apply rule for systemd: %w", err)
+		}
+	}
+	// systemd doesn't support blacklists. So we log a warning, and tell
+	// systemd to act as a deny-all whitelist. This ruleset will be replaced
+	// with our normal fallback code. This may result in spurious errors, but
+	// the only other option is to error out here.
+	if configEmu.IsBlacklist() {
+		// However, if we're dealing with an allow-all rule then we can do it.
+		if configEmu.IsAllowAll() {
+			return allowAllDevices(), nil
+		}
+		logrus.Warn("systemd doesn't support blacklist device rules -- applying temporary deny-all rule")
+		return properties, nil
+	}
+
+	// Now generate the set of rules we actually need to apply. Unlike the
+	// normal devices cgroup, in "strict" mode systemd defaults to a deny-all
+	// whitelist which is the default for devices.Emulator.
+	finalRules, err := configEmu.Rules()
+	if err != nil {
+		return nil, fmt.Errorf("unable to get simplified rules for systemd: %w", err)
+	}
+	var deviceAllowList []deviceAllowEntry
+	for _, rule := range finalRules {
+		if !rule.Allow {
+			// Should never happen.
+			return nil, fmt.Errorf("[internal error] cannot add deny rule to systemd DeviceAllow list: %v", *rule)
+		}
+		switch rule.Type {
+		case devices.BlockDevice, devices.CharDevice:
+		default:
+			// Should never happen.
+			return nil, fmt.Errorf("invalid device type for DeviceAllow: %v", rule.Type)
+		}
+
+		entry := deviceAllowEntry{
+			Perms: string(rule.Permissions),
+		}
+
+		// systemd has a fairly odd (though understandable) syntax here, and
+		// because of the OCI configuration format we have to do quite a bit of
+		// trickery to convert things:
+		//
+		//  * Concrete rules with non-wildcard major/minor numbers have to use
+		//    /dev/{block,char}/MAJOR:minor paths. Before v240, systemd uses
+		//    stat(2) on such paths to look up device properties, meaning we
+		//    cannot add whitelist rules for devices that don't exist. Since v240,
+		//    device properties are parsed from the path string.
+		//
+		//    However, path globbing is not supported for path-based rules so we
+		//    need to handle wildcards in some other manner.
+		//
+		//  * Wildcard-minor rules have to specify a "device group name" (the
+		//    second column in /proc/devices).
+		//
+		//  * Wildcard (major and minor) rules can just specify a glob with the
+		//    type ("char-*" or "block-*").
+		//
+		// The only type of rule we can't handle is wildcard-major rules, and
+		// so we'll give a warning in that case (note that the fallback code
+		// will insert any rules systemd couldn't handle). What amazing fun.
+
+		if rule.Major == devices.Wildcard {
+			// "_ *:n _" rules aren't supported by systemd.
+			if rule.Minor != devices.Wildcard {
+				logrus.Warnf("systemd doesn't support '*:n' device rules -- temporarily ignoring rule: %v", *rule)
+				continue
+			}
+
+			// "_ *:* _" rules just wildcard everything.
+			prefix, err := groupPrefix(rule.Type)
+			if err != nil {
+				return nil, err
+			}
+			entry.Path = prefix + "*"
+		} else if rule.Minor == devices.Wildcard {
+			// "_ n:* _" rules require a device group from /proc/devices.
+			group, err := findDeviceGroup(rule.Type, rule.Major)
+			if err != nil {
+				return nil, fmt.Errorf("unable to find device '%v/%d': %w", rule.Type, rule.Major, err)
+			}
+			if group == "" {
+				// Couldn't find a group.
+				logrus.Warnf("could not find device group for '%v/%d' in /proc/devices -- temporarily ignoring rule: %v", rule.Type, rule.Major, *rule)
+				continue
+			}
+			entry.Path = group
+		} else {
+			// "_ n:m _" rules are just a path in /dev/{block,char}/.
+			switch rule.Type {
+			case devices.BlockDevice:
+				entry.Path = fmt.Sprintf("/dev/block/%d:%d", rule.Major, rule.Minor)
+			case devices.CharDevice:
+				entry.Path = fmt.Sprintf("/dev/char/%d:%d", rule.Major, rule.Minor)
+			}
+			if sdVer < 240 {
+				// Old systemd versions use stat(2) on path to find out device major:minor
+				// numbers and type. If the path doesn't exist, it will not add the rule,
+				// emitting a warning instead.
+				// Since all of this logic is best-effort anyway (we manually set these
+				// rules separately to systemd) we can safely skip entries that don't
+				// have a corresponding path.
+				if _, err := os.Stat(entry.Path); err != nil {
+					continue
+				}
+			}
+		}
+		deviceAllowList = append(deviceAllowList, entry)
+	}
+
+	// This second DeviceAllow property carries the real list; the first one
+	// above only empties the array.
+	properties = append(properties, newProp("DeviceAllow", deviceAllowList))
+	return properties, nil
+}
+
+// newProp builds a systemd unit property called name whose value is units
+// wrapped in a dbus variant.
+func newProp(name string, units interface{}) systemdDbus.Property {
+	return systemdDbus.Property{
+		Name:  name,
+		Value: dbus.MakeVariant(units),
+	}
+}
+
+// getUnitName returns the systemd unit name for the cgroup config c. A name
+// ending in ".slice" is an explicit request for a slice and is used as is;
+// anything else becomes the scope "<ScopePrefix>-<Name>.scope".
+func getUnitName(c *configs.Cgroup) string {
+	if strings.HasSuffix(c.Name, ".slice") {
+		return c.Name
+	}
+	// by default, we create a scope.
+	return c.ScopePrefix + "-" + c.Name + ".scope"
+}
+
+// getUnitType returns "Slice" for unit names ending in ".slice" and "Scope"
+// for everything else.
+//
+// This code should be in sync with getUnitName.
+func getUnitType(unitName string) string {
+	unitType := "Scope"
+	if strings.HasSuffix(unitName, ".slice") {
+		unitType = "Slice"
+	}
+	return unitType
+}
+
+// isDbusError reports whether err is (or wraps) a dbus.Error whose Name
+// contains name. A nil err is never a match.
+func isDbusError(err error, name string) bool {
+	if err == nil {
+		return false
+	}
+	var dbusErr dbus.Error
+	if !errors.As(err, &dbusErr) {
+		return false
+	}
+	return strings.Contains(dbusErr.Name, name)
+}
+
+// isUnitExists returns true if the error is that a systemd unit already exists,
+// i.e. a dbus error whose name contains org.freedesktop.systemd1.UnitExists.
+func isUnitExists(err error) bool {
+	return isDbusError(err, "org.freedesktop.systemd1.UnitExists")
+}
+
+// startUnit asks systemd to start the transient unit unitName with the given
+// properties (job mode "replace") and waits up to 30 seconds for the job
+// result.
+//
+// If systemd reports that the unit already exists, then: with ignoreExist
+// set, nil is returned right away; otherwise the unit is reset (it may be a
+// leftover failed unit) and the start is retried exactly once.
+//
+// An error is returned when the start request fails, when the job result is
+// anything other than "done", or when the wait times out. In the last two
+// cases the unit is reset first on a best-effort basis.
+func startUnit(cm *dbusConnManager, unitName string, properties []systemdDbus.Property, ignoreExist bool) error {
+	// Buffered so that the job result can be delivered even if nobody is
+	// receiving any more (e.g. after the timeout below).
+	statusChan := make(chan string, 1)
+	retry := true
+
+retry:
+	err := cm.retryOnDisconnect(func(c *systemdDbus.Conn) error {
+		_, err := c.StartTransientUnitContext(context.TODO(), unitName, "replace", properties, statusChan)
+		return err
+	})
+	if err != nil {
+		if !isUnitExists(err) {
+			return err
+		}
+		if ignoreExist {
+			// TODO: remove this hack.
+			// This is kubelet making sure a slice exists (see
+			// https://github.com/opencontainers/runc/pull/1124).
+			return nil
+		}
+		if retry {
+			// In case a unit with the same name exists, this may
+			// be a leftover failed unit. Reset it, so systemd can
+			// remove it, and retry once.
+			err = resetFailedUnit(cm, unitName)
+			if err != nil {
+				logrus.Warnf("unable to reset failed unit: %v", err)
+			}
+			retry = false
+			goto retry
+		}
+		return err
+	}
+
+	timeout := time.NewTimer(30 * time.Second)
+	defer timeout.Stop()
+
+	select {
+	case s := <-statusChan:
+		close(statusChan)
+		// Please refer to https://pkg.go.dev/github.com/coreos/go-systemd/v22/dbus#Conn.StartUnit
+		if s != "done" {
+			// Best effort: the reset error is deliberately ignored.
+			_ = resetFailedUnit(cm, unitName)
+			return fmt.Errorf("error creating systemd unit `%s`: got `%s`", unitName, s)
+		}
+	case <-timeout.C:
+		// Best effort: the reset error is deliberately ignored.
+		_ = resetFailedUnit(cm, unitName)
+		return errors.New("Timeout waiting for systemd to create " + unitName)
+	}
+
+	return nil
+}
+
+// stopUnit asks systemd to stop unitName (job mode "replace") and, if the
+// request was accepted, waits up to 30 seconds for the job result.
+//
+// The function is deliberately lenient: a failed stop request and a job
+// result other than "done" do not produce an error (the latter is only
+// logged). The only error returned is the timeout while waiting for the
+// result. Unless it times out, the unit is finally reset on a best-effort
+// basis so systemd can remove it if it is in a failed state.
+func stopUnit(cm *dbusConnManager, unitName string) error {
+	statusChan := make(chan string, 1)
+	err := cm.retryOnDisconnect(func(c *systemdDbus.Conn) error {
+		_, err := c.StopUnitContext(context.TODO(), unitName, "replace", statusChan)
+		return err
+	})
+	if err == nil {
+		timeout := time.NewTimer(30 * time.Second)
+		defer timeout.Stop()
+
+		select {
+		case s := <-statusChan:
+			close(statusChan)
+			// Please refer to https://godoc.org/github.com/coreos/go-systemd/v22/dbus#Conn.StartUnit
+			if s != "done" {
+				logrus.Warnf("error removing unit `%s`: got `%s`. Continuing...", unitName, s)
+			}
+		case <-timeout.C:
+			return errors.New("Timed out while waiting for systemd to remove " + unitName)
+		}
+	}
+
+	// In case of a failed unit, let systemd remove it.
+	_ = resetFailedUnit(cm, unitName)
+
+	return nil
+}
+
+// resetFailedUnit asks systemd to reset the failed state of the unit name,
+// reconnecting and retrying if the dbus connection turns out to be closed.
+func resetFailedUnit(cm *dbusConnManager, name string) error {
+	return cm.retryOnDisconnect(func(c *systemdDbus.Conn) error {
+		return c.ResetFailedUnitContext(context.TODO(), name)
+	})
+}
+
+// getUnitTypeProperty fetches the property propertyName of the given
+// unitType (e.g. as returned by getUnitType) for unit unitName, reconnecting
+// and retrying if the dbus connection turns out to be closed.
+func getUnitTypeProperty(cm *dbusConnManager, unitName string, unitType string, propertyName string) (*systemdDbus.Property, error) {
+	var prop *systemdDbus.Property
+	// The closure writes its result into prop; Err is a named result so
+	// both values can be assigned in one statement.
+	err := cm.retryOnDisconnect(func(c *systemdDbus.Conn) (Err error) {
+		prop, Err = c.GetUnitTypePropertyContext(context.TODO(), unitName, unitType, propertyName)
+		return Err
+	})
+	return prop, err
+}
+
+// setUnitProperties applies properties to the unit name, reconnecting and
+// retrying if the dbus connection turns out to be closed.
+func setUnitProperties(cm *dbusConnManager, name string, properties ...systemdDbus.Property) error {
+	return cm.retryOnDisconnect(func(c *systemdDbus.Conn) error {
+		// NOTE(review): the literal true is presumably the "runtime"
+		// flag of SetUnitProperties -- confirm against go-systemd.
+		return c.SetUnitPropertiesContext(context.TODO(), name, true, properties...)
+	})
+}
+
+// getManagerProperty returns the systemd manager property name as a plain
+// string. The raw value obtained over dbus is passed through
+// strconv.Unquote, so a value that is not a valid quoted string results in
+// an error.
+func getManagerProperty(cm *dbusConnManager, name string) (string, error) {
+	str := ""
+	err := cm.retryOnDisconnect(func(c *systemdDbus.Conn) error {
+		var err error
+		str, err = c.GetManagerProperty(name)
+		return err
+	})
+	if err != nil {
+		return "", err
+	}
+	return strconv.Unquote(str)
+}
+
+// systemdVersion returns the systemd version number. Detection (reading the
+// manager's "Version" property and parsing it) happens only on the first
+// call; the result is cached in the package-level version variable.
+//
+// Failures are logged, not returned. version starts out as -1; if reading
+// the property fails it stays -1, and if parsing fails it holds whatever
+// systemdVersionAtoi returned alongside the error.
+func systemdVersion(cm *dbusConnManager) int {
+	versionOnce.Do(func() {
+		version = -1
+		verStr, err := getManagerProperty(cm, "Version")
+		if err == nil {
+			version, err = systemdVersionAtoi(verStr)
+		}
+
+		if err != nil {
+			logrus.WithError(err).Error("unable to get systemd version")
+		}
+	})
+
+	return version
+}
+
+// systemdVersionAtoi extracts the leading version number from a systemd
+// version string.
+//
+// verStr should be of the form:
+// "v245.4-1.fc32", "245", "v245-1.fc32", "245-1.fc32" (without quotes).
+// The result for all of the above should be 245.
+//
+// On any failure it returns -1 together with an error. The value is the
+// same on every error path on purpose: systemdVersion stores it in its
+// cache, where -1 is the "unknown" marker.
+func systemdVersionAtoi(verStr string) (int, error) {
+	// Skip an optional "v" prefix and grab the first run of digits.
+	re := regexp.MustCompile(`v?([0-9]+)`)
+	matches := re.FindStringSubmatch(verStr)
+	if len(matches) < 2 {
+		return -1, fmt.Errorf("can't parse version %s: incorrect number of matches %v", verStr, matches)
+	}
+	ver, err := strconv.Atoi(matches[1])
+	if err != nil {
+		// Only reachable when the digits do not fit into an int.
+		return -1, fmt.Errorf("can't parse version: %w", err)
+	}
+	return ver, nil
+}
+
+// addCpuQuota appends the systemd properties for a CPU quota to *properties.
+//
+//   - If period is non-zero and systemd is v242 or newer, CPUQuotaPeriodUSec
+//     is set to period. With older versions it is skipped and a debug
+//     message is logged.
+//   - If quota or period is non-zero, CPUQuotaPerSecUSec is set. A positive
+//     quota is converted to microseconds per CPU second, using the default
+//     period when period is zero, and rounded up to a multiple of 10ms. A
+//     non-positive quota yields the maximum uint64 value, i.e. no limit.
+//
+// When both quota and period are zero nothing is appended.
+func addCpuQuota(cm *dbusConnManager, properties *[]systemdDbus.Property, quota int64, period uint64) {
+	if period != 0 {
+		// systemd only supports CPUQuotaPeriodUSec since v242
+		sdVer := systemdVersion(cm)
+		if sdVer >= 242 {
+			*properties = append(*properties,
+				newProp("CPUQuotaPeriodUSec", period))
+		} else {
+			logrus.Debugf("systemd v%d is too old to support CPUQuotaPeriodUSec"+
+				" (setting will still be applied to cgroupfs)", sdVer)
+		}
+	}
+	if quota != 0 || period != 0 {
+		// corresponds to USEC_INFINITY in systemd
+		cpuQuotaPerSecUSec := uint64(math.MaxUint64)
+		if quota > 0 {
+			if period == 0 {
+				// assume the default
+				period = defCPUQuotaPeriod
+			}
+			// systemd converts CPUQuotaPerSecUSec (microseconds per CPU second) to CPUQuota
+			// (integer percentage of CPU) internally.  This means that if a fractional percent of
+			// CPU is indicated by Resources.CpuQuota, we need to round up to the nearest
+			// 10ms (1% of a second) such that child cgroups can set the cpu.cfs_quota_us they expect.
+			cpuQuotaPerSecUSec = uint64(quota*1000000) / period
+			if cpuQuotaPerSecUSec%10000 != 0 {
+				cpuQuotaPerSecUSec = ((cpuQuotaPerSecUSec / 10000) + 1) * 10000
+			}
+		}
+		*properties = append(*properties,
+			newProp("CPUQuotaPerSecUSec", cpuQuotaPerSecUSec))
+	}
+}
+
+// addCpuset appends the AllowedCPUs and/or AllowedMemoryNodes properties to
+// *props for the cpuset lists cpus and mems. An empty list is skipped; if
+// both are empty nothing happens.
+//
+// With systemd older than v244 nothing is appended and nil is returned,
+// after logging a debug message. An error is returned only when a list
+// cannot be converted by RangeToBits.
+func addCpuset(cm *dbusConnManager, props *[]systemdDbus.Property, cpus, mems string) error {
+	if cpus == "" && mems == "" {
+		return nil
+	}
+
+	// systemd only supports AllowedCPUs/AllowedMemoryNodes since v244
+	sdVer := systemdVersion(cm)
+	if sdVer < 244 {
+		logrus.Debugf("systemd v%d is too old to support AllowedCPUs/AllowedMemoryNodes"+
+			" (settings will still be applied to cgroupfs)", sdVer)
+		return nil
+	}
+
+	if cpus != "" {
+		bits, err := RangeToBits(cpus)
+		if err != nil {
+			return fmt.Errorf("resources.CPU.Cpus=%q conversion error: %w",
+				cpus, err)
+		}
+		*props = append(*props,
+			newProp("AllowedCPUs", bits))
+	}
+	if mems != "" {
+		bits, err := RangeToBits(mems)
+		if err != nil {
+			return fmt.Errorf("resources.CPU.Mems=%q conversion error: %w",
+				mems, err)
+		}
+		*props = append(*props,
+			newProp("AllowedMemoryNodes", bits))
+	}
+	return nil
+}
--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/cpuset.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/cpuset.go
@@ -0,0 +1,60 @@
+package systemd
+
+import (
+	"errors"
+	"math/big"
+	"strconv"
+	"strings"
+)
+
+// RangeToBits converts a text representation of a CPU mask (as written to
+// or read from cgroups' cpuset.* files, e.g. "1,3-5") to a slice of bytes
+// with the corresponding bits set (as consumed by systemd over dbus as
+// AllowedCPUs/AllowedMemoryNodes unit property value).
+//
+// Elements are comma-separated; spaces around an element and empty elements
+// are tolerated. Each element is a single number or a "first-last" range.
+// The returned slice holds the least significant byte first. An error is
+// returned for unparsable numbers, for a range whose start is greater than
+// its end, and when no bit ends up set.
+func RangeToBits(str string) ([]byte, error) {
+	mask := new(big.Int)
+
+	for _, elem := range strings.Split(str, ",") {
+		elem = strings.TrimSpace(elem)
+		if elem == "" {
+			// extra commas are fine
+			continue
+		}
+
+		// A lone number is handled as the range "n-n".
+		bounds := strings.SplitN(elem, "-", 2)
+		first, err := strconv.ParseUint(bounds[0], 10, 32)
+		if err != nil {
+			return nil, err
+		}
+		last := first
+		if len(bounds) == 2 {
+			if last, err = strconv.ParseUint(bounds[1], 10, 32); err != nil {
+				return nil, err
+			}
+			if first > last {
+				return nil, errors.New("invalid range: " + elem)
+			}
+		}
+		for bit := first; bit <= last; bit++ {
+			mask.SetBit(mask, int(bit), 1)
+		}
+	}
+
+	out := mask.Bytes()
+	if len(out) == 0 {
+		// do not allow empty values
+		return nil, errors.New("empty value")
+	}
+
+	// big.Int gives the most significant byte first; systemd parses the
+	// cpuset in the opposite order, so flip the slice in place.
+	for i, j := 0, len(out)-1; i < j; i, j = i+1, j-1 {
+		out[i], out[j] = out[j], out[i]
+	}
+	return out, nil
+}
--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/dbus.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/dbus.go
@@ -0,0 +1,102 @@
+package systemd
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"sync"
+
+	systemdDbus "github.com/coreos/go-systemd/v22/dbus"
+	dbus "github.com/godbus/dbus/v5"
+)
+
+// Package-level state shared by every dbusConnManager.
+var (
+	// dbusC is the lazily created connection; nil when not connected.
+	dbusC        *systemdDbus.Conn
+	// dbusMu protects the variables in this block.
+	dbusMu       sync.RWMutex
+	// dbusInited is set once newDbusConnManager has been called.
+	dbusInited   bool
+	// dbusRootless records the rootless mode passed to newDbusConnManager.
+	dbusRootless bool
+)
+
+// dbusConnManager is a stateless handle for the systemd dbus connection;
+// the connection itself lives in package-level variables.
+type dbusConnManager struct{}
+
+// newDbusConnManager initializes systemd dbus connection manager.
+//
+// The rootless mode is recorded in package-level state, so every manager in
+// the process must use the same mode: calling this with a different
+// rootless value than an earlier call panics. No connection is opened here.
+func newDbusConnManager(rootless bool) *dbusConnManager {
+	dbusMu.Lock()
+	defer dbusMu.Unlock()
+	if dbusInited && rootless != dbusRootless {
+		panic("can't have both root and rootless dbus")
+	}
+	dbusInited = true
+	dbusRootless = rootless
+	return &dbusConnManager{}
+}
+
+// getConnection lazily initializes and returns systemd dbus connection.
+// An existing connection is returned as is; otherwise a new one is created
+// and stored for later calls. An error is returned if connecting fails.
+func (d *dbusConnManager) getConnection() (*systemdDbus.Conn, error) {
+	// In the case where dbusC != nil
+	// Use the read lock the first time to ensure
+	// that Conn can be acquired at the same time.
+	dbusMu.RLock()
+	if conn := dbusC; conn != nil {
+		dbusMu.RUnlock()
+		return conn, nil
+	}
+	dbusMu.RUnlock()
+
+	// In the case where dbusC == nil
+	// Use write lock to ensure that only one
+	// will be created
+	dbusMu.Lock()
+	defer dbusMu.Unlock()
+	// Check again: the connection may have been created while the lock
+	// was released above.
+	if conn := dbusC; conn != nil {
+		return conn, nil
+	}
+
+	conn, err := d.newConnection()
+	if err != nil {
+		// When dbus-user-session is not installed, we can't detect whether we should try to connect to user dbus or system dbus, so dbusRootless is set to false.
+		// This may fail with a cryptic error "read unix @->/run/systemd/private: read: connection reset by peer: unknown."
+		// https://github.com/moby/moby/issues/42793
+		return nil, fmt.Errorf("failed to connect to dbus (hint: for rootless containers, maybe you need to install dbus-user-session package, see https://github.com/opencontainers/runc/blob/master/docs/cgroup-v2.md): %w", err)
+	}
+	dbusC = conn
+	return conn, nil
+}
+
+// newConnection opens a fresh systemd dbus connection: to the user instance
+// in rootless mode, otherwise via systemdDbus.NewWithContext.
+func (d *dbusConnManager) newConnection() (*systemdDbus.Conn, error) {
+	if !dbusRootless {
+		return systemdDbus.NewWithContext(context.TODO())
+	}
+	return newUserSystemdDbus()
+}
+
+// resetConnection closes and forgets the shared connection so that the next
+// getConnection call reconnects. It only acts when conn is still the
+// current shared connection; otherwise it does nothing.
+func (d *dbusConnManager) resetConnection(conn *systemdDbus.Conn) {
+	dbusMu.Lock()
+	defer dbusMu.Unlock()
+	if dbusC == nil || dbusC != conn {
+		return
+	}
+	dbusC.Close()
+	dbusC = nil
+}
+
+// retryOnDisconnect runs op with the current dbus connection. If op fails
+// because the connection is closed (dbus.ErrClosed), the connection is reset
+// and op is run again on a new one, for as long as that keeps happening.
+// This helps when dbus was restarted and the cached connection is stale.
+// Any other outcome of op, and any failure to get a connection, is returned
+// as is.
+func (d *dbusConnManager) retryOnDisconnect(op func(*systemdDbus.Conn) error) error {
+	for {
+		conn, connErr := d.getConnection()
+		if connErr != nil {
+			return connErr
+		}
+		opErr := op(conn)
+		if opErr == nil || !errors.Is(opErr, dbus.ErrClosed) {
+			return opErr
+		}
+		// Stale connection: drop it and try again.
+		d.resetConnection(conn)
+	}
+}
--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/user.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/user.go
@@ -0,0 +1,106 @@
+package systemd
+
+import (
+	"bufio"
+	"bytes"
+	"errors"
+	"fmt"
+	"os"
+	"os/exec"
+	"path/filepath"
+	"strconv"
+	"strings"
+
+	systemdDbus "github.com/coreos/go-systemd/v22/dbus"
+	dbus "github.com/godbus/dbus/v5"
+
+	"github.com/opencontainers/runc/libcontainer/userns"
+)
+
+// newUserSystemdDbus creates a connection for systemd user-instance.
+//
+// The bus address and the UID are detected up front; failing to detect
+// either is returned as an error. The connection is dialed at the detected
+// address and authenticated with the EXTERNAL mechanism using that UID.
+func newUserSystemdDbus() (*systemdDbus.Conn, error) {
+	addr, err := DetectUserDbusSessionBusAddress()
+	if err != nil {
+		return nil, err
+	}
+	uid, err := DetectUID()
+	if err != nil {
+		return nil, err
+	}
+
+	return systemdDbus.NewConnection(func() (*dbus.Conn, error) {
+		conn, err := dbus.Dial(addr)
+		if err != nil {
+			return nil, fmt.Errorf("error while dialing %q: %w", addr, err)
+		}
+		methods := []dbus.Auth{dbus.AuthExternal(strconv.Itoa(uid))}
+		err = conn.Auth(methods)
+		if err != nil {
+			// Don't leak the half-set-up connection.
+			conn.Close()
+			return nil, fmt.Errorf("error while authenticating connection (address=%q, UID=%d): %w", addr, uid, err)
+		}
+		if err = conn.Hello(); err != nil {
+			conn.Close()
+			return nil, fmt.Errorf("error while sending Hello message (address=%q, UID=%d): %w", addr, uid, err)
+		}
+		return conn, nil
+	})
+}
+
+// DetectUID detects UID from the OwnerUID field of `busctl --user status`
+// if running in userNS. The value corresponds to sd_bus_creds_get_owner_uid(3) .
+//
+// Otherwise returns os.Getuid() .
+//
+// On failure (busctl cannot be run, its output has no OwnerUID line, or the
+// value is not a number) it returns -1 and an error.
+func DetectUID() (int, error) {
+	if !userns.RunningInUserNS() {
+		return os.Getuid(), nil
+	}
+	b, err := exec.Command("busctl", "--user", "--no-pager", "status").CombinedOutput()
+	if err != nil {
+		return -1, fmt.Errorf("could not execute `busctl --user --no-pager status` (output: %q): %w", string(b), err)
+	}
+	// Look for the first "OwnerUID=<n>" line in the output.
+	scanner := bufio.NewScanner(bytes.NewReader(b))
+	for scanner.Scan() {
+		s := strings.TrimSpace(scanner.Text())
+		if strings.HasPrefix(s, "OwnerUID=") {
+			uidStr := strings.TrimPrefix(s, "OwnerUID=")
+			i, err := strconv.Atoi(uidStr)
+			if err != nil {
+				return -1, fmt.Errorf("could not detect the OwnerUID: %w", err)
+			}
+			return i, nil
+		}
+	}
+	if err := scanner.Err(); err != nil {
+		return -1, err
+	}
+	return -1, errors.New("could not detect the OwnerUID")
+}
+
+// DetectUserDbusSessionBusAddress returns $DBUS_SESSION_BUS_ADDRESS if set.
+// Otherwise returns "unix:path=$XDG_RUNTIME_DIR/bus" if $XDG_RUNTIME_DIR/bus exists.
+// Otherwise parses the value from `systemctl --user show-environment` .
+//
+// An error is returned when systemctl cannot be run, when its output cannot
+// be scanned, or when the output has no DBUS_SESSION_BUS_ADDRESS entry.
+func DetectUserDbusSessionBusAddress() (string, error) {
+	if env := os.Getenv("DBUS_SESSION_BUS_ADDRESS"); env != "" {
+		return env, nil
+	}
+	if xdr := os.Getenv("XDG_RUNTIME_DIR"); xdr != "" {
+		busPath := filepath.Join(xdr, "bus")
+		if _, err := os.Stat(busPath); err == nil {
+			busAddress := "unix:path=" + busPath
+			return busAddress, nil
+		}
+	}
+	b, err := exec.Command("systemctl", "--user", "--no-pager", "show-environment").CombinedOutput()
+	if err != nil {
+		return "", fmt.Errorf("could not execute `systemctl --user --no-pager show-environment` (output=%q): %w", string(b), err)
+	}
+	scanner := bufio.NewScanner(bytes.NewReader(b))
+	for scanner.Scan() {
+		s := strings.TrimSpace(scanner.Text())
+		if strings.HasPrefix(s, "DBUS_SESSION_BUS_ADDRESS=") {
+			return strings.TrimPrefix(s, "DBUS_SESSION_BUS_ADDRESS="), nil
+		}
+	}
+	// As in DetectUID: report a scan failure as such, rather than as a
+	// missing variable.
+	if err := scanner.Err(); err != nil {
+		return "", fmt.Errorf("could not parse the output of `systemctl --user --no-pager show-environment`: %w", err)
+	}
+	return "", errors.New("could not detect DBUS_SESSION_BUS_ADDRESS from `systemctl --user --no-pager show-environment`. Make sure you have installed the dbus-user-session or dbus-daemon package and then run: `systemctl --user start dbus`")
+}
--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/v1.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/v1.go
@@ -0,0 +1,480 @@
+package systemd
+
+import (
+	"errors"
+	"os"
+	"path/filepath"
+	"reflect"
+	"strings"
+	"sync"
+
+	systemdDbus "github.com/coreos/go-systemd/v22/dbus"
+	"github.com/godbus/dbus/v5"
+	"github.com/sirupsen/logrus"
+
+	"github.com/opencontainers/runc/libcontainer/cgroups"
+	"github.com/opencontainers/runc/libcontainer/cgroups/fs"
+	"github.com/opencontainers/runc/libcontainer/configs"
+)
+
+// legacyManager is the systemd-driven cgroup manager for cgroup v1; it is
+// the concrete type NewLegacyManager returns as a cgroups.Manager.
+type legacyManager struct {
+	// mu is held by Apply, Destroy, Path, GetStats and GetPaths.
+	mu      sync.Mutex
+	// cgroups is the cgroup configuration handed to NewLegacyManager.
+	cgroups *configs.Cgroup
+	// paths maps a subsystem name to that subsystem's cgroup path; the ""
+	// key, when present, holds the cgroup-hybrid path (see initPaths).
+	paths   map[string]string
+	// dbus is the connection manager passed to every systemd helper call.
+	dbus    *dbusConnManager
+}
+
+func NewLegacyManager(cg *configs.Cgroup, paths map[string]string) (cgroups.Manager, error) {
+	if cg.Rootless {
+		return nil, errors.New("cannot use rootless systemd cgroups manager on cgroup v1")
+	}
+	if cg.Resources != nil && cg.Resources.Unified != nil {
+		return nil, cgroups.ErrV1NoUnified
+	}
+	if paths == nil {
+		var err error
+		paths, err = initPaths(cg)
+		if err != nil {
+			return nil, err
+		}
+	}
+	return &legacyManager{
+		cgroups: cg,
+		paths:   paths,
+		dbus:    newDbusConnManager(false),
+	}, nil
+}
+
+// subsystem is the set of operations this file needs from a cgroup v1
+// controller implementation; the entries of legacySubsystems satisfy it.
+type subsystem interface {
+	// Name returns the name of the subsystem.
+	Name() string
+	// Returns the stats, as 'stats', corresponding to the cgroup under 'path'.
+	GetStats(path string, stats *cgroups.Stats) error
+	// Set sets cgroup resource limits.
+	Set(path string, r *configs.Resources) error
+}
+
+// errSubsystemDoesNotExist is returned by doFreeze, GetPids and GetAllPids
+// when the manager has no path recorded for the subsystem they need.
+var errSubsystemDoesNotExist = errors.New("cgroup: subsystem does not exist")
+
+// legacySubsystems lists the cgroup v1 subsystems handled by legacyManager.
+// initPaths, joinCgroups, GetStats and Set iterate over it in this order.
+var legacySubsystems = []subsystem{
+	&fs.CpusetGroup{},
+	&fs.DevicesGroup{},
+	&fs.MemoryGroup{},
+	&fs.CpuGroup{},
+	&fs.CpuacctGroup{},
+	&fs.PidsGroup{},
+	&fs.BlkioGroup{},
+	&fs.HugetlbGroup{},
+	&fs.PerfEventGroup{},
+	&fs.FreezerGroup{},
+	&fs.NetPrioGroup{},
+	&fs.NetClsGroup{},
+	&fs.NameGroup{GroupName: "name=systemd"},
+	&fs.RdmaGroup{},
+	&fs.NameGroup{GroupName: "misc"},
+}
+
+// genV1ResourcesProperties translates r into the systemd unit properties used
+// on cgroup v1: device properties first, then MemoryLimit, CPUShares, CPU
+// quota, BlockIOWeight, TasksMax and finally the cpuset properties. Zero
+// values are left out; a PidsLimit of -1 is passed through as TasksMax.
+func genV1ResourcesProperties(r *configs.Resources, cm *dbusConnManager) ([]systemdDbus.Property, error) {
+	devProps, err := generateDeviceProperties(r, systemdVersion(cm))
+	if err != nil {
+		return nil, err
+	}
+
+	var props []systemdDbus.Property
+	props = append(props, devProps...)
+
+	if r.Memory != 0 {
+		props = append(props, newProp("MemoryLimit", uint64(r.Memory)))
+	}
+	if r.CpuShares != 0 {
+		props = append(props, newProp("CPUShares", r.CpuShares))
+	}
+
+	addCpuQuota(cm, &props, r.CpuQuota, r.CpuPeriod)
+
+	if r.BlkioWeight != 0 {
+		props = append(props, newProp("BlockIOWeight", uint64(r.BlkioWeight)))
+	}
+	if limit := r.PidsLimit; limit > 0 || limit == -1 {
+		props = append(props, newProp("TasksMax", uint64(limit)))
+	}
+
+	if err := addCpuset(cm, &props, r.CpusetCpus, r.CpusetMems); err != nil {
+		return nil, err
+	}
+	return props, nil
+}
+
+// initPaths figures out and returns paths to cgroups.
+//
+// The result maps each subsystem name from legacySubsystems to
+// <mountpoint>/<slice>/<unit>, where slice is "system.slice" unless c.Parent
+// is set (in which case it is expanded with ExpandSlice). Subsystems whose
+// hierarchy is not found are skipped, except "devices", which is mandatory.
+// In cgroup v2 hybrid mode the hybrid path is additionally stored under the
+// "" key. Any other lookup failure is returned to the caller.
+func initPaths(c *configs.Cgroup) (map[string]string, error) {
+	slice := "system.slice"
+	if c.Parent != "" {
+		var err error
+		slice, err = ExpandSlice(c.Parent)
+		if err != nil {
+			return nil, err
+		}
+	}
+
+	unit := getUnitName(c)
+
+	paths := make(map[string]string)
+	for _, s := range legacySubsystems {
+		subsystemPath, err := getSubsystemPath(slice, unit, s.Name())
+		if err != nil {
+			// Even if it's `not found` error, we'll return err
+			// because devices cgroup is hard requirement for
+			// container security.
+			if s.Name() == "devices" {
+				return nil, err
+			}
+			// Don't fail if a cgroup hierarchy was not found, just skip this subsystem
+			if cgroups.IsNotFound(err) {
+				continue
+			}
+			return nil, err
+		}
+		paths[s.Name()] = subsystemPath
+	}
+
+	// If systemd is using cgroups-hybrid mode then add the slice path of
+	// this container to the paths so the following process executed with
+	// "runc exec" joins that cgroup as well.
+	if cgroups.IsCgroup2HybridMode() {
+		// "" means cgroup-hybrid path
+		cgroupsHybridPath, err := getSubsystemPath(slice, unit, "")
+		// Fail on any error: silently ignoring one would record an
+		// empty path under the "" key.
+		if err != nil {
+			return nil, err
+		}
+		paths[""] = cgroupsHybridPath
+	}
+
+	return paths, nil
+}
+
+func (m *legacyManager) Apply(pid int) error {
+	var (
+		c          = m.cgroups
+		unitName   = getUnitName(c)
+		slice      = "system.slice"
+		properties []systemdDbus.Property
+	)
+
+	m.mu.Lock()
+	defer m.mu.Unlock()
+
+	if c.Parent != "" {
+		slice = c.Parent
+	}
+
+	properties = append(properties, systemdDbus.PropDescription("libcontainer container "+c.Name))
+
+	if strings.HasSuffix(unitName, ".slice") {
+		// If we create a slice, the parent is defined via a Wants=.
+		properties = append(properties, systemdDbus.PropWants(slice))
+	} else {
+		// Otherwise it's a scope, which we put into a Slice=.
+		properties = append(properties, systemdDbus.PropSlice(slice))
+		// Assume scopes always support delegation (supported since systemd v218).
+		properties = append(properties, newProp("Delegate", true))
+	}
+
+	// only add pid if its valid, -1 is used w/ general slice creation.
+	if pid != -1 {
+		properties = append(properties, newProp("PIDs", []uint32{uint32(pid)}))
+	}
+
+	// Always enable accounting, this gets us the same behaviour as the fs implementation,
+	// plus the kernel has some problems with joining the memory cgroup at a later time.
+	properties = append(properties,
+		newProp("MemoryAccounting", true),
+		newProp("CPUAccounting", true),
+		newProp("BlockIOAccounting", true),
+		newProp("TasksAccounting", true),
+	)
+
+	// Assume DefaultDependencies= will always work (the check for it was previously broken.)
+	properties = append(properties,
+		newProp("DefaultDependencies", false))
+
+	properties = append(properties, c.SystemdProps...)
+
+	if err := startUnit(m.dbus, unitName, properties, pid == -1); err != nil {
+		return err
+	}
+
+	if err := m.joinCgroups(pid); err != nil {
+		return err
+	}
+
+	return nil
+}
+
+// Destroy stops the systemd unit and removes every cgroup path the manager
+// knows about. Path removal is attempted even when stopping the unit fails,
+// because some cgroups were created directly by Apply() and are not managed
+// by systemd. The stop error takes precedence over the removal error.
+func (m *legacyManager) Destroy() error {
+	m.mu.Lock()
+	defer m.mu.Unlock()
+
+	stopErr := stopUnit(m.dbus, getUnitName(m.cgroups))
+	removeErr := cgroups.RemovePaths(m.paths)
+
+	if stopErr != nil {
+		return stopErr
+	}
+	return removeErr
+}
+
+// Path returns the cgroup path recorded for subsys, or "" when there is none.
+func (m *legacyManager) Path(subsys string) string {
+	m.mu.Lock()
+	p := m.paths[subsys]
+	m.mu.Unlock()
+	return p
+}
+
+// joinCgroups puts pid into the cgroup of every legacy subsystem that has a
+// recorded path. "name=systemd" is left to systemd, cpuset goes through
+// fs.CpusetGroup.ApplyDir, and for everything else the directory is created
+// and pid is written to it.
+func (m *legacyManager) joinCgroups(pid int) error {
+	for _, sys := range legacySubsystems {
+		name := sys.Name()
+		if name == "name=systemd" {
+			// let systemd handle this
+			continue
+		}
+
+		path, ok := m.paths[name]
+		if !ok {
+			continue
+		}
+
+		if name == "cpuset" {
+			cpuset := &fs.CpusetGroup{}
+			if err := cpuset.ApplyDir(path, m.cgroups.Resources, pid); err != nil {
+				return err
+			}
+			continue
+		}
+
+		if err := os.MkdirAll(path, 0o755); err != nil {
+			return err
+		}
+		if err := cgroups.WriteCgroupProc(path, pid); err != nil {
+			return err
+		}
+	}
+
+	return nil
+}
+
+// getSubsystemPath returns <mountpoint>/<slice>/<unit> for the given
+// subsystem, where the mountpoint comes from cgroups.FindCgroupMountpoint.
+// The lookup error, if any, is returned unchanged.
+func getSubsystemPath(slice, unit, subsystem string) (string, error) {
+	mnt, err := cgroups.FindCgroupMountpoint("", subsystem)
+	if err == nil {
+		return filepath.Join(mnt, slice, unit), nil
+	}
+	return "", err
+}
+
+// Freeze moves the cgroup to the given freezer state and, on success only,
+// records that state in m.cgroups.Resources.Freezer.
+func (m *legacyManager) Freeze(state configs.FreezerState) error {
+	if err := m.doFreeze(state); err != nil {
+		return err
+	}
+	m.cgroups.Resources.Freezer = state
+	return nil
+}
+
+// doFreeze applies the freezer state like Freeze does, but leaves
+// m.cgroups.Resources.Freezer untouched. It returns
+// errSubsystemDoesNotExist when no "freezer" path is recorded.
+func (m *legacyManager) doFreeze(state configs.FreezerState) error {
+	if path, ok := m.paths["freezer"]; ok {
+		freezer := &fs.FreezerGroup{}
+		return freezer.Set(path, &configs.Resources{Freezer: state})
+	}
+	return errSubsystemDoesNotExist
+}
+
+// GetPids returns what cgroups.GetPids reports for the "devices" cgroup
+// path, or errSubsystemDoesNotExist when that path is not recorded.
+func (m *legacyManager) GetPids() ([]int, error) {
+	if path, ok := m.paths["devices"]; ok {
+		return cgroups.GetPids(path)
+	}
+	return nil, errSubsystemDoesNotExist
+}
+
+// GetAllPids returns what cgroups.GetAllPids reports for the "devices"
+// cgroup path, or errSubsystemDoesNotExist when that path is not recorded.
+func (m *legacyManager) GetAllPids() ([]int, error) {
+	if path, ok := m.paths["devices"]; ok {
+		return cgroups.GetAllPids(path)
+	}
+	return nil, errSubsystemDoesNotExist
+}
+
+// GetStats collects statistics from every legacy subsystem that has a
+// non-empty recorded path. The first subsystem error aborts the collection.
+func (m *legacyManager) GetStats() (*cgroups.Stats, error) {
+	m.mu.Lock()
+	defer m.mu.Unlock()
+
+	stats := cgroups.NewStats()
+	for _, sys := range legacySubsystems {
+		if path := m.paths[sys.Name()]; path != "" {
+			if err := sys.GetStats(path, stats); err != nil {
+				return nil, err
+			}
+		}
+	}
+	return stats, nil
+}
+
+// freezeBeforeSet answers whether there is a need to freeze the cgroup before
+// applying its systemd unit properties, and thaw after, while avoiding
+// unnecessary freezer state changes.
+//
+// The reason why we have to freeze is that systemd's application of device
+// rules is done disruptively, resulting in spurious errors to common devices
+// (unlike our fs driver, they will happily write deny-all rules to running
+// containers). So we have to freeze the container to avoid the container get
+// an occasional "permission denied" error.
+//
+// needsFreeze tells the caller to freeze before setting the properties and
+// needsThaw tells it to thaw afterwards. err is non-nil only when reading the
+// current freezer state fails; in that case both flags are returned as true.
+func (m *legacyManager) freezeBeforeSet(unitName string, r *configs.Resources) (needsFreeze, needsThaw bool, err error) {
+	// Special case for SkipDevices (as used by Kubernetes to create pod
+	// cgroups with allow-all device policy).
+	if r.SkipDevices {
+		if r.SkipFreezeOnSet {
+			// Both needsFreeze and needsThaw are false.
+			return
+		}
+
+		// No need to freeze if SkipDevices is set, and either
+		// (1) systemd unit does not (yet) exist, or
+		// (2) it has DevicePolicy=auto and empty DeviceAllow list.
+		//
+		// Interestingly, (1) and (2) are the same here because
+		// a non-existent unit returns default properties,
+		// and settings in (2) are the defaults.
+		//
+		// Do not return errors from getUnitTypeProperty, as they alone
+		// should not prevent Set from working.
+
+		unitType := getUnitType(unitName)
+
+		devPolicy, e := getUnitTypeProperty(m.dbus, unitName, unitType, "DevicePolicy")
+		if e == nil && devPolicy.Value == dbus.MakeVariant("auto") {
+			devAllow, e := getUnitTypeProperty(m.dbus, unitName, unitType, "DeviceAllow")
+			if e == nil {
+				// DeviceAllow counts as empty when its value is a zero-length slice.
+				if rv := reflect.ValueOf(devAllow.Value.Value()); rv.Kind() == reflect.Slice && rv.Len() == 0 {
+					needsFreeze = false
+					needsThaw = false
+					return
+				}
+			}
+		}
+	}
+
+	// Default from here on: freeze, then thaw.
+	needsFreeze = true
+	needsThaw = true
+
+	// Check the current freezer state.
+	freezerState, err := m.GetFreezerState()
+	if err != nil {
+		return
+	}
+	if freezerState == configs.Frozen {
+		// Already frozen, and should stay frozen.
+		needsFreeze = false
+		needsThaw = false
+	}
+
+	if r.Freezer == configs.Frozen {
+		// Will be frozen anyway -- no need to thaw.
+		needsThaw = false
+	}
+	return
+}
+
+// Set applies the resources r to the cgroup: first as systemd unit
+// properties, then directly through every legacy subsystem that has a
+// recorded path.
+//
+// A nil r is a no-op and a non-nil r.Unified is rejected with
+// cgroups.ErrV1NoUnified. Depending on what freezeBeforeSet reports, the
+// cgroup is frozen before the unit properties are set and thawed afterwards.
+// Thaw failures are only logged.
+func (m *legacyManager) Set(r *configs.Resources) error {
+	if r == nil {
+		return nil
+	}
+	if r.Unified != nil {
+		return cgroups.ErrV1NoUnified
+	}
+	properties, err := genV1ResourcesProperties(r, m.dbus)
+	if err != nil {
+		return err
+	}
+
+	unitName := getUnitName(m.cgroups)
+	needsFreeze, needsThaw, err := m.freezeBeforeSet(unitName, r)
+	if err != nil {
+		return err
+	}
+
+	if needsFreeze {
+		if err := m.doFreeze(configs.Frozen); err != nil {
+			// If freezer cgroup isn't supported, we just warn about it.
+			logrus.Infof("freeze container before SetUnitProperties failed: %v", err)
+			// skip update the cgroup while frozen failed. #3803
+			if !errors.Is(err, errSubsystemDoesNotExist) {
+				// Undo a possibly partial freeze before giving up.
+				if needsThaw {
+					if thawErr := m.doFreeze(configs.Thawed); thawErr != nil {
+						logrus.Infof("thaw container after doFreeze failed: %v", thawErr)
+					}
+				}
+				return err
+			}
+		}
+	}
+	// Thaw before looking at setErr so the cgroup is thawed even when
+	// setting the properties failed.
+	setErr := setUnitProperties(m.dbus, unitName, properties...)
+	if needsThaw {
+		if err := m.doFreeze(configs.Thawed); err != nil {
+			logrus.Infof("thaw container after SetUnitProperties failed: %v", err)
+		}
+	}
+	if setErr != nil {
+		return setErr
+	}
+
+	for _, sys := range legacySubsystems {
+		// Get the subsystem path, but don't error out for not found cgroups.
+		path, ok := m.paths[sys.Name()]
+		if !ok {
+			continue
+		}
+		if err := sys.Set(path, r); err != nil {
+			return err
+		}
+	}
+
+	return nil
+}
+
+// GetPaths returns the manager's subsystem-to-path map. The map itself is
+// returned, not a copy.
+func (m *legacyManager) GetPaths() map[string]string {
+	m.mu.Lock()
+	paths := m.paths
+	m.mu.Unlock()
+	return paths
+}
+
+// GetCgroups returns the cgroup configuration this manager was created
+// with. The error is always nil.
+func (m *legacyManager) GetCgroups() (*configs.Cgroup, error) {
+	return m.cgroups, nil
+}
+
+// GetFreezerState reads the freezer state from the recorded "freezer" path.
+// When no such path is recorded it returns configs.Undefined and no error.
+func (m *legacyManager) GetFreezerState() (configs.FreezerState, error) {
+	if path, ok := m.paths["freezer"]; ok {
+		freezer := &fs.FreezerGroup{}
+		return freezer.GetState(path)
+	}
+	return configs.Undefined, nil
+}
+
+// Exists reports whether the recorded "devices" cgroup path exists, as
+// determined by cgroups.PathExists.
+func (m *legacyManager) Exists() bool {
+	return cgroups.PathExists(m.Path("devices"))
+}
+
+// OOMKillCount returns what fs.OOMKillCount reports for the recorded
+// "memory" cgroup path.
+func (m *legacyManager) OOMKillCount() (uint64, error) {
+	return fs.OOMKillCount(m.Path("memory"))
+}
--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/v2.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/systemd/v2.go
@@ -0,0 +1,472 @@
+package systemd
+
+import (
+	"bufio"
+	"errors"
+	"fmt"
+	"math"
+	"os"
+	"path/filepath"
+	"strconv"
+	"strings"
+	"sync"
+
+	systemdDbus "github.com/coreos/go-systemd/v22/dbus"
+	securejoin "github.com/cyphar/filepath-securejoin"
+	"github.com/sirupsen/logrus"
+
+	"github.com/opencontainers/runc/libcontainer/cgroups"
+	"github.com/opencontainers/runc/libcontainer/cgroups/fs2"
+	"github.com/opencontainers/runc/libcontainer/configs"
+)
+
+// unifiedManager is the systemd-driven cgroup manager that
+// NewUnifiedManager returns; it pairs systemd unit handling with an fs2
+// manager for the same path.
+type unifiedManager struct {
+	// mu is held by Destroy.
+	mu      sync.Mutex
+	// cgroups is the cgroup configuration handed to NewUnifiedManager.
+	cgroups *configs.Cgroup
+	// path is like "/sys/fs/cgroup/user.slice/user-1001.slice/session-1.scope"
+	path  string
+	// dbus is the connection manager passed to every systemd helper call.
+	dbus  *dbusConnManager
+	// fsMgr is the fs2 manager created for path; Freeze, GetStats, Set,
+	// Destroy, GetFreezerState and OOMKillCount delegate to it.
+	fsMgr cgroups.Manager
+}
+
+// NewUnifiedManager builds a systemd cgroup manager backed by an fs2
+// manager. When path is empty it is derived from config via initPath. The
+// D-Bus connection manager is created according to config.Rootless.
+func NewUnifiedManager(config *configs.Cgroup, path string) (cgroups.Manager, error) {
+	mgr := &unifiedManager{
+		cgroups: config,
+		path:    path,
+		dbus:    newDbusConnManager(config.Rootless),
+	}
+
+	err := mgr.initPath()
+	if err != nil {
+		return nil, err
+	}
+
+	// The fs2 manager must be created after initPath has settled mgr.path.
+	backing, err := fs2.NewManager(config, mgr.path)
+	if err != nil {
+		return nil, err
+	}
+	mgr.fsMgr = backing
+
+	return mgr, nil
+}
+
+// unifiedResToSystemdProps tries to convert from Cgroup.Resources.Unified
+// key/value map (where key is cgroupfs file name) to systemd unit properties.
+// This is on a best-effort basis, so the properties that are not known
+// (to this function and/or systemd) are ignored (but logged with "debug"
+// log level).
+//
+// An error is returned when a key contains a slash, is not of the form
+// CONTROLLER.PARAMETER, or when the value of a known key cannot be parsed.
+//
+// For the list of keys, see https://www.kernel.org/doc/Documentation/cgroup-v2.txt
+//
+// For the list of systemd unit properties, see systemd.resource-control(5).
+func unifiedResToSystemdProps(cm *dbusConnManager, res map[string]string) (props []systemdDbus.Property, _ error) {
+	var err error
+
+	for k, v := range res {
+		if strings.Contains(k, "/") {
+			return nil, fmt.Errorf("unified resource %q must be a file name (no slashes)", k)
+		}
+		sk := strings.SplitN(k, ".", 2)
+		if len(sk) != 2 {
+			return nil, fmt.Errorf("unified resource %q must be in the form CONTROLLER.PARAMETER", k)
+		}
+		// Kernel is quite forgiving to extra whitespace
+		// around the value, and so should we.
+		v = strings.TrimSpace(v)
+		// Please keep cases in alphabetical order.
+		switch k {
+		case "cpu.max":
+			// value: quota [period]
+			quota := int64(0) // 0 means "unlimited" for addCpuQuota, if period is set
+			period := defCPUQuotaPeriod
+			sv := strings.Fields(v)
+			if len(sv) < 1 || len(sv) > 2 {
+				return nil, fmt.Errorf("unified resource %q value invalid: %q", k, v)
+			}
+			// quota
+			if sv[0] != "max" {
+				quota, err = strconv.ParseInt(sv[0], 10, 64)
+				if err != nil {
+					return nil, fmt.Errorf("unified resource %q quota value conversion error: %w", k, err)
+				}
+			}
+			// period
+			if len(sv) == 2 {
+				period, err = strconv.ParseUint(sv[1], 10, 64)
+				if err != nil {
+					return nil, fmt.Errorf("unified resource %q period value conversion error: %w", k, err)
+				}
+			}
+			addCpuQuota(cm, &props, quota, period)
+
+		case "cpu.weight":
+			num, err := strconv.ParseUint(v, 10, 64)
+			if err != nil {
+				return nil, fmt.Errorf("unified resource %q value conversion error: %w", k, err)
+			}
+			props = append(props,
+				newProp("CPUWeight", num))
+
+		case "cpuset.cpus", "cpuset.mems":
+			bits, err := RangeToBits(v)
+			if err != nil {
+				return nil, fmt.Errorf("unified resource %q=%q conversion error: %w", k, v, err)
+			}
+			m := map[string]string{
+				"cpuset.cpus": "AllowedCPUs",
+				"cpuset.mems": "AllowedMemoryNodes",
+			}
+			// systemd only supports these properties since v244
+			sdVer := systemdVersion(cm)
+			if sdVer >= 244 {
+				props = append(props,
+					newProp(m[k], bits))
+			} else {
+				logrus.Debugf("systemd v%d is too old to support %s"+
+					" (setting will still be applied to cgroupfs)",
+					sdVer, m[k])
+			}
+
+		case "memory.high", "memory.low", "memory.min", "memory.max", "memory.swap.max":
+			// "max" is passed to systemd as the largest uint64.
+			num := uint64(math.MaxUint64)
+			if v != "max" {
+				num, err = strconv.ParseUint(v, 10, 64)
+				if err != nil {
+					return nil, fmt.Errorf("unified resource %q value conversion error: %w", k, err)
+				}
+			}
+			m := map[string]string{
+				"memory.high":     "MemoryHigh",
+				"memory.low":      "MemoryLow",
+				"memory.min":      "MemoryMin",
+				"memory.max":      "MemoryMax",
+				"memory.swap.max": "MemorySwapMax",
+			}
+			props = append(props,
+				newProp(m[k], num))
+
+		case "pids.max":
+			// "max" is passed to systemd as the largest uint64.
+			num := uint64(math.MaxUint64)
+			if v != "max" {
+				var err error
+				num, err = strconv.ParseUint(v, 10, 64)
+				if err != nil {
+					return nil, fmt.Errorf("unified resource %q value conversion error: %w", k, err)
+				}
+			}
+			props = append(props,
+				newProp("TasksMax", num))
+
+		case "memory.oom.group":
+			// Setting this to 1 is roughly equivalent to OOMPolicy=kill
+			// (as per systemd.service(5) and
+			// https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html),
+			// but it's not clear what to do if it is unset or set
+			// to 0 in runc update, as there are two other possible
+			// values for OOMPolicy (continue/stop).
+			fallthrough
+
+		default:
+			// Ignore the unknown resource here -- will still be
+			// applied in Set which calls fs2.Set.
+			logrus.Debugf("don't know how to convert unified resource %q=%q to systemd unit property; skipping (will still be applied to cgroupfs)", k, v)
+		}
+	}
+
+	return props, nil
+}
+
+// genV2ResourcesProperties translates r into the systemd unit properties
+// used by the unified manager: device properties first, then MemoryMax,
+// MemoryLow, MemorySwapMax, CPUWeight, CPU quota, TasksMax, the cpuset
+// properties and finally whatever r.Unified converts to. Zero values are
+// left out; a PidsLimit of -1 is passed through as TasksMax.
+func genV2ResourcesProperties(r *configs.Resources, cm *dbusConnManager) ([]systemdDbus.Property, error) {
+	// NOTE: This is of questionable correctness because we insert our own
+	//       devices eBPF program later. Two programs with identical rules
+	//       aren't the end of the world, but it is a bit concerning. However
+	//       it's unclear if systemd removes all eBPF programs attached when
+	//       doing SetUnitProperties...
+	devProps, err := generateDeviceProperties(r, systemdVersion(cm))
+	if err != nil {
+		return nil, err
+	}
+
+	var props []systemdDbus.Property
+	props = append(props, devProps...)
+
+	if r.Memory != 0 {
+		props = append(props, newProp("MemoryMax", uint64(r.Memory)))
+	}
+	if r.MemoryReservation != 0 {
+		props = append(props, newProp("MemoryLow", uint64(r.MemoryReservation)))
+	}
+
+	swap, err := cgroups.ConvertMemorySwapToCgroupV2Value(r.MemorySwap, r.Memory)
+	if err != nil {
+		return nil, err
+	}
+	if swap != 0 {
+		props = append(props, newProp("MemorySwapMax", uint64(swap)))
+	}
+
+	if r.CpuWeight != 0 {
+		props = append(props, newProp("CPUWeight", r.CpuWeight))
+	}
+
+	addCpuQuota(cm, &props, r.CpuQuota, r.CpuPeriod)
+
+	if limit := r.PidsLimit; limit > 0 || limit == -1 {
+		props = append(props, newProp("TasksMax", uint64(limit)))
+	}
+
+	if err := addCpuset(cm, &props, r.CpusetCpus, r.CpusetMems); err != nil {
+		return nil, err
+	}
+
+	// ignore r.KernelMemory
+
+	// convert Resources.Unified map to systemd properties
+	if r.Unified == nil {
+		return props, nil
+	}
+	unifiedProps, err := unifiedResToSystemdProps(cm, r.Unified)
+	if err != nil {
+		return nil, err
+	}
+	return append(props, unifiedProps...), nil
+}
+
+// Apply starts the systemd unit for this cgroup, creates the cgroup path
+// through fs2 and, when an OwnerUID is configured, chowns the cgroup
+// directory and its delegation files to that UID. A pid of -1 means no
+// process is added to the unit (used for general slice creation).
+func (m *unifiedManager) Apply(pid int) error {
+	c := m.cgroups
+	unitName := getUnitName(c)
+
+	// An explicit parent wins; otherwise the default depends on rootless mode.
+	var slice string
+	switch {
+	case c.Parent != "":
+		slice = c.Parent
+	case m.cgroups.Rootless:
+		slice = "user.slice"
+	default:
+		slice = "system.slice"
+	}
+
+	properties := []systemdDbus.Property{
+		systemdDbus.PropDescription("libcontainer container " + c.Name),
+	}
+
+	if strings.HasSuffix(unitName, ".slice") {
+		// If we create a slice, the parent is defined via a Wants=.
+		properties = append(properties, systemdDbus.PropWants(slice))
+	} else {
+		// Otherwise it's a scope, which we put into a Slice=.
+		// Assume scopes always support delegation (supported since systemd v218).
+		properties = append(properties,
+			systemdDbus.PropSlice(slice),
+			newProp("Delegate", true),
+		)
+	}
+
+	// only add pid if its valid, -1 is used w/ general slice creation.
+	if pid != -1 {
+		properties = append(properties, newProp("PIDs", []uint32{uint32(pid)}))
+	}
+
+	// Always enable accounting, this gets us the same behaviour as the fs implementation,
+	// plus the kernel has some problems with joining the memory cgroup at a later time.
+	// Assume DefaultDependencies= will always work (the check for it was previously broken.)
+	properties = append(properties,
+		newProp("MemoryAccounting", true),
+		newProp("CPUAccounting", true),
+		newProp("IOAccounting", true),
+		newProp("TasksAccounting", true),
+		newProp("DefaultDependencies", false),
+	)
+
+	// Caller-supplied properties go last.
+	properties = append(properties, c.SystemdProps...)
+
+	if err := startUnit(m.dbus, unitName, properties, pid == -1); err != nil {
+		return fmt.Errorf("unable to start unit %q (properties %+v): %w", unitName, properties, err)
+	}
+
+	if err := fs2.CreateCgroupPath(m.path, m.cgroups); err != nil {
+		return err
+	}
+
+	if c.OwnerUID == nil {
+		return nil
+	}
+	uid := *c.OwnerUID
+
+	// The directory itself must be chowned.
+	if err := os.Chown(m.path, uid, -1); err != nil {
+		return err
+	}
+
+	delegated, err := cgroupFilesToChown()
+	if err != nil {
+		return err
+	}
+	for _, name := range delegated {
+		// Some files might not be present.
+		if err := os.Chown(m.path+"/"+name, uid, -1); err != nil && !errors.Is(err, os.ErrNotExist) {
+			return err
+		}
+	}
+
+	return nil
+}
+
+// cgroupFilesToChown returns the names of the cgroup files that should be
+// chowned to the delegate uid. The kernel exposes that list, one name per
+// line, in /sys/kernel/cgroup/delegate. If the file cannot be opened
+// (e.g. Linux < 4.15), the initial values mentioned in cgroups(7) are
+// returned instead. A read error on an opened file is returned to the caller.
+func cgroupFilesToChown() ([]string, error) {
+	const cgroupDelegateFile = "/sys/kernel/cgroup/delegate"
+
+	delegate, openErr := os.Open(cgroupDelegateFile)
+	if openErr != nil {
+		fallback := []string{"cgroup.procs", "cgroup.subtree_control", "cgroup.threads"}
+		return fallback, nil
+	}
+	defer delegate.Close()
+
+	names := []string{}
+	lines := bufio.NewScanner(delegate)
+	for lines.Scan() {
+		names = append(names, lines.Text())
+	}
+	if readErr := lines.Err(); readErr != nil {
+		return nil, fmt.Errorf("error reading %s: %w", cgroupDelegateFile, readErr)
+	}
+
+	return names, nil
+}
+
+// Destroy stops the systemd unit and then destroys the cgroup through the
+// fs2 manager. If stopping the unit fails, that error is returned and the
+// fs2 manager is not called.
+func (m *unifiedManager) Destroy() error {
+	m.mu.Lock()
+	defer m.mu.Unlock()
+
+	if err := stopUnit(m.dbus, getUnitName(m.cgroups)); err != nil {
+		return err
+	}
+
+	// systemd 239 do not remove sub-cgroups.
+	// fsMgr.Destroy has handled ErrNotExist
+	return m.fsMgr.Destroy()
+}
+
+// Path returns the manager's single cgroup path; the subsystem argument is
+// ignored.
+func (m *unifiedManager) Path(_ string) string {
+	return m.path
+}
+
+// getSliceFull returns the slice part of the cgroup path, for use in
+// initPath. The value is incompatible with systemdDbus.PropSlice.
+//
+// The slice is the expanded c.Parent when one is set, else "user.slice" in
+// rootless mode, else "system.slice". In rootless mode the result is
+// additionally prefixed with the manager's ControlGroup property.
+func (m *unifiedManager) getSliceFull() (string, error) {
+	c := m.cgroups
+
+	var slice string
+	switch {
+	case c.Parent != "":
+		expanded, err := ExpandSlice(c.Parent)
+		if err != nil {
+			return "", err
+		}
+		slice = expanded
+	case c.Rootless:
+		slice = "user.slice"
+	default:
+		slice = "system.slice"
+	}
+
+	if !c.Rootless {
+		return slice, nil
+	}
+
+	// managerCG is typically "/user.slice/user-${uid}.slice/user@${uid}.service".
+	managerCG, err := getManagerProperty(m.dbus, "ControlGroup")
+	if err != nil {
+		return "", err
+	}
+
+	// an example of the final slice in rootless: "/user.slice/user-1001.slice/user@1001.service/user.slice"
+	// NOTE: systemdDbus.PropSlice requires the "/user.slice/user-1001.slice/user@1001.service/" prefix NOT to be specified.
+	return filepath.Join(managerCG, slice), nil
+}
+
+// initPath fills in m.path when it is empty: the full slice and the unit
+// name are joined and then resolved under fs2.UnifiedMountpoint with
+// securejoin.SecureJoin. A path that is already set is left untouched.
+func (m *unifiedManager) initPath() error {
+	if m.path != "" {
+		return nil
+	}
+
+	sliceFull, err := m.getSliceFull()
+	if err != nil {
+		return err
+	}
+
+	relative := filepath.Join(sliceFull, getUnitName(m.cgroups))
+	resolved, err := securejoin.SecureJoin(fs2.UnifiedMountpoint, relative)
+	if err != nil {
+		return err
+	}
+
+	// an example of the final path in rootless:
+	// "/sys/fs/cgroup/user.slice/user-1001.slice/user@1001.service/user.slice/libpod-132ff0d72245e6f13a3bbc6cdc5376886897b60ac59eaa8dea1df7ab959cbf1c.scope"
+	m.path = resolved
+	return nil
+}
+
+// Freeze delegates the freezer state change to the fs2 manager.
+func (m *unifiedManager) Freeze(state configs.FreezerState) error {
+	return m.fsMgr.Freeze(state)
+}
+
+// GetPids returns what cgroups.GetPids reports for the manager's path.
+func (m *unifiedManager) GetPids() ([]int, error) {
+	return cgroups.GetPids(m.path)
+}
+
+// GetAllPids returns what cgroups.GetAllPids reports for the manager's path.
+func (m *unifiedManager) GetAllPids() ([]int, error) {
+	return cgroups.GetAllPids(m.path)
+}
+
+// GetStats delegates statistics collection to the fs2 manager.
+func (m *unifiedManager) GetStats() (*cgroups.Stats, error) {
+	return m.fsMgr.GetStats()
+}
+
+// Set applies the resources r: first as systemd unit properties, then
+// through the fs2 manager. A nil r is a no-op. The fs2 manager is not called
+// when generating or setting the unit properties fails.
+func (m *unifiedManager) Set(r *configs.Resources) error {
+	if r == nil {
+		return nil
+	}
+
+	props, err := genV2ResourcesProperties(r, m.dbus)
+	if err != nil {
+		return err
+	}
+
+	unit := getUnitName(m.cgroups)
+	if err = setUnitProperties(m.dbus, unit, props...); err != nil {
+		return fmt.Errorf("unable to set unit properties: %w", err)
+	}
+
+	return m.fsMgr.Set(r)
+}
+
+// GetPaths returns a fresh single-entry map holding the manager's path
+// under the "" key.
+func (m *unifiedManager) GetPaths() map[string]string {
+	return map[string]string{"": m.path}
+}
+
+// GetCgroups returns the cgroup configuration this manager was created
+// with. The error is always nil.
+func (m *unifiedManager) GetCgroups() (*configs.Cgroup, error) {
+	return m.cgroups, nil
+}
+
+// GetFreezerState delegates to the fs2 manager.
+func (m *unifiedManager) GetFreezerState() (configs.FreezerState, error) {
+	return m.fsMgr.GetFreezerState()
+}
+
+// Exists reports whether the manager's cgroup path exists, as determined
+// by cgroups.PathExists.
+func (m *unifiedManager) Exists() bool {
+	return cgroups.PathExists(m.path)
+}
+
+// OOMKillCount delegates to the fs2 manager.
+func (m *unifiedManager) OOMKillCount() (uint64, error) {
+	return m.fsMgr.OOMKillCount()
+}