vendor: cadvisor v0.39.0

Main upgrades:
- github.com/opencontainers/runc v1.0.0-rc93
- github.com/containerd/containerd v1.4.4
- github.com/docker/docker v20.10.2
- github.com/mrunalp/fileutils v0.5.0
- github.com/opencontainers/selinux v1.8.0
- github.com/cilium/ebpf v0.2.0
Author: David Porter
Date:   2021-03-08 22:09:22 -08:00
Parent: faa3a5fbd4
Commit: b5dd78da3d
286 changed files with 7427 additions and 4415 deletions

@@ -60,87 +60,87 @@ defaultMountFlags := unix.MS_NOEXEC | unix.MS_NOSUID | unix.MS_NODEV
config := &configs.Config{
Rootfs: "/your/path/to/rootfs",
Capabilities: &configs.Capabilities{
Bounding: []string{
"CAP_CHOWN",
"CAP_DAC_OVERRIDE",
"CAP_FSETID",
"CAP_FOWNER",
"CAP_MKNOD",
"CAP_NET_RAW",
"CAP_SETGID",
"CAP_SETUID",
"CAP_SETFCAP",
"CAP_SETPCAP",
"CAP_NET_BIND_SERVICE",
"CAP_SYS_CHROOT",
"CAP_KILL",
"CAP_AUDIT_WRITE",
},
Effective: []string{
"CAP_CHOWN",
"CAP_DAC_OVERRIDE",
"CAP_FSETID",
"CAP_FOWNER",
"CAP_MKNOD",
"CAP_NET_RAW",
"CAP_SETGID",
"CAP_SETUID",
"CAP_SETFCAP",
"CAP_SETPCAP",
"CAP_NET_BIND_SERVICE",
"CAP_SYS_CHROOT",
"CAP_KILL",
"CAP_AUDIT_WRITE",
},
Inheritable: []string{
"CAP_CHOWN",
"CAP_DAC_OVERRIDE",
"CAP_FSETID",
"CAP_FOWNER",
"CAP_MKNOD",
"CAP_NET_RAW",
"CAP_SETGID",
"CAP_SETUID",
"CAP_SETFCAP",
"CAP_SETPCAP",
"CAP_NET_BIND_SERVICE",
"CAP_SYS_CHROOT",
"CAP_KILL",
"CAP_AUDIT_WRITE",
},
Permitted: []string{
"CAP_CHOWN",
"CAP_DAC_OVERRIDE",
"CAP_FSETID",
"CAP_FOWNER",
"CAP_MKNOD",
"CAP_NET_RAW",
"CAP_SETGID",
"CAP_SETUID",
"CAP_SETFCAP",
"CAP_SETPCAP",
"CAP_NET_BIND_SERVICE",
"CAP_SYS_CHROOT",
"CAP_KILL",
"CAP_AUDIT_WRITE",
},
Ambient: []string{
"CAP_CHOWN",
"CAP_DAC_OVERRIDE",
"CAP_FSETID",
"CAP_FOWNER",
"CAP_MKNOD",
"CAP_NET_RAW",
"CAP_SETGID",
"CAP_SETUID",
"CAP_SETFCAP",
"CAP_SETPCAP",
"CAP_NET_BIND_SERVICE",
"CAP_SYS_CHROOT",
"CAP_KILL",
"CAP_AUDIT_WRITE",
},
},
Bounding: []string{
"CAP_CHOWN",
"CAP_DAC_OVERRIDE",
"CAP_FSETID",
"CAP_FOWNER",
"CAP_MKNOD",
"CAP_NET_RAW",
"CAP_SETGID",
"CAP_SETUID",
"CAP_SETFCAP",
"CAP_SETPCAP",
"CAP_NET_BIND_SERVICE",
"CAP_SYS_CHROOT",
"CAP_KILL",
"CAP_AUDIT_WRITE",
},
Effective: []string{
"CAP_CHOWN",
"CAP_DAC_OVERRIDE",
"CAP_FSETID",
"CAP_FOWNER",
"CAP_MKNOD",
"CAP_NET_RAW",
"CAP_SETGID",
"CAP_SETUID",
"CAP_SETFCAP",
"CAP_SETPCAP",
"CAP_NET_BIND_SERVICE",
"CAP_SYS_CHROOT",
"CAP_KILL",
"CAP_AUDIT_WRITE",
},
Inheritable: []string{
"CAP_CHOWN",
"CAP_DAC_OVERRIDE",
"CAP_FSETID",
"CAP_FOWNER",
"CAP_MKNOD",
"CAP_NET_RAW",
"CAP_SETGID",
"CAP_SETUID",
"CAP_SETFCAP",
"CAP_SETPCAP",
"CAP_NET_BIND_SERVICE",
"CAP_SYS_CHROOT",
"CAP_KILL",
"CAP_AUDIT_WRITE",
},
Permitted: []string{
"CAP_CHOWN",
"CAP_DAC_OVERRIDE",
"CAP_FSETID",
"CAP_FOWNER",
"CAP_MKNOD",
"CAP_NET_RAW",
"CAP_SETGID",
"CAP_SETUID",
"CAP_SETFCAP",
"CAP_SETPCAP",
"CAP_NET_BIND_SERVICE",
"CAP_SYS_CHROOT",
"CAP_KILL",
"CAP_AUDIT_WRITE",
},
Ambient: []string{
"CAP_CHOWN",
"CAP_DAC_OVERRIDE",
"CAP_FSETID",
"CAP_FOWNER",
"CAP_MKNOD",
"CAP_NET_RAW",
"CAP_SETGID",
"CAP_SETUID",
"CAP_SETFCAP",
"CAP_SETPCAP",
"CAP_NET_BIND_SERVICE",
"CAP_SYS_CHROOT",
"CAP_KILL",
"CAP_AUDIT_WRITE",
},
},
Namespaces: configs.Namespaces([]configs.Namespace{
{Type: configs.NEWNS},
{Type: configs.NEWUTS},

@@ -1,8 +1,7 @@
// +build apparmor,linux
package apparmor
import (
"bytes"
"fmt"
"io/ioutil"
"os"
@@ -12,11 +11,9 @@ import (
// IsEnabled returns true if apparmor is enabled for the host.
func IsEnabled() bool {
if _, err := os.Stat("/sys/kernel/security/apparmor"); err == nil && os.Getenv("container") == "" {
if _, err = os.Stat("/sbin/apparmor_parser"); err == nil {
buf, err := ioutil.ReadFile("/sys/module/apparmor/parameters/enabled")
return err == nil && len(buf) > 1 && buf[0] == 'Y'
}
if _, err := os.Stat("/sys/kernel/security/apparmor"); err == nil {
buf, err := ioutil.ReadFile("/sys/module/apparmor/parameters/enabled")
return err == nil && bytes.HasPrefix(buf, []byte("Y"))
}
return false
}
@@ -24,9 +21,7 @@ func IsEnabled() bool {
func setProcAttr(attr, value string) error {
// Under AppArmor you can only change your own attr, so use /proc/self/
// instead of /proc/<tid>/ like libapparmor does
path := fmt.Sprintf("/proc/self/attr/%s", attr)
f, err := os.OpenFile(path, os.O_WRONLY, 0)
f, err := os.OpenFile("/proc/self/attr/"+attr, os.O_WRONLY, 0)
if err != nil {
return err
}
@@ -36,14 +31,13 @@ func setProcAttr(attr, value string) error {
return err
}
_, err = fmt.Fprintf(f, "%s", value)
_, err = f.WriteString(value)
return err
}
// changeOnExec reimplements aa_change_onexec from libapparmor in Go
func changeOnExec(name string) error {
value := "exec " + name
if err := setProcAttr("exec", value); err != nil {
if err := setProcAttr("exec", "exec "+name); err != nil {
return fmt.Errorf("apparmor failed to apply profile: %s", err)
}
return nil

@@ -0,0 +1,96 @@
// +build linux
package capabilities
import (
"fmt"
"strings"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/syndtr/gocapability/capability"
)
const allCapabilityTypes = capability.CAPS | capability.BOUNDS | capability.AMBS
var capabilityMap map[string]capability.Cap
func init() {
capabilityMap = make(map[string]capability.Cap, capability.CAP_LAST_CAP+1)
for _, c := range capability.List() {
if c > capability.CAP_LAST_CAP {
continue
}
capabilityMap["CAP_"+strings.ToUpper(c.String())] = c
}
}
// New creates a new Caps from the given Capabilities config.
func New(capConfig *configs.Capabilities) (*Caps, error) {
var (
err error
caps Caps
)
if caps.bounding, err = capSlice(capConfig.Bounding); err != nil {
return nil, err
}
if caps.effective, err = capSlice(capConfig.Effective); err != nil {
return nil, err
}
if caps.inheritable, err = capSlice(capConfig.Inheritable); err != nil {
return nil, err
}
if caps.permitted, err = capSlice(capConfig.Permitted); err != nil {
return nil, err
}
if caps.ambient, err = capSlice(capConfig.Ambient); err != nil {
return nil, err
}
if caps.pid, err = capability.NewPid2(0); err != nil {
return nil, err
}
if err = caps.pid.Load(); err != nil {
return nil, err
}
return &caps, nil
}
func capSlice(caps []string) ([]capability.Cap, error) {
out := make([]capability.Cap, len(caps))
for i, c := range caps {
v, ok := capabilityMap[c]
if !ok {
return nil, fmt.Errorf("unknown capability %q", c)
}
out[i] = v
}
return out, nil
}
// Caps holds the capabilities for a container.
type Caps struct {
pid capability.Capabilities
bounding []capability.Cap
effective []capability.Cap
inheritable []capability.Cap
permitted []capability.Cap
ambient []capability.Cap
}
// ApplyBoundingSet sets the capability bounding set to those specified in the whitelist.
func (c *Caps) ApplyBoundingSet() error {
c.pid.Clear(capability.BOUNDS)
c.pid.Set(capability.BOUNDS, c.bounding...)
return c.pid.Apply(capability.BOUNDS)
}
// Apply sets all the capabilities for the current process in the config.
func (c *Caps) ApplyCaps() error {
c.pid.Clear(allCapabilityTypes)
c.pid.Set(capability.BOUNDS, c.bounding...)
c.pid.Set(capability.PERMITTED, c.permitted...)
c.pid.Set(capability.INHERITABLE, c.inheritable...)
c.pid.Set(capability.EFFECTIVE, c.effective...)
c.pid.Set(capability.AMBIENT, c.ambient...)
return c.pid.Apply(allCapabilityTypes)
}
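For context, a minimal usage sketch of the new capabilities package introduced above; the import path and the surrounding error handling are assumptions based on this diff, not part of the change itself:

package example

import (
	"github.com/opencontainers/runc/libcontainer/capabilities"
	"github.com/opencontainers/runc/libcontainer/configs"
)

// applyWhitelist builds a Caps set from a configs.Capabilities whitelist
// (such as the example config near the top of this diff) and applies it
// to the current process.
func applyWhitelist(cfg *configs.Capabilities) error {
	caps, err := capabilities.New(cfg)
	if err != nil {
		return err
	}
	// ApplyCaps clears and re-sets the bounding, permitted, inheritable,
	// effective and ambient sets in one call.
	return caps.ApplyCaps()
}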

@@ -0,0 +1,3 @@
// +build !linux
package capabilities

@@ -1,117 +0,0 @@
// +build linux
package libcontainer
import (
"fmt"
"strings"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/syndtr/gocapability/capability"
)
const allCapabilityTypes = capability.CAPS | capability.BOUNDS | capability.AMBS
var capabilityMap map[string]capability.Cap
func init() {
capabilityMap = make(map[string]capability.Cap)
last := capability.CAP_LAST_CAP
// workaround for RHEL6 which has no /proc/sys/kernel/cap_last_cap
if last == capability.Cap(63) {
last = capability.CAP_BLOCK_SUSPEND
}
for _, cap := range capability.List() {
if cap > last {
continue
}
capKey := fmt.Sprintf("CAP_%s", strings.ToUpper(cap.String()))
capabilityMap[capKey] = cap
}
}
func newContainerCapList(capConfig *configs.Capabilities) (*containerCapabilities, error) {
bounding := []capability.Cap{}
for _, c := range capConfig.Bounding {
v, ok := capabilityMap[c]
if !ok {
return nil, fmt.Errorf("unknown capability %q", c)
}
bounding = append(bounding, v)
}
effective := []capability.Cap{}
for _, c := range capConfig.Effective {
v, ok := capabilityMap[c]
if !ok {
return nil, fmt.Errorf("unknown capability %q", c)
}
effective = append(effective, v)
}
inheritable := []capability.Cap{}
for _, c := range capConfig.Inheritable {
v, ok := capabilityMap[c]
if !ok {
return nil, fmt.Errorf("unknown capability %q", c)
}
inheritable = append(inheritable, v)
}
permitted := []capability.Cap{}
for _, c := range capConfig.Permitted {
v, ok := capabilityMap[c]
if !ok {
return nil, fmt.Errorf("unknown capability %q", c)
}
permitted = append(permitted, v)
}
ambient := []capability.Cap{}
for _, c := range capConfig.Ambient {
v, ok := capabilityMap[c]
if !ok {
return nil, fmt.Errorf("unknown capability %q", c)
}
ambient = append(ambient, v)
}
pid, err := capability.NewPid2(0)
if err != nil {
return nil, err
}
err = pid.Load()
if err != nil {
return nil, err
}
return &containerCapabilities{
bounding: bounding,
effective: effective,
inheritable: inheritable,
permitted: permitted,
ambient: ambient,
pid: pid,
}, nil
}
type containerCapabilities struct {
pid capability.Capabilities
bounding []capability.Cap
effective []capability.Cap
inheritable []capability.Cap
permitted []capability.Cap
ambient []capability.Cap
}
// ApplyBoundingSet sets the capability bounding set to those specified in the whitelist.
func (c *containerCapabilities) ApplyBoundingSet() error {
c.pid.Clear(capability.BOUNDS)
c.pid.Set(capability.BOUNDS, c.bounding...)
return c.pid.Apply(capability.BOUNDS)
}
// Apply sets all the capabilities for the current process in the config.
func (c *containerCapabilities) ApplyCaps() error {
c.pid.Clear(allCapabilityTypes)
c.pid.Set(capability.BOUNDS, c.bounding...)
c.pid.Set(capability.PERMITTED, c.permitted...)
c.pid.Set(capability.INHERITABLE, c.inheritable...)
c.pid.Set(capability.EFFECTIVE, c.effective...)
c.pid.Set(capability.AMBIENT, c.ambient...)
return c.pid.Apply(allCapabilityTypes)
}

@@ -27,29 +27,29 @@ import (
"sort"
"strconv"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/devices"
"github.com/pkg/errors"
)
// deviceMeta is a DeviceRule without the Allow or Permissions fields, and no
// deviceMeta is a Rule without the Allow or Permissions fields, and no
// wildcard-type support. It's effectively the "match" portion of a metadata
// rule, for the purposes of our emulation.
type deviceMeta struct {
node configs.DeviceType
node devices.Type
major int64
minor int64
}
// deviceRule is effectively the tuple (deviceMeta, DevicePermissions).
// deviceRule is effectively the tuple (deviceMeta, Permissions).
type deviceRule struct {
meta deviceMeta
perms configs.DevicePermissions
perms devices.Permissions
}
// deviceRules is a mapping of device metadata rules to the associated
// permissions in the ruleset.
type deviceRules map[deviceMeta]configs.DevicePermissions
type deviceRules map[deviceMeta]devices.Permissions
func (r deviceRules) orderedEntries() []deviceRule {
var rules []deviceRule
@@ -103,9 +103,9 @@ func parseLine(line string) (*deviceRule, error) {
// TODO: Double-check that the entire file is "a *:* rwm".
return nil, nil
case "b":
rule.meta.node = configs.BlockDevice
rule.meta.node = devices.BlockDevice
case "c":
rule.meta.node = configs.CharDevice
rule.meta.node = devices.CharDevice
default:
// Should never happen!
return nil, errors.Errorf("unknown device type %q", node)
@@ -113,7 +113,7 @@ func parseLine(line string) (*deviceRule, error) {
// Parse the major number.
if major == "*" {
rule.meta.major = configs.Wildcard
rule.meta.major = devices.Wildcard
} else {
val, err := strconv.ParseUint(major, 10, 32)
if err != nil {
@@ -124,7 +124,7 @@ func parseLine(line string) (*deviceRule, error) {
// Parse the minor number.
if minor == "*" {
rule.meta.minor = configs.Wildcard
rule.meta.minor = devices.Wildcard
} else {
val, err := strconv.ParseUint(minor, 10, 32)
if err != nil {
@@ -134,7 +134,7 @@ func parseLine(line string) (*deviceRule, error) {
}
// Parse the access permissions.
rule.perms = configs.DevicePermissions(perms)
rule.perms = devices.Permissions(perms)
if !rule.perms.IsValid() || rule.perms.IsEmpty() {
// Should never happen!
return nil, errors.Errorf("parse access mode: contained unknown modes or is empty: %q", perms)
@@ -144,7 +144,7 @@ func parseLine(line string) (*deviceRule, error) {
func (e *Emulator) addRule(rule deviceRule) error {
if e.rules == nil {
e.rules = make(map[deviceMeta]configs.DevicePermissions)
e.rules = make(map[deviceMeta]devices.Permissions)
}
// Merge with any pre-existing permissions.
@@ -169,9 +169,9 @@ func (e *Emulator) rmRule(rule deviceRule) error {
// to mention it'd be really slow (the kernel side is implemented as a
// linked-list of exceptions).
for _, partialMeta := range []deviceMeta{
{node: rule.meta.node, major: configs.Wildcard, minor: rule.meta.minor},
{node: rule.meta.node, major: rule.meta.major, minor: configs.Wildcard},
{node: rule.meta.node, major: configs.Wildcard, minor: configs.Wildcard},
{node: rule.meta.node, major: devices.Wildcard, minor: rule.meta.minor},
{node: rule.meta.node, major: rule.meta.major, minor: devices.Wildcard},
{node: rule.meta.node, major: devices.Wildcard, minor: devices.Wildcard},
} {
// This wildcard rule is equivalent to the requested rule, so skip it.
if rule.meta == partialMeta {
@@ -202,7 +202,7 @@ func (e *Emulator) rmRule(rule deviceRule) error {
func (e *Emulator) allow(rule *deviceRule) error {
// This cgroup is configured as a black-list. Reset the entire emulator,
// and put is into black-list mode.
if rule == nil || rule.meta.node == configs.WildcardDevice {
if rule == nil || rule.meta.node == devices.WildcardDevice {
*e = Emulator{
defaultAllow: true,
rules: nil,
@@ -222,7 +222,7 @@ func (e *Emulator) allow(rule *deviceRule) error {
func (e *Emulator) deny(rule *deviceRule) error {
// This cgroup is configured as a white-list. Reset the entire emulator,
// and put is into white-list mode.
if rule == nil || rule.meta.node == configs.WildcardDevice {
if rule == nil || rule.meta.node == devices.WildcardDevice {
*e = Emulator{
defaultAllow: false,
rules: nil,
@@ -239,7 +239,7 @@ func (e *Emulator) deny(rule *deviceRule) error {
return err
}
func (e *Emulator) Apply(rule configs.DeviceRule) error {
func (e *Emulator) Apply(rule devices.Rule) error {
if !rule.Type.CanCgroup() {
return errors.Errorf("cannot add rule [%#v] with non-cgroup type %q", rule, rule.Type)
}
@@ -252,7 +252,7 @@ func (e *Emulator) Apply(rule configs.DeviceRule) error {
},
perms: rule.Permissions,
}
if innerRule.meta.node == configs.WildcardDevice {
if innerRule.meta.node == devices.WildcardDevice {
innerRule = nil
}
@@ -307,8 +307,8 @@ func EmulatorFromList(list io.Reader) (*Emulator, error) {
// This function is the sole reason for all of Emulator -- to allow us
// to figure out how to update a containers' cgroups without causing spurrious
// device errors (if possible).
func (source *Emulator) Transition(target *Emulator) ([]*configs.DeviceRule, error) {
var transitionRules []*configs.DeviceRule
func (source *Emulator) Transition(target *Emulator) ([]*devices.Rule, error) {
var transitionRules []*devices.Rule
oldRules := source.rules
// If the default policy doesn't match, we need to include a "disruptive"
@@ -319,11 +319,11 @@ func (source *Emulator) Transition(target *Emulator) ([]*configs.DeviceRule, err
// deny rules are in place in a black-list cgroup. Thus if the source is a
// black-list we also have to include a disruptive rule.
if source.IsBlacklist() || source.defaultAllow != target.defaultAllow {
transitionRules = append(transitionRules, &configs.DeviceRule{
transitionRules = append(transitionRules, &devices.Rule{
Type: 'a',
Major: -1,
Minor: -1,
Permissions: configs.DevicePermissions("rwm"),
Permissions: devices.Permissions("rwm"),
Allow: target.defaultAllow,
})
// The old rules are only relevant if we aren't starting out with a
@@ -342,7 +342,7 @@ func (source *Emulator) Transition(target *Emulator) ([]*configs.DeviceRule, err
newPerms := target.rules[meta]
droppedPerms := oldPerms.Difference(newPerms)
if !droppedPerms.IsEmpty() {
transitionRules = append(transitionRules, &configs.DeviceRule{
transitionRules = append(transitionRules, &devices.Rule{
Type: meta.node,
Major: meta.major,
Minor: meta.minor,
@@ -360,7 +360,7 @@ func (source *Emulator) Transition(target *Emulator) ([]*configs.DeviceRule, err
oldPerms := oldRules[meta]
gainedPerms := newPerms.Difference(oldPerms)
if !gainedPerms.IsEmpty() {
transitionRules = append(transitionRules, &configs.DeviceRule{
transitionRules = append(transitionRules, &devices.Rule{
Type: meta.node,
Major: meta.major,
Minor: meta.minor,
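The Transition method above is, per its own comment, the reason Emulator exists. Below is a hedged sketch of how a caller might pair it with EmulatorFromList; the import paths and the cgroupdevices alias are taken from the fs devices hunk later in this diff, the rest is illustrative:

package example

import (
	"strings"

	cgroupdevices "github.com/opencontainers/runc/libcontainer/cgroups/devices"
	"github.com/opencontainers/runc/libcontainer/devices"
)

// computeTransition loads the current contents of a cgroup's devices.list,
// builds a target Emulator from the desired rules, and returns the minimal
// set of rules needed to move from the current state to the target.
func computeTransition(devicesList string, wanted []*devices.Rule) ([]*devices.Rule, error) {
	cur, err := cgroupdevices.EmulatorFromList(strings.NewReader(devicesList))
	if err != nil {
		return nil, err
	}
	// The zero-value Emulator defaults to a white-list, as noted in the
	// fs2 devices hunk below.
	target := &cgroupdevices.Emulator{}
	for _, r := range wanted {
		if err := target.Apply(*r); err != nil {
			return nil, err
		}
	}
	return cur.Transition(target)
}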

@@ -1,4 +1,4 @@
// Package devicefilter containes eBPF device filter program
// Package devicefilter contains eBPF device filter program
//
// The implementation is based on https://github.com/containers/crun/blob/0.10.2/src/libcrun/ebpf.c
//
@@ -7,11 +7,11 @@
package devicefilter
import (
"fmt"
"math"
"strconv"
"github.com/cilium/ebpf/asm"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/devices"
"github.com/pkg/errors"
"golang.org/x/sys/unix"
)
@@ -22,7 +22,7 @@ const (
)
// DeviceFilter returns eBPF device filter program and its license string
func DeviceFilter(devices []*configs.DeviceRule) (asm.Instructions, string, error) {
func DeviceFilter(devices []*devices.Rule) (asm.Instructions, string, error) {
p := &program{}
p.init()
for i := len(devices) - 1; i >= 0; i-- {
@@ -68,7 +68,7 @@ func (p *program) init() {
}
// appendDevice needs to be called from the last element of OCI linux.resources.devices to the head element.
func (p *program) appendDevice(dev *configs.DeviceRule) error {
func (p *program) appendDevice(dev *devices.Rule) error {
if p.blockID < 0 {
return errors.New("the program is finalized")
}
@@ -88,7 +88,7 @@ func (p *program) appendDevice(dev *configs.DeviceRule) error {
hasType = false
default:
// if not specified in OCI json, typ is set to DeviceTypeAll
return errors.Errorf("invalid DeviceType %q", string(dev.Type))
return errors.Errorf("invalid Type %q", string(dev.Type))
}
if dev.Major > math.MaxUint32 {
return errors.Errorf("invalid major %d", dev.Major)
@@ -114,9 +114,11 @@ func (p *program) appendDevice(dev *configs.DeviceRule) error {
// If the access is rwm, skip the check.
hasAccess := bpfAccess != (unix.BPF_DEVCG_ACC_READ | unix.BPF_DEVCG_ACC_WRITE | unix.BPF_DEVCG_ACC_MKNOD)
blockSym := fmt.Sprintf("block-%d", p.blockID)
nextBlockSym := fmt.Sprintf("block-%d", p.blockID+1)
prevBlockLastIdx := len(p.insts) - 1
var (
blockSym = "block-" + strconv.Itoa(p.blockID)
nextBlockSym = "block-" + strconv.Itoa(p.blockID+1)
prevBlockLastIdx = len(p.insts) - 1
)
if hasType {
p.insts = append(p.insts,
// if (R2 != bpfType) goto next
@@ -158,7 +160,7 @@ func (p *program) finalize() (asm.Instructions, error) {
// acceptBlock with asm.Return() is already inserted
return p.insts, nil
}
blockSym := fmt.Sprintf("block-%d", p.blockID)
blockSym := "block-" + strconv.Itoa(p.blockID)
p.insts = append(p.insts,
// R0 <- 0
asm.Mov.Imm32(asm.R0, 0).Sym(blockSym),
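A hedged sketch of how the updated DeviceFilter signature is consumed; the LoadAttachCgroupDeviceFilter call mirrors the fs2 devices hunk later in this diff, while the file-descriptor plumbing here is an assumption:

package example

import (
	"github.com/opencontainers/runc/libcontainer/cgroups/ebpf"
	"github.com/opencontainers/runc/libcontainer/cgroups/ebpf/devicefilter"
	"github.com/opencontainers/runc/libcontainer/devices"
	"golang.org/x/sys/unix"
)

// attachDeviceFilter compiles the given device rules into an eBPF program
// and attaches it to the cgroup v2 directory at cgroupPath.
func attachDeviceFilter(rules []*devices.Rule, cgroupPath string) error {
	insts, license, err := devicefilter.DeviceFilter(rules)
	if err != nil {
		return err
	}
	dirFD, err := unix.Open(cgroupPath, unix.O_DIRECTORY|unix.O_RDONLY, 0o600)
	if err != nil {
		return err
	}
	defer unix.Close(dirFD)
	// The first return value is discarded here, matching the fs2 hunk below.
	_, err = ebpf.LoadAttachCgroupDeviceFilter(insts, license, dirFD)
	return err
}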

@@ -6,7 +6,6 @@ import (
"bufio"
"fmt"
"os"
"path/filepath"
"strconv"
"strings"
@@ -105,9 +104,9 @@ func splitBlkioStatLine(r rune) bool {
return r == ' ' || r == ':'
}
func getBlkioStat(path string) ([]cgroups.BlkioStatEntry, error) {
func getBlkioStat(dir, file string) ([]cgroups.BlkioStatEntry, error) {
var blkioStats []cgroups.BlkioStatEntry
f, err := os.Open(path)
f, err := fscommon.OpenFile(dir, file, os.O_RDONLY)
if err != nil {
if os.IsNotExist(err) {
return blkioStats, nil
@@ -125,7 +124,7 @@ func getBlkioStat(path string) ([]cgroups.BlkioStatEntry, error) {
// skip total line
continue
} else {
return nil, fmt.Errorf("Invalid line found while parsing %s: %s", path, sc.Text())
return nil, fmt.Errorf("Invalid line found while parsing %s/%s: %s", dir, file, sc.Text())
}
}
@@ -158,73 +157,134 @@ func getBlkioStat(path string) ([]cgroups.BlkioStatEntry, error) {
}
func (s *BlkioGroup) GetStats(path string, stats *cgroups.Stats) error {
// Try to read CFQ stats available on all CFQ enabled kernels first
if blkioStats, err := getBlkioStat(filepath.Join(path, "blkio.io_serviced_recursive")); err == nil && blkioStats != nil {
return getCFQStats(path, stats)
type blkioStatInfo struct {
filename string
blkioStatEntriesPtr *[]cgroups.BlkioStatEntry
}
var bfqDebugStats = []blkioStatInfo{
{
filename: "blkio.bfq.sectors_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.SectorsRecursive,
},
{
filename: "blkio.bfq.io_service_time_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoServiceTimeRecursive,
},
{
filename: "blkio.bfq.io_wait_time_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoWaitTimeRecursive,
},
{
filename: "blkio.bfq.io_merged_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoMergedRecursive,
},
{
filename: "blkio.bfq.io_queued_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoQueuedRecursive,
},
{
filename: "blkio.bfq.time_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoTimeRecursive,
},
{
filename: "blkio.bfq.io_serviced_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoServicedRecursive,
},
{
filename: "blkio.bfq.io_service_bytes_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoServiceBytesRecursive,
},
}
var bfqStats = []blkioStatInfo{
{
filename: "blkio.bfq.io_serviced_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoServicedRecursive,
},
{
filename: "blkio.bfq.io_service_bytes_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoServiceBytesRecursive,
},
}
var cfqStats = []blkioStatInfo{
{
filename: "blkio.sectors_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.SectorsRecursive,
},
{
filename: "blkio.io_service_time_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoServiceTimeRecursive,
},
{
filename: "blkio.io_wait_time_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoWaitTimeRecursive,
},
{
filename: "blkio.io_merged_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoMergedRecursive,
},
{
filename: "blkio.io_queued_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoQueuedRecursive,
},
{
filename: "blkio.time_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoTimeRecursive,
},
{
filename: "blkio.io_serviced_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoServicedRecursive,
},
{
filename: "blkio.io_service_bytes_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoServiceBytesRecursive,
},
}
var throttleRecursiveStats = []blkioStatInfo{
{
filename: "blkio.throttle.io_serviced_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoServicedRecursive,
},
{
filename: "blkio.throttle.io_service_bytes_recursive",
blkioStatEntriesPtr: &stats.BlkioStats.IoServiceBytesRecursive,
},
}
var baseStats = []blkioStatInfo{
{
filename: "blkio.throttle.io_serviced",
blkioStatEntriesPtr: &stats.BlkioStats.IoServicedRecursive,
},
{
filename: "blkio.throttle.io_service_bytes",
blkioStatEntriesPtr: &stats.BlkioStats.IoServiceBytesRecursive,
},
}
var orderedStats = [][]blkioStatInfo{
bfqDebugStats,
bfqStats,
cfqStats,
throttleRecursiveStats,
baseStats,
}
return getStats(path, stats) // Use generic stats as fallback
}
func getCFQStats(path string, stats *cgroups.Stats) error {
var blkioStats []cgroups.BlkioStatEntry
var err error
if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.sectors_recursive")); err != nil {
return err
for _, statGroup := range orderedStats {
for i, statInfo := range statGroup {
if blkioStats, err = getBlkioStat(path, statInfo.filename); err != nil || blkioStats == nil {
// if error occurs on first file, move to next group
if i == 0 {
break
}
return err
}
*statInfo.blkioStatEntriesPtr = blkioStats
//finish if all stats are gathered
if i == len(statGroup)-1 {
return nil
}
}
}
stats.BlkioStats.SectorsRecursive = blkioStats
if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.io_service_bytes_recursive")); err != nil {
return err
}
stats.BlkioStats.IoServiceBytesRecursive = blkioStats
if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.io_serviced_recursive")); err != nil {
return err
}
stats.BlkioStats.IoServicedRecursive = blkioStats
if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.io_queued_recursive")); err != nil {
return err
}
stats.BlkioStats.IoQueuedRecursive = blkioStats
if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.io_service_time_recursive")); err != nil {
return err
}
stats.BlkioStats.IoServiceTimeRecursive = blkioStats
if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.io_wait_time_recursive")); err != nil {
return err
}
stats.BlkioStats.IoWaitTimeRecursive = blkioStats
if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.io_merged_recursive")); err != nil {
return err
}
stats.BlkioStats.IoMergedRecursive = blkioStats
if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.time_recursive")); err != nil {
return err
}
stats.BlkioStats.IoTimeRecursive = blkioStats
return nil
}
func getStats(path string, stats *cgroups.Stats) error {
var blkioStats []cgroups.BlkioStatEntry
var err error
if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.throttle.io_service_bytes")); err != nil {
return err
}
stats.BlkioStats.IoServiceBytesRecursive = blkioStats
if blkioStats, err = getBlkioStat(filepath.Join(path, "blkio.throttle.io_serviced")); err != nil {
return err
}
stats.BlkioStats.IoServicedRecursive = blkioStats
return nil
}

@@ -6,7 +6,6 @@ import (
"bufio"
"fmt"
"os"
"path/filepath"
"strconv"
"github.com/opencontainers/runc/libcontainer/cgroups"
@@ -87,7 +86,7 @@ func (s *CpuGroup) Set(path string, cgroup *configs.Cgroup) error {
}
func (s *CpuGroup) GetStats(path string, stats *cgroups.Stats) error {
f, err := os.Open(filepath.Join(path, "cpu.stat"))
f, err := fscommon.OpenFile(path, "cpu.stat", os.O_RDONLY)
if err != nil {
if os.IsNotExist(err) {
return nil
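This hunk, like several before and after it, replaces direct os/ioutil calls on filepath.Join(path, file) with fscommon helpers that take the directory and the file name separately. A hedged sketch of the three helpers as they appear in this diff; the specific file names and the final write are illustrative only:

package example

import (
	"bufio"
	"os"

	"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
)

// helperPattern shows the (dir, file) style used throughout the fs and fs2
// hunks: OpenFile for line-by-line scanning, ReadFile for whole-file reads,
// and WriteFile for writes.
func helperPattern(path string) error {
	f, err := fscommon.OpenFile(path, "cpu.stat", os.O_RDONLY)
	if err != nil {
		return err
	}
	defer f.Close()
	sc := bufio.NewScanner(f)
	for sc.Scan() {
		_ = sc.Text() // the real code parses each line here
	}
	if err := sc.Err(); err != nil {
		return err
	}
	data, err := fscommon.ReadFile(path, "cpuacct.usage_percpu")
	if err != nil {
		return err
	}
	_ = data
	// Writing "0" to cgroup.procs moves the calling process (illustrative).
	return fscommon.WriteFile(path, "cgroup.procs", "0")
}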

@@ -5,7 +5,6 @@ package fs
import (
"bufio"
"fmt"
"io/ioutil"
"os"
"path/filepath"
"strconv"
@@ -83,8 +82,7 @@ func (s *CpuacctGroup) GetStats(path string, stats *cgroups.Stats) error {
// Returns user and kernel usage breakdown in nanoseconds.
func getCpuUsageBreakdown(path string) (uint64, uint64, error) {
userModeUsage := uint64(0)
kernelModeUsage := uint64(0)
var userModeUsage, kernelModeUsage uint64
const (
userField = "user"
systemField = "system"
@@ -93,11 +91,11 @@ func getCpuUsageBreakdown(path string) (uint64, uint64, error) {
// Expected format:
// user <usage in ticks>
// system <usage in ticks>
data, err := ioutil.ReadFile(filepath.Join(path, cgroupCpuacctStat))
data, err := fscommon.ReadFile(path, cgroupCpuacctStat)
if err != nil {
return 0, 0, err
}
fields := strings.Fields(string(data))
fields := strings.Fields(data)
if len(fields) < 4 {
return 0, 0, fmt.Errorf("failure - %s is expected to have at least 4 fields", filepath.Join(path, cgroupCpuacctStat))
}
@@ -119,11 +117,11 @@ func getCpuUsageBreakdown(path string) (uint64, uint64, error) {
func getPercpuUsage(path string) ([]uint64, error) {
percpuUsage := []uint64{}
data, err := ioutil.ReadFile(filepath.Join(path, "cpuacct.usage_percpu"))
data, err := fscommon.ReadFile(path, "cpuacct.usage_percpu")
if err != nil {
return percpuUsage, err
}
for _, value := range strings.Fields(string(data)) {
for _, value := range strings.Fields(data) {
value, err := strconv.ParseUint(value, 10, 64)
if err != nil {
return percpuUsage, fmt.Errorf("Unable to convert param value to uint64: %s", err)
@@ -137,7 +135,7 @@ func getPercpuUsageInModes(path string) ([]uint64, []uint64, error) {
usageKernelMode := []uint64{}
usageUserMode := []uint64{}
file, err := os.Open(filepath.Join(path, cgroupCpuacctUsageAll))
file, err := fscommon.OpenFile(path, cgroupCpuacctUsageAll, os.O_RDONLY)
if os.IsNotExist(err) {
return usageKernelMode, usageUserMode, nil
} else if err != nil {

@@ -3,17 +3,17 @@
package fs
import (
"bytes"
"io/ioutil"
"fmt"
"os"
"path/filepath"
"strconv"
"strings"
"github.com/moby/sys/mountinfo"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"github.com/opencontainers/runc/libcontainer/configs"
libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils"
"github.com/pkg/errors"
"golang.org/x/sys/unix"
)
type CpusetGroup struct {
@@ -41,30 +41,107 @@ func (s *CpusetGroup) Set(path string, cgroup *configs.Cgroup) error {
return nil
}
func (s *CpusetGroup) GetStats(path string, stats *cgroups.Stats) error {
return nil
}
// Get the source mount point of directory passed in as argument.
func getMount(dir string) (string, error) {
mi, err := mountinfo.GetMounts(mountinfo.ParentsFilter(dir))
func getCpusetStat(path string, filename string) ([]uint16, error) {
var extracted []uint16
fileContent, err := fscommon.GetCgroupParamString(path, filename)
if err != nil {
return "", err
return extracted, err
}
if len(mi) < 1 {
return "", errors.Errorf("Can't find mount point of %s", dir)
if len(fileContent) == 0 {
return extracted, fmt.Errorf("%s found to be empty", filepath.Join(path, filename))
}
// find the longest mount point
var idx, maxlen int
for i := range mi {
if len(mi[i].Mountpoint) > maxlen {
maxlen = len(mi[i].Mountpoint)
idx = i
for _, s := range strings.Split(fileContent, ",") {
splitted := strings.SplitN(s, "-", 3)
switch len(splitted) {
case 3:
return extracted, fmt.Errorf("invalid values in %s", filepath.Join(path, filename))
case 2:
min, err := strconv.ParseUint(splitted[0], 10, 16)
if err != nil {
return extracted, err
}
max, err := strconv.ParseUint(splitted[1], 10, 16)
if err != nil {
return extracted, err
}
if min > max {
return extracted, fmt.Errorf("invalid values in %s", filepath.Join(path, filename))
}
for i := min; i <= max; i++ {
extracted = append(extracted, uint16(i))
}
case 1:
value, err := strconv.ParseUint(s, 10, 16)
if err != nil {
return extracted, err
}
extracted = append(extracted, uint16(value))
}
}
return mi[idx].Mountpoint, nil
return extracted, nil
}
func (s *CpusetGroup) GetStats(path string, stats *cgroups.Stats) error {
var err error
stats.CPUSetStats.CPUs, err = getCpusetStat(path, "cpuset.cpus")
if err != nil && !errors.Is(err, os.ErrNotExist) {
return err
}
stats.CPUSetStats.CPUExclusive, err = fscommon.GetCgroupParamUint(path, "cpuset.cpu_exclusive")
if err != nil && !errors.Is(err, os.ErrNotExist) {
return err
}
stats.CPUSetStats.Mems, err = getCpusetStat(path, "cpuset.mems")
if err != nil && !errors.Is(err, os.ErrNotExist) {
return err
}
stats.CPUSetStats.MemHardwall, err = fscommon.GetCgroupParamUint(path, "cpuset.mem_hardwall")
if err != nil && !errors.Is(err, os.ErrNotExist) {
return err
}
stats.CPUSetStats.MemExclusive, err = fscommon.GetCgroupParamUint(path, "cpuset.mem_exclusive")
if err != nil && !errors.Is(err, os.ErrNotExist) {
return err
}
stats.CPUSetStats.MemoryMigrate, err = fscommon.GetCgroupParamUint(path, "cpuset.memory_migrate")
if err != nil && !errors.Is(err, os.ErrNotExist) {
return err
}
stats.CPUSetStats.MemorySpreadPage, err = fscommon.GetCgroupParamUint(path, "cpuset.memory_spread_page")
if err != nil && !errors.Is(err, os.ErrNotExist) {
return err
}
stats.CPUSetStats.MemorySpreadSlab, err = fscommon.GetCgroupParamUint(path, "cpuset.memory_spread_slab")
if err != nil && !errors.Is(err, os.ErrNotExist) {
return err
}
stats.CPUSetStats.MemoryPressure, err = fscommon.GetCgroupParamUint(path, "cpuset.memory_pressure")
if err != nil && !errors.Is(err, os.ErrNotExist) {
return err
}
stats.CPUSetStats.SchedLoadBalance, err = fscommon.GetCgroupParamUint(path, "cpuset.sched_load_balance")
if err != nil && !errors.Is(err, os.ErrNotExist) {
return err
}
stats.CPUSetStats.SchedRelaxDomainLevel, err = fscommon.GetCgroupParamInt(path, "cpuset.sched_relax_domain_level")
if err != nil && !errors.Is(err, os.ErrNotExist) {
return err
}
return nil
}
func (s *CpusetGroup) ApplyDir(dir string, cgroup *configs.Cgroup, pid int) error {
@@ -73,18 +150,13 @@ func (s *CpusetGroup) ApplyDir(dir string, cgroup *configs.Cgroup, pid int) erro
if dir == "" {
return nil
}
root, err := getMount(dir)
if err != nil {
return err
}
root = filepath.Dir(root)
// 'ensureParent' start with parent because we don't want to
// explicitly inherit from parent, it could conflict with
// 'cpuset.cpu_exclusive'.
if err := s.ensureParent(filepath.Dir(dir), root); err != nil {
if err := cpusetEnsureParent(filepath.Dir(dir)); err != nil {
return err
}
if err := os.MkdirAll(dir, 0755); err != nil {
if err := os.Mkdir(dir, 0755); err != nil && !os.IsExist(err) {
return err
}
// We didn't inherit cpuset configs from parent, but we have
@@ -103,59 +175,61 @@ func (s *CpusetGroup) ApplyDir(dir string, cgroup *configs.Cgroup, pid int) erro
return cgroups.WriteCgroupProc(dir, pid)
}
func (s *CpusetGroup) getSubsystemSettings(parent string) (cpus []byte, mems []byte, err error) {
if cpus, err = ioutil.ReadFile(filepath.Join(parent, "cpuset.cpus")); err != nil {
func getCpusetSubsystemSettings(parent string) (cpus, mems string, err error) {
if cpus, err = fscommon.ReadFile(parent, "cpuset.cpus"); err != nil {
return
}
if mems, err = ioutil.ReadFile(filepath.Join(parent, "cpuset.mems")); err != nil {
if mems, err = fscommon.ReadFile(parent, "cpuset.mems"); err != nil {
return
}
return cpus, mems, nil
}
// ensureParent makes sure that the parent directory of current is created
// and populated with the proper cpus and mems files copied from
// it's parent.
func (s *CpusetGroup) ensureParent(current, root string) error {
// cpusetEnsureParent makes sure that the parent directories of current
// are created and populated with the proper cpus and mems files copied
// from their respective parent. It does that recursively, starting from
// the top of the cpuset hierarchy (i.e. cpuset cgroup mount point).
func cpusetEnsureParent(current string) error {
var st unix.Statfs_t
parent := filepath.Dir(current)
if libcontainerUtils.CleanPath(parent) == root {
err := unix.Statfs(parent, &st)
if err == nil && st.Type != unix.CGROUP_SUPER_MAGIC {
return nil
}
// Avoid infinite recursion.
if parent == current {
return errors.New("cpuset: cgroup parent path outside cgroup root")
// Treat non-existing directory as cgroupfs as it will be created,
// and the root cpuset directory obviously exists.
if err != nil && err != unix.ENOENT {
return &os.PathError{Op: "statfs", Path: parent, Err: err}
}
if err := s.ensureParent(parent, root); err != nil {
if err := cpusetEnsureParent(parent); err != nil {
return err
}
if err := os.MkdirAll(current, 0755); err != nil {
if err := os.Mkdir(current, 0755); err != nil && !os.IsExist(err) {
return err
}
return s.copyIfNeeded(current, parent)
return cpusetCopyIfNeeded(current, parent)
}
// copyIfNeeded copies the cpuset.cpus and cpuset.mems from the parent
// cpusetCopyIfNeeded copies the cpuset.cpus and cpuset.mems from the parent
// directory to the current directory if the file's contents are 0
func (s *CpusetGroup) copyIfNeeded(current, parent string) error {
var (
err error
currentCpus, currentMems []byte
parentCpus, parentMems []byte
)
if currentCpus, currentMems, err = s.getSubsystemSettings(current); err != nil {
func cpusetCopyIfNeeded(current, parent string) error {
currentCpus, currentMems, err := getCpusetSubsystemSettings(current)
if err != nil {
return err
}
if parentCpus, parentMems, err = s.getSubsystemSettings(parent); err != nil {
parentCpus, parentMems, err := getCpusetSubsystemSettings(parent)
if err != nil {
return err
}
if s.isEmpty(currentCpus) {
if isEmptyCpuset(currentCpus) {
if err := fscommon.WriteFile(current, "cpuset.cpus", string(parentCpus)); err != nil {
return err
}
}
if s.isEmpty(currentMems) {
if isEmptyCpuset(currentMems) {
if err := fscommon.WriteFile(current, "cpuset.mems", string(parentMems)); err != nil {
return err
}
@@ -163,13 +237,13 @@ func (s *CpusetGroup) copyIfNeeded(current, parent string) error {
return nil
}
func (s *CpusetGroup) isEmpty(b []byte) bool {
return len(bytes.Trim(b, "\n")) == 0
func isEmptyCpuset(str string) bool {
return str == "" || str == "\n"
}
func (s *CpusetGroup) ensureCpusAndMems(path string, cgroup *configs.Cgroup) error {
if err := s.Set(path, cgroup); err != nil {
return err
}
return s.copyIfNeeded(path, filepath.Dir(path))
return cpusetCopyIfNeeded(path, filepath.Dir(path))
}

@@ -8,9 +8,10 @@ import (
"reflect"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/devices"
cgroupdevices "github.com/opencontainers/runc/libcontainer/cgroups/devices"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/devices"
"github.com/opencontainers/runc/libcontainer/system"
)
@@ -34,17 +35,17 @@ func (s *DevicesGroup) Apply(path string, d *cgroupData) error {
return join(path, d.pid)
}
func loadEmulator(path string) (*devices.Emulator, error) {
func loadEmulator(path string) (*cgroupdevices.Emulator, error) {
list, err := fscommon.ReadFile(path, "devices.list")
if err != nil {
return nil, err
}
return devices.EmulatorFromList(bytes.NewBufferString(list))
return cgroupdevices.EmulatorFromList(bytes.NewBufferString(list))
}
func buildEmulator(rules []*configs.DeviceRule) (*devices.Emulator, error) {
func buildEmulator(rules []*devices.Rule) (*cgroupdevices.Emulator, error) {
// This defaults to a white-list -- which is what we want!
emu := &devices.Emulator{}
emu := &cgroupdevices.Emulator{}
for _, rule := range rules {
if err := emu.Apply(*rule); err != nil {
return nil, err

@@ -28,33 +28,54 @@ func (s *FreezerGroup) Apply(path string, d *cgroupData) error {
func (s *FreezerGroup) Set(path string, cgroup *configs.Cgroup) error {
switch cgroup.Resources.Freezer {
case configs.Frozen, configs.Thawed:
for {
// In case this loop does not exit because it doesn't get the expected
// state, let's write again this state, hoping it's going to be properly
// set this time. Otherwise, this loop could run infinitely, waiting for
// a state change that would never happen.
if err := fscommon.WriteFile(path, "freezer.state", string(cgroup.Resources.Freezer)); err != nil {
case configs.Frozen:
// As per older kernel docs (freezer-subsystem.txt before
// kernel commit ef9fe980c6fcc1821), if FREEZING is seen,
// userspace should either retry or thaw. While current
// kernel cgroup v1 docs no longer mention a need to retry,
// the kernel (tested on v5.4, Ubuntu 20.04) can't reliably
// freeze a cgroup while new processes keep appearing in it
// (either via fork/clone or by writing new PIDs to
// cgroup.procs).
//
// The number of retries below is chosen to have a decent
// chance to succeed even in the worst case scenario (runc
// pause/unpause with parallel runc exec).
//
// Adding any amount of sleep in between retries did not
// increase the chances of successful freeze.
for i := 0; i < 1000; i++ {
if err := fscommon.WriteFile(path, "freezer.state", string(configs.Frozen)); err != nil {
return err
}
state, err := s.GetState(path)
state, err := fscommon.ReadFile(path, "freezer.state")
if err != nil {
return err
}
if state == cgroup.Resources.Freezer {
break
state = strings.TrimSpace(state)
switch state {
case "FREEZING":
continue
case string(configs.Frozen):
return nil
default:
// should never happen
return fmt.Errorf("unexpected state %s while freezing", strings.TrimSpace(state))
}
time.Sleep(1 * time.Millisecond)
}
// Despite our best efforts, it got stuck in FREEZING.
// Leaving it in this state is bad and dangerous, so
// let's (try to) thaw it back and error out.
_ = fscommon.WriteFile(path, "freezer.state", string(configs.Thawed))
return errors.New("unable to freeze")
case configs.Thawed:
return fscommon.WriteFile(path, "freezer.state", string(configs.Thawed))
case configs.Undefined:
return nil
default:
return fmt.Errorf("Invalid argument '%s' to freezer.state", string(cgroup.Resources.Freezer))
}
return nil
}
func (s *FreezerGroup) GetStats(path string, stats *cgroups.Stats) error {

@@ -3,11 +3,9 @@
package fs
import (
"bufio"
"fmt"
"os"
"path/filepath"
"strings"
"sync"
"github.com/opencontainers/runc/libcontainer/cgroups"
@@ -133,46 +131,19 @@ func getCgroupRoot() (string, error) {
return cgroupRoot, nil
}
// slow path: parse mountinfo, find the first mount where fs=cgroup
// (e.g. "/sys/fs/cgroup/memory"), use its parent.
f, err := os.Open("/proc/self/mountinfo")
// slow path: parse mountinfo
mi, err := cgroups.GetCgroupMounts(false)
if err != nil {
return "", err
}
defer f.Close()
var root string
scanner := bufio.NewScanner(f)
for scanner.Scan() {
text := scanner.Text()
fields := strings.Split(text, " ")
// Safe as mountinfo encodes mountpoints with spaces as \040.
index := strings.Index(text, " - ")
postSeparatorFields := strings.Fields(text[index+3:])
numPostFields := len(postSeparatorFields)
// This is an error as we can't detect if the mount is for "cgroup"
if numPostFields == 0 {
return "", fmt.Errorf("mountinfo: found no fields post '-' in %q", text)
}
if postSeparatorFields[0] == "cgroup" {
// Check that the mount is properly formatted.
if numPostFields < 3 {
return "", fmt.Errorf("Error found less than 3 fields post '-' in %q", text)
}
root = filepath.Dir(fields[4])
break
}
}
if err := scanner.Err(); err != nil {
return "", err
}
if root == "" {
if len(mi) < 1 {
return "", errors.New("no cgroup mount found in mountinfo")
}
// Get the first cgroup mount (e.g. "/sys/fs/cgroup/memory"),
// use its parent directory.
root := filepath.Dir(mi[0].Mountpoint)
if _, err := os.Stat(root); err != nil {
return "", err
}
@@ -218,28 +189,31 @@ func (m *manager) Apply(pid int) (err error) {
m.mu.Lock()
defer m.mu.Unlock()
var c = m.cgroups
d, err := getCgroupData(m.cgroups, pid)
if err != nil {
return err
c := m.cgroups
if c.Resources.Unified != nil {
return cgroups.ErrV1NoUnified
}
m.paths = make(map[string]string)
if c.Paths != nil {
cgMap, err := cgroups.ParseCgroupFile("/proc/self/cgroup")
if err != nil {
return err
}
for name, path := range c.Paths {
_, err := d.path(name)
if err != nil {
if cgroups.IsNotFound(err) {
continue
}
return err
// XXX(kolyshkin@): why this check is needed?
if _, ok := cgMap[name]; ok {
m.paths[name] = path
}
m.paths[name] = path
}
return cgroups.EnterPid(m.paths, pid)
}
d, err := getCgroupData(m.cgroups, pid)
if err != nil {
return err
}
for _, sys := range subsystems {
p, err := d.path(sys.Name())
if err != nil {
@@ -274,11 +248,7 @@ func (m *manager) Destroy() error {
}
m.mu.Lock()
defer m.mu.Unlock()
if err := cgroups.RemovePaths(m.paths); err != nil {
return err
}
m.paths = make(map[string]string)
return nil
return cgroups.RemovePaths(m.paths)
}
func (m *manager) Path(subsys string) string {
@@ -313,6 +283,9 @@ func (m *manager) Set(container *configs.Config) error {
if m.cgroups != nil && m.cgroups.Paths != nil {
return nil
}
if container.Cgroups.Resources.Unified != nil {
return cgroups.ErrV1NoUnified
}
m.mu.Lock()
defer m.mu.Unlock()
@@ -425,16 +398,6 @@ func join(path string, pid int) error {
return cgroups.WriteCgroupProc(path, pid)
}
func removePath(p string, err error) error {
if err != nil {
return err
}
if p != "" {
return os.RemoveAll(p)
}
return nil
}
func (m *manager) GetPaths() map[string]string {
m.mu.Lock()
defer m.mu.Unlock()

@@ -5,7 +5,6 @@ package fs
import (
"fmt"
"strconv"
"strings"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
@@ -25,7 +24,7 @@ func (s *HugetlbGroup) Apply(path string, d *cgroupData) error {
func (s *HugetlbGroup) Set(path string, cgroup *configs.Cgroup) error {
for _, hugetlb := range cgroup.Resources.HugetlbLimit {
if err := fscommon.WriteFile(path, strings.Join([]string{"hugetlb", hugetlb.Pagesize, "limit_in_bytes"}, "."), strconv.FormatUint(hugetlb.Limit, 10)); err != nil {
if err := fscommon.WriteFile(path, "hugetlb."+hugetlb.Pagesize+".limit_in_bytes", strconv.FormatUint(hugetlb.Limit, 10)); err != nil {
return err
}
}
@@ -39,21 +38,21 @@ func (s *HugetlbGroup) GetStats(path string, stats *cgroups.Stats) error {
return nil
}
for _, pageSize := range HugePageSizes {
usage := strings.Join([]string{"hugetlb", pageSize, "usage_in_bytes"}, ".")
usage := "hugetlb." + pageSize + ".usage_in_bytes"
value, err := fscommon.GetCgroupParamUint(path, usage)
if err != nil {
return fmt.Errorf("failed to parse %s - %v", usage, err)
}
hugetlbStats.Usage = value
maxUsage := strings.Join([]string{"hugetlb", pageSize, "max_usage_in_bytes"}, ".")
maxUsage := "hugetlb." + pageSize + ".max_usage_in_bytes"
value, err = fscommon.GetCgroupParamUint(path, maxUsage)
if err != nil {
return fmt.Errorf("failed to parse %s - %v", maxUsage, err)
}
hugetlbStats.MaxUsage = value
failcnt := strings.Join([]string{"hugetlb", pageSize, "failcnt"}, ".")
failcnt := "hugetlb." + pageSize + ".failcnt"
value, err = fscommon.GetCgroupParamUint(path, failcnt)
if err != nil {
return fmt.Errorf("failed to parse %s - %v", failcnt, err)

@@ -5,11 +5,11 @@ package fs
import (
"errors"
"fmt"
"io/ioutil"
"path/filepath"
"strconv"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"golang.org/x/sys/unix"
)
@@ -42,7 +42,7 @@ func setKernelMemory(path string, kernelMemoryLimit int64) error {
// doesn't support it we *must* error out.
return errors.New("kernel memory accounting not supported by this kernel")
}
if err := ioutil.WriteFile(filepath.Join(path, cgroupKernelMemoryLimit), []byte(strconv.FormatInt(kernelMemoryLimit, 10)), 0700); err != nil {
if err := fscommon.WriteFile(path, cgroupKernelMemoryLimit, strconv.FormatInt(kernelMemoryLimit, 10)); err != nil {
// Check if the error number returned by the syscall is "EBUSY"
// The EBUSY signal is returned on attempts to write to the
// memory.kmem.limit_in_bytes file if the cgroup has children or
@@ -50,7 +50,7 @@ func setKernelMemory(path string, kernelMemoryLimit int64) error {
if errors.Is(err, unix.EBUSY) {
return fmt.Errorf("failed to set %s, because either tasks have already joined this cgroup or it has children", cgroupKernelMemoryLimit)
}
return fmt.Errorf("failed to write %v to %v: %v", kernelMemoryLimit, cgroupKernelMemoryLimit, err)
return err
}
return nil
}

@@ -7,7 +7,6 @@ import (
"fmt"
"math"
"os"
"path"
"path/filepath"
"strconv"
"strings"
@@ -18,16 +17,8 @@ import (
)
const (
numaNodeSymbol = "N"
numaStatColumnSeparator = " "
numaStatKeyValueSeparator = "="
numaStatMaxColumns = math.MaxUint8 + 1
numaStatValueIndex = 1
numaStatTypeIndex = 0
numaStatColumnSliceLength = 2
cgroupMemorySwapLimit = "memory.memsw.limit_in_bytes"
cgroupMemoryLimit = "memory.limit_in_bytes"
cgroupMemoryPagesByNuma = "memory.numa_stat"
cgroupMemorySwapLimit = "memory.memsw.limit_in_bytes"
cgroupMemoryLimit = "memory.limit_in_bytes"
)
type MemoryGroup struct {
@@ -160,7 +151,7 @@ func (s *MemoryGroup) Set(path string, cgroup *configs.Cgroup) error {
func (s *MemoryGroup) GetStats(path string, stats *cgroups.Stats) error {
// Set stats from memory.stat.
statsFile, err := os.Open(filepath.Join(path, "memory.stat"))
statsFile, err := fscommon.OpenFile(path, "memory.stat", os.O_RDONLY)
if err != nil {
if os.IsNotExist(err) {
return nil
@@ -200,8 +191,7 @@ func (s *MemoryGroup) GetStats(path string, stats *cgroups.Stats) error {
}
stats.MemoryStats.KernelTCPUsage = kernelTCPUsage
useHierarchy := strings.Join([]string{"memory", "use_hierarchy"}, ".")
value, err := fscommon.GetCgroupParamUint(path, useHierarchy)
value, err := fscommon.GetCgroupParamUint(path, "memory.use_hierarchy")
if err != nil {
return err
}
@@ -233,12 +223,14 @@ func getMemoryData(path, name string) (cgroups.MemoryData, error) {
moduleName := "memory"
if name != "" {
moduleName = strings.Join([]string{"memory", name}, ".")
moduleName = "memory." + name
}
usage := strings.Join([]string{moduleName, "usage_in_bytes"}, ".")
maxUsage := strings.Join([]string{moduleName, "max_usage_in_bytes"}, ".")
failcnt := strings.Join([]string{moduleName, "failcnt"}, ".")
limit := strings.Join([]string{moduleName, "limit_in_bytes"}, ".")
var (
usage = moduleName + ".usage_in_bytes"
maxUsage = moduleName + ".max_usage_in_bytes"
failcnt = moduleName + ".failcnt"
limit = moduleName + ".limit_in_bytes"
)
value, err := fscommon.GetCgroupParamUint(path, usage)
if err != nil {
@@ -277,47 +269,81 @@ func getMemoryData(path, name string) (cgroups.MemoryData, error) {
}
func getPageUsageByNUMA(cgroupPath string) (cgroups.PageUsageByNUMA, error) {
const (
maxColumns = math.MaxUint8 + 1
filename = "memory.numa_stat"
)
stats := cgroups.PageUsageByNUMA{}
file, err := os.Open(path.Join(cgroupPath, cgroupMemoryPagesByNuma))
file, err := fscommon.OpenFile(cgroupPath, filename, os.O_RDONLY)
if os.IsNotExist(err) {
return stats, nil
} else if err != nil {
return stats, err
}
// File format is documented in linux/Documentation/cgroup-v1/memory.txt
// and it looks like this:
//
// total=<total pages> N0=<node 0 pages> N1=<node 1 pages> ...
// file=<total file pages> N0=<node 0 pages> N1=<node 1 pages> ...
// anon=<total anon pages> N0=<node 0 pages> N1=<node 1 pages> ...
// unevictable=<total anon pages> N0=<node 0 pages> N1=<node 1 pages> ...
// hierarchical_<counter>=<counter pages> N0=<node 0 pages> N1=<node 1 pages> ...
scanner := bufio.NewScanner(file)
for scanner.Scan() {
var statsType string
statsByType := cgroups.PageStats{Nodes: map[uint8]uint64{}}
columns := strings.SplitN(scanner.Text(), numaStatColumnSeparator, numaStatMaxColumns)
var field *cgroups.PageStats
for _, column := range columns {
pagesByNode := strings.SplitN(column, numaStatKeyValueSeparator, numaStatColumnSliceLength)
line := scanner.Text()
columns := strings.SplitN(line, " ", maxColumns)
for i, column := range columns {
byNode := strings.SplitN(column, "=", 2)
// Some custom kernels have non-standard fields, like
// numa_locality 0 0 0 0 0 0 0 0 0 0
// numa_exectime 0
if len(byNode) < 2 {
if i == 0 {
// Ignore/skip those.
break
} else {
// The first column was already validated,
// so be strict to the rest.
return stats, fmt.Errorf("malformed line %q in %s",
line, filename)
}
}
key, val := byNode[0], byNode[1]
if i == 0 { // First column: key is name, val is total.
field = getNUMAField(&stats, key)
if field == nil { // unknown field (new kernel?)
break
}
field.Total, err = strconv.ParseUint(val, 0, 64)
if err != nil {
return stats, err
}
field.Nodes = map[uint8]uint64{}
} else { // Subsequent columns: key is N<id>, val is usage.
if len(key) < 2 || key[0] != 'N' {
// This is definitely an error.
return stats, fmt.Errorf("malformed line %q in %s",
line, filename)
}
if strings.HasPrefix(pagesByNode[numaStatTypeIndex], numaNodeSymbol) {
nodeID, err := strconv.ParseUint(pagesByNode[numaStatTypeIndex][1:], 10, 8)
n, err := strconv.ParseUint(key[1:], 10, 8)
if err != nil {
return cgroups.PageUsageByNUMA{}, err
}
statsByType.Nodes[uint8(nodeID)], err = strconv.ParseUint(pagesByNode[numaStatValueIndex], 0, 64)
if err != nil {
return cgroups.PageUsageByNUMA{}, err
}
} else {
statsByType.Total, err = strconv.ParseUint(pagesByNode[numaStatValueIndex], 0, 64)
usage, err := strconv.ParseUint(val, 10, 64)
if err != nil {
return cgroups.PageUsageByNUMA{}, err
}
statsType = pagesByNode[numaStatTypeIndex]
field.Nodes[uint8(n)] = usage
}
err := addNUMAStatsByType(&stats, statsByType, statsType)
if err != nil {
return cgroups.PageUsageByNUMA{}, err
}
}
}
err = scanner.Err()
@@ -328,26 +354,24 @@ func getPageUsageByNUMA(cgroupPath string) (cgroups.PageUsageByNUMA, error) {
return stats, nil
}
func addNUMAStatsByType(stats *cgroups.PageUsageByNUMA, byTypeStats cgroups.PageStats, statsType string) error {
switch statsType {
func getNUMAField(stats *cgroups.PageUsageByNUMA, name string) *cgroups.PageStats {
switch name {
case "total":
stats.Total = byTypeStats
return &stats.Total
case "file":
stats.File = byTypeStats
return &stats.File
case "anon":
stats.Anon = byTypeStats
return &stats.Anon
case "unevictable":
stats.Unevictable = byTypeStats
return &stats.Unevictable
case "hierarchical_total":
stats.Hierarchical.Total = byTypeStats
return &stats.Hierarchical.Total
case "hierarchical_file":
stats.Hierarchical.File = byTypeStats
return &stats.Hierarchical.File
case "hierarchical_anon":
stats.Hierarchical.Anon = byTypeStats
return &stats.Hierarchical.Anon
case "hierarchical_unevictable":
stats.Hierarchical.Unevictable = byTypeStats
default:
return fmt.Errorf("unsupported NUMA page type found: %s", statsType)
return &stats.Hierarchical.Unevictable
}
return nil
}

@@ -19,7 +19,7 @@ func (s *NameGroup) Name() string {
func (s *NameGroup) Apply(path string, d *cgroupData) error {
if s.Join {
// ignore errors if the named cgroup does not exist
join(path, d.pid)
_ = join(path, d.pid)
}
return nil
}

@@ -5,7 +5,6 @@ package fs2
import (
"bufio"
"os"
"path/filepath"
"strconv"
"github.com/opencontainers/runc/libcontainer/cgroups"
@@ -50,7 +49,7 @@ func setCpu(dirPath string, cgroup *configs.Cgroup) error {
return nil
}
func statCpu(dirPath string, stats *cgroups.Stats) error {
f, err := os.Open(filepath.Join(dirPath, "cpu.stat"))
f, err := fscommon.OpenFile(dirPath, "cpu.stat", os.O_RDONLY)
if err != nil {
return err
}

@@ -1,19 +1,17 @@
package fs2
import (
"bytes"
"fmt"
"io/ioutil"
"os"
"path/filepath"
"strings"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"github.com/opencontainers/runc/libcontainer/configs"
)
func supportedControllers(cgroup *configs.Cgroup) ([]byte, error) {
const file = UnifiedMountpoint + "/cgroup.controllers"
return ioutil.ReadFile(file)
func supportedControllers(cgroup *configs.Cgroup) (string, error) {
return fscommon.ReadFile(UnifiedMountpoint, "/cgroup.controllers")
}
// needAnyControllers returns whether we enable some supported controllers or not,
@@ -31,7 +29,7 @@ func needAnyControllers(cgroup *configs.Cgroup) (bool, error) {
return false, err
}
avail := make(map[string]struct{})
for _, ctr := range strings.Fields(string(content)) {
for _, ctr := range strings.Fields(content) {
avail[ctr] = struct{}{}
}
@@ -81,8 +79,12 @@ func CreateCgroupPath(path string, c *configs.Cgroup) (Err error) {
return err
}
ctrs := bytes.Fields(content)
res := append([]byte("+"), bytes.Join(ctrs, []byte(" +"))...)
const (
cgTypeFile = "cgroup.type"
cgStCtlFile = "cgroup.subtree_control"
)
ctrs := strings.Fields(content)
res := "+" + strings.Join(ctrs, " +")
elements := strings.Split(path, "/")
elements = elements[3:]
@@ -103,9 +105,9 @@ func CreateCgroupPath(path string, c *configs.Cgroup) (Err error) {
}
}()
}
cgTypeFile := filepath.Join(current, "cgroup.type")
cgType, _ := ioutil.ReadFile(cgTypeFile)
switch strings.TrimSpace(string(cgType)) {
cgType, _ := fscommon.ReadFile(current, cgTypeFile)
cgType = strings.TrimSpace(cgType)
switch cgType {
// If the cgroup is in an invalid mode (usually this means there's an internal
// process in the cgroup tree, because we created a cgroup under an
// already-populated-by-other-processes cgroup), then we have to error out if
@@ -120,7 +122,7 @@ func CreateCgroupPath(path string, c *configs.Cgroup) (Err error) {
// since that means we're a properly delegated cgroup subtree) but in
// this case there's not much we can do and it's better than giving an
// error.
_ = ioutil.WriteFile(cgTypeFile, []byte("threaded"), 0644)
_ = fscommon.WriteFile(current, cgTypeFile, "threaded")
}
// If the cgroup is in (threaded) or (domain threaded) mode, we can only use thread-aware controllers
// (and you cannot usually take a cgroup out of threaded mode).
@@ -128,18 +130,17 @@ func CreateCgroupPath(path string, c *configs.Cgroup) (Err error) {
fallthrough
case "threaded":
if containsDomainController(c) {
return fmt.Errorf("cannot enter cgroupv2 %q with domain controllers -- it is in %s mode", current, strings.TrimSpace(string(cgType)))
return fmt.Errorf("cannot enter cgroupv2 %q with domain controllers -- it is in %s mode", current, cgType)
}
}
}
// enable all supported controllers
if i < len(elements)-1 {
file := filepath.Join(current, "cgroup.subtree_control")
if err := ioutil.WriteFile(file, res, 0644); err != nil {
if err := fscommon.WriteFile(current, cgStCtlFile, res); err != nil {
// try write one by one
allCtrs := bytes.Split(res, []byte(" "))
allCtrs := strings.Split(res, " ")
for _, ctr := range allCtrs {
_ = ioutil.WriteFile(file, ctr, 0644)
_ = fscommon.WriteFile(current, cgStCtlFile, ctr)
}
}
// Some controllers might not be enabled when rootless or containerized,

@@ -6,11 +6,12 @@ import (
"github.com/opencontainers/runc/libcontainer/cgroups/ebpf"
"github.com/opencontainers/runc/libcontainer/cgroups/ebpf/devicefilter"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/devices"
"github.com/pkg/errors"
"golang.org/x/sys/unix"
)
func isRWM(perms configs.DevicePermissions) bool {
func isRWM(perms devices.Permissions) bool {
var r, w, m bool
for _, perm := range perms {
switch perm {
@@ -61,7 +62,7 @@ func setDevices(dirPath string, cgroup *configs.Cgroup) error {
//
// The real issue is that BPF_F_ALLOW_MULTI makes it hard to have a
// race-free blacklist because it acts as a whitelist by default, and
// having a deny-everything program cannot be overriden by other
// having a deny-everything program cannot be overridden by other
// programs. You could temporarily insert a deny-everything program
// but that would result in spurious failures during updates.
if _, err := ebpf.LoadAttachCgroupDeviceFilter(insts, license, dirFD); err != nil {

View File

@@ -19,7 +19,7 @@ func setFreezer(dirPath string, state configs.FreezerState) error {
// freeze the container (since without the freezer cgroup, that's a
// no-op).
if state == configs.Undefined || state == configs.Thawed {
err = nil
return nil
}
return errors.Wrap(err, "freezer not supported")
}

View File

@@ -3,15 +3,14 @@
package fs2
import (
"io/ioutil"
"fmt"
"os"
"path/filepath"
"strings"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/pkg/errors"
"golang.org/x/sys/unix"
)
type manager struct {
@@ -52,15 +51,14 @@ func (m *manager) getControllers() error {
return nil
}
file := filepath.Join(m.dirPath, "cgroup.controllers")
data, err := ioutil.ReadFile(file)
data, err := fscommon.ReadFile(m.dirPath, "cgroup.controllers")
if err != nil {
if m.rootless && m.config.Path == "" {
return nil
}
return err
}
fields := strings.Fields(string(data))
fields := strings.Fields(data)
m.controllers = make(map[string]struct{}, len(fields))
for _, c := range fields {
m.controllers[c] = struct{}{}
@@ -157,45 +155,8 @@ func (m *manager) Freeze(state configs.FreezerState) error {
return nil
}
func rmdir(path string) error {
err := unix.Rmdir(path)
if err == nil || err == unix.ENOENT {
return nil
}
return &os.PathError{Op: "rmdir", Path: path, Err: err}
}
// removeCgroupPath aims to remove cgroup path recursively
// Because there may be subcgroups in it.
func removeCgroupPath(path string) error {
// try the fast path first
if err := rmdir(path); err == nil {
return nil
}
infos, err := ioutil.ReadDir(path)
if err != nil {
if os.IsNotExist(err) {
err = nil
}
return err
}
for _, info := range infos {
if info.IsDir() {
// We should remove subcgroups dir first
if err = removeCgroupPath(filepath.Join(path, info.Name())); err != nil {
break
}
}
}
if err == nil {
err = rmdir(path)
}
return err
}
func (m *manager) Destroy() error {
return removeCgroupPath(m.dirPath)
return cgroups.RemovePath(m.dirPath)
}
func (m *manager) Path(_ string) string {
@@ -245,10 +206,40 @@ func (m *manager) Set(container *configs.Config) error {
if err := setFreezer(m.dirPath, container.Cgroups.Freezer); err != nil {
return err
}
if err := m.setUnified(container.Cgroups.Unified); err != nil {
return err
}
m.config = container.Cgroups
return nil
}
func (m *manager) setUnified(res map[string]string) error {
for k, v := range res {
if strings.Contains(k, "/") {
return fmt.Errorf("unified resource %q must be a file name (no slashes)", k)
}
if err := fscommon.WriteFile(m.dirPath, k, v); err != nil {
errC := errors.Cause(err)
// Check for both EPERM and ENOENT since O_CREAT is used by WriteFile.
if errors.Is(errC, os.ErrPermission) || errors.Is(errC, os.ErrNotExist) {
// Check if a controller is available,
// to give more specific error if not.
sk := strings.SplitN(k, ".", 2)
if len(sk) != 2 {
return fmt.Errorf("unified resource %q must be in the form CONTROLLER.PARAMETER", k)
}
c := sk[0]
if _, ok := m.controllers[c]; !ok && c != "cgroup" {
return fmt.Errorf("unified resource %q can't be set: controller %q not available", k, c)
}
}
return errors.Wrapf(err, "can't set unified resource %q", k)
}
}
return nil
}
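A hedged usage sketch of the Unified map this function consumes: keys are cgroup v2 file names in CONTROLLER.PARAMETER form, and values are written verbatim via fscommon.WriteFile. The concrete values below are hypothetical examples, not taken from this change:

package main

import (
	"fmt"
	"strings"
)

func main() {
	// Hypothetical cgroup v2 "unified" resources, as they would appear in
	// Cgroups.Unified; each key names a file under the container's cgroup dir.
	unified := map[string]string{
		"memory.high": "1073741824",
		"memory.max":  "2147483648",
		"pids.max":    "100",
	}
	for k, v := range unified {
		// setUnified rejects slashes and requires CONTROLLER.PARAMETER keys.
		if strings.Contains(k, "/") || len(strings.SplitN(k, ".", 2)) != 2 {
			panic("invalid key: " + k)
		}
		fmt.Printf("would write %q to %s\n", v, k)
	}
}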
func (m *manager) GetPaths() map[string]string {
paths := make(map[string]string, 1)
paths[""] = m.dirPath

View File

@@ -3,10 +3,7 @@
package fs2
import (
"io/ioutil"
"path/filepath"
"strconv"
"strings"
"github.com/pkg/errors"
@@ -24,7 +21,7 @@ func setHugeTlb(dirPath string, cgroup *configs.Cgroup) error {
return nil
}
for _, hugetlb := range cgroup.Resources.HugetlbLimit {
if err := fscommon.WriteFile(dirPath, strings.Join([]string{"hugetlb", hugetlb.Pagesize, "max"}, "."), strconv.FormatUint(hugetlb.Limit, 10)); err != nil {
if err := fscommon.WriteFile(dirPath, "hugetlb."+hugetlb.Pagesize+".max", strconv.FormatUint(hugetlb.Limit, 10)); err != nil {
return err
}
}
@@ -40,22 +37,20 @@ func statHugeTlb(dirPath string, stats *cgroups.Stats) error {
hugetlbStats := cgroups.HugetlbStats{}
for _, pagesize := range hugePageSizes {
usage := strings.Join([]string{"hugetlb", pagesize, "current"}, ".")
value, err := fscommon.GetCgroupParamUint(dirPath, usage)
value, err := fscommon.GetCgroupParamUint(dirPath, "hugetlb."+pagesize+".current")
if err != nil {
return errors.Wrapf(err, "failed to parse hugetlb.%s.current file", pagesize)
return err
}
hugetlbStats.Usage = value
fileName := strings.Join([]string{"hugetlb", pagesize, "events"}, ".")
filePath := filepath.Join(dirPath, fileName)
contents, err := ioutil.ReadFile(filePath)
fileName := "hugetlb." + pagesize + ".events"
contents, err := fscommon.ReadFile(dirPath, fileName)
if err != nil {
return errors.Wrapf(err, "failed to parse hugetlb.%s.events file", pagesize)
return errors.Wrap(err, "failed to read stats")
}
_, value, err = fscommon.GetCgroupParamKeyValue(string(contents))
_, value, err = fscommon.GetCgroupParamKeyValue(contents)
if err != nil {
return errors.Wrapf(err, "failed to parse hugetlb.%s.events file", pagesize)
return errors.Wrap(err, "failed to parse "+fileName)
}
hugetlbStats.Failcnt = value

View File

@@ -5,7 +5,6 @@ package fs2
import (
"bufio"
"os"
"path/filepath"
"strconv"
"strings"
@@ -60,8 +59,7 @@ func setIo(dirPath string, cgroup *configs.Cgroup) error {
func readCgroup2MapFile(dirPath string, name string) (map[string][]string, error) {
ret := map[string][]string{}
p := filepath.Join(dirPath, name)
f, err := os.Open(p)
f, err := fscommon.OpenFile(dirPath, name, os.O_RDONLY)
if err != nil {
return nil, err
}
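For context, readCgroup2MapFile is used for files like io.stat, which has one line per device in the form "MAJ:MIN key=value ...". A stdlib-only sketch of parsing one such line into a plausible map shape (the sample line is illustrative, not from this change):

package main

import (
	"fmt"
	"strings"
)

func main() {
	// Example cgroup v2 io.stat line: device id followed by key=value fields.
	line := "8:16 rbytes=1459200 wbytes=314773504 rios=192 wios=353"
	ret := map[string][]string{}
	parts := strings.Fields(line)
	if len(parts) > 1 {
		ret[parts[0]] = parts[1:]
	}
	fmt.Println(ret["8:16"]) // [rbytes=1459200 wbytes=314773504 rios=192 wios=353]
}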

View File

@@ -5,9 +5,7 @@ package fs2
import (
"bufio"
"os"
"path/filepath"
"strconv"
"strings"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
@@ -76,7 +74,7 @@ func setMemory(dirPath string, cgroup *configs.Cgroup) error {
func statMemory(dirPath string, stats *cgroups.Stats) error {
// Set stats from memory.stat.
statsFile, err := os.Open(filepath.Join(dirPath, "memory.stat"))
statsFile, err := fscommon.OpenFile(dirPath, "memory.stat", os.O_RDONLY)
if err != nil {
return err
}
@@ -112,10 +110,10 @@ func getMemoryDataV2(path, name string) (cgroups.MemoryData, error) {
moduleName := "memory"
if name != "" {
moduleName = strings.Join([]string{"memory", name}, ".")
moduleName = "memory." + name
}
usage := strings.Join([]string{moduleName, "current"}, ".")
limit := strings.Join([]string{moduleName, "max"}, ".")
usage := moduleName + ".current"
limit := moduleName + ".max"
value, err := fscommon.GetCgroupParamUint(path, usage)
if err != nil {

View File

@@ -3,7 +3,6 @@
package fs2
import (
"io/ioutil"
"path/filepath"
"strings"
@@ -34,15 +33,15 @@ func setPids(dirPath string, cgroup *configs.Cgroup) error {
func statPidsWithoutController(dirPath string, stats *cgroups.Stats) error {
// if the controller is not enabled, let's read PIDS from cgroups.procs
// (or threads if cgroup.threads is enabled)
contents, err := ioutil.ReadFile(filepath.Join(dirPath, "cgroup.procs"))
contents, err := fscommon.ReadFile(dirPath, "cgroup.procs")
if errors.Is(err, unix.ENOTSUP) {
contents, err = ioutil.ReadFile(filepath.Join(dirPath, "cgroup.threads"))
contents, err = fscommon.ReadFile(dirPath, "cgroup.threads")
}
if err != nil {
return err
}
pids := make(map[string]string)
for _, i := range strings.Split(string(contents), "\n") {
for _, i := range strings.Split(contents, "\n") {
if i != "" {
pids[i] = i
}
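The fallback above boils down to counting the entries in cgroup.procs (or cgroup.threads for threaded cgroups). A stdlib sketch of that count, with example file content:

package main

import (
	"fmt"
	"strings"
)

func main() {
	// Example cgroup.procs content: one PID per line, with a trailing newline.
	contents := "1203\n1245\n1246\n"
	count := 0
	for _, line := range strings.Split(contents, "\n") {
		if line != "" {
			count++
		}
	}
	fmt.Println(count) // 3
}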

View File

@@ -3,46 +3,47 @@
package fscommon
import (
"io/ioutil"
"bytes"
"os"
securejoin "github.com/cyphar/filepath-securejoin"
"github.com/pkg/errors"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
)
// WriteFile writes data to a cgroup file in dir.
// It is supposed to be used for cgroup files only.
func WriteFile(dir, file, data string) error {
if dir == "" {
return errors.Errorf("no directory specified for %s", file)
}
path, err := securejoin.SecureJoin(dir, file)
fd, err := OpenFile(dir, file, unix.O_WRONLY)
if err != nil {
return err
}
if err := retryingWriteFile(path, []byte(data), 0700); err != nil {
return errors.Wrapf(err, "failed to write %q to %q", data, path)
defer fd.Close()
if err := retryingWriteFile(fd, data); err != nil {
return errors.Wrapf(err, "failed to write %q", data)
}
return nil
}
// ReadFile reads data from a cgroup file in dir.
// It is supposed to be used for cgroup files only.
func ReadFile(dir, file string) (string, error) {
if dir == "" {
return "", errors.Errorf("no directory specified for %s", file)
}
path, err := securejoin.SecureJoin(dir, file)
fd, err := OpenFile(dir, file, unix.O_RDONLY)
if err != nil {
return "", err
}
data, err := ioutil.ReadFile(path)
return string(data), err
defer fd.Close()
var buf bytes.Buffer
_, err = buf.ReadFrom(fd)
return buf.String(), err
}
func retryingWriteFile(filename string, data []byte, perm os.FileMode) error {
func retryingWriteFile(fd *os.File, data string) error {
for {
err := ioutil.WriteFile(filename, data, perm)
_, err := fd.Write([]byte(data))
if errors.Is(err, unix.EINTR) {
logrus.Infof("interrupted while writing %s to %s", string(data), filename)
logrus.Infof("interrupted while writing %s to %s", data, fd.Name())
continue
}
return err
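The retry-on-EINTR pattern above is generic; a self-contained sketch of the same idea using only the standard library and golang.org/x/sys/unix (the temporary file is just a stand-in for a cgroup file):

package main

import (
	"errors"
	"os"

	"golang.org/x/sys/unix"
)

// writeRetryingEINTR re-issues the write if it is interrupted by a signal
// (EINTR), mirroring the retry loop used for cgroup files above.
func writeRetryingEINTR(f *os.File, data string) error {
	for {
		_, err := f.Write([]byte(data))
		if errors.Is(err, unix.EINTR) {
			continue
		}
		return err
	}
}

func main() {
	f, err := os.CreateTemp("", "demo")
	if err != nil {
		panic(err)
	}
	defer os.Remove(f.Name())
	defer f.Close()
	if err := writeRetryingEINTR(f, "1\n"); err != nil {
		panic(err)
	}
}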

View File

@@ -0,0 +1,103 @@
package fscommon
import (
"os"
"strings"
"sync"
securejoin "github.com/cyphar/filepath-securejoin"
"github.com/pkg/errors"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
)
const (
cgroupfsDir = "/sys/fs/cgroup"
cgroupfsPrefix = cgroupfsDir + "/"
)
var (
// Set to true by fs unit tests
TestMode bool
cgroupFd int = -1
prepOnce sync.Once
prepErr error
resolveFlags uint64
)
func prepareOpenat2() error {
prepOnce.Do(func() {
fd, err := unix.Openat2(-1, cgroupfsDir, &unix.OpenHow{
Flags: unix.O_DIRECTORY | unix.O_PATH})
if err != nil {
prepErr = &os.PathError{Op: "openat2", Path: cgroupfsDir, Err: err}
if err != unix.ENOSYS {
logrus.Warnf("falling back to securejoin: %s", prepErr)
} else {
logrus.Debug("openat2 not available, falling back to securejoin")
}
return
}
var st unix.Statfs_t
if err = unix.Fstatfs(fd, &st); err != nil {
prepErr = &os.PathError{Op: "statfs", Path: cgroupfsDir, Err: err}
logrus.Warnf("falling back to securejoin: %s", prepErr)
return
}
cgroupFd = fd
resolveFlags = unix.RESOLVE_BENEATH | unix.RESOLVE_NO_MAGICLINKS
if st.Type == unix.CGROUP2_SUPER_MAGIC {
// cgroupv2 has a single mountpoint and no "cpu,cpuacct" symlinks
resolveFlags |= unix.RESOLVE_NO_XDEV | unix.RESOLVE_NO_SYMLINKS
}
})
return prepErr
}
// OpenFile opens a cgroup file in a given dir with given flags.
// It is supposed to be used for cgroup files only.
func OpenFile(dir, file string, flags int) (*os.File, error) {
if dir == "" {
return nil, errors.Errorf("no directory specified for %s", file)
}
mode := os.FileMode(0)
if TestMode && flags&os.O_WRONLY != 0 {
// "emulate" cgroup fs for unit tests
flags |= os.O_TRUNC | os.O_CREATE
mode = 0o600
}
reldir := strings.TrimPrefix(dir, cgroupfsPrefix)
if len(reldir) == len(dir) { // non-standard path, old system?
return openWithSecureJoin(dir, file, flags, mode)
}
if prepareOpenat2() != nil {
return openWithSecureJoin(dir, file, flags, mode)
}
relname := reldir + "/" + file
fd, err := unix.Openat2(cgroupFd, relname,
&unix.OpenHow{
Resolve: resolveFlags,
Flags: uint64(flags) | unix.O_CLOEXEC,
Mode: uint64(mode),
})
if err != nil {
return nil, &os.PathError{Op: "openat2", Path: dir + "/" + file, Err: err}
}
return os.NewFile(uintptr(fd), cgroupfsPrefix+relname), nil
}
func openWithSecureJoin(dir, file string, flags int, mode os.FileMode) (*os.File, error) {
path, err := securejoin.SecureJoin(dir, file)
if err != nil {
return nil, err
}
return os.OpenFile(path, flags, mode)
}
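A minimal usage sketch of these helpers, assuming the vendored import path shown in this diff and a cgroup v2 host; it mirrors how GetAllSubsystems (further below) reads the root controllers list:

package main

import (
	"fmt"
	"strings"

	"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
)

func main() {
	// Read the controllers available at the cgroup v2 root.
	data, err := fscommon.ReadFile("/sys/fs/cgroup", "cgroup.controllers")
	if err != nil {
		fmt.Println("read failed (cgroup v1 host?):", err)
		return
	}
	fmt.Println(strings.Fields(data))
}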

View File

@@ -5,9 +5,7 @@ package fscommon
import (
"errors"
"fmt"
"io/ioutil"
"math"
"path/filepath"
"strconv"
"strings"
)
@@ -16,8 +14,9 @@ var (
ErrNotValidFormat = errors.New("line is not a valid key value format")
)
// Saturates negative values at zero and returns a uint64.
// Due to kernel bugs, some of the memory cgroup stats can be negative.
// ParseUint converts a string to an uint64 integer.
// Negative values are returned at zero as, due to kernel bugs,
// some of the memory cgroup stats can be negative.
func ParseUint(s string, base, bitSize int) (uint64, error) {
value, err := strconv.ParseUint(s, base, bitSize)
if err != nil {
@@ -36,15 +35,16 @@ func ParseUint(s string, base, bitSize int) (uint64, error) {
return value, nil
}
// Parses a cgroup param and returns as name, value
// i.e. "io_service_bytes 1234" will return as io_service_bytes, 1234
// GetCgroupParamKeyValue parses a space-separated "name value" kind of cgroup
// parameter and returns its components. For example, "io_service_bytes 1234"
// will return as "io_service_bytes", 1234.
func GetCgroupParamKeyValue(t string) (string, uint64, error) {
parts := strings.Fields(t)
switch len(parts) {
case 2:
value, err := ParseUint(parts[1], 10, 64)
if err != nil {
return "", 0, fmt.Errorf("unable to convert param value (%q) to uint64: %v", parts[1], err)
return "", 0, fmt.Errorf("unable to convert to uint64: %v", err)
}
return parts[0], value, nil
@@ -53,31 +53,50 @@ func GetCgroupParamKeyValue(t string) (string, uint64, error) {
}
}
// Gets a single uint64 value from the specified cgroup file.
func GetCgroupParamUint(cgroupPath, cgroupFile string) (uint64, error) {
fileName := filepath.Join(cgroupPath, cgroupFile)
contents, err := ioutil.ReadFile(fileName)
// GetCgroupParamUint reads a single uint64 value from the specified cgroup file.
// If the value read is "max", the math.MaxUint64 is returned.
func GetCgroupParamUint(path, file string) (uint64, error) {
contents, err := GetCgroupParamString(path, file)
if err != nil {
return 0, err
}
trimmed := strings.TrimSpace(string(contents))
if trimmed == "max" {
contents = strings.TrimSpace(contents)
if contents == "max" {
return math.MaxUint64, nil
}
res, err := ParseUint(trimmed, 10, 64)
res, err := ParseUint(contents, 10, 64)
if err != nil {
return res, fmt.Errorf("unable to parse %q as a uint from Cgroup file %q", string(contents), fileName)
return res, fmt.Errorf("unable to parse file %q", path+"/"+file)
}
return res, nil
}
// Gets a string value from the specified cgroup file
func GetCgroupParamString(cgroupPath, cgroupFile string) (string, error) {
contents, err := ioutil.ReadFile(filepath.Join(cgroupPath, cgroupFile))
// GetCgroupParamInt reads a single int64 value from specified cgroup file.
// If the value read is "max", the math.MaxInt64 is returned.
func GetCgroupParamInt(path, file string) (int64, error) {
contents, err := ReadFile(path, file)
if err != nil {
return 0, err
}
contents = strings.TrimSpace(contents)
if contents == "max" {
return math.MaxInt64, nil
}
res, err := strconv.ParseInt(contents, 10, 64)
if err != nil {
return res, fmt.Errorf("unable to parse %q as a int from Cgroup file %q", contents, path+"/"+file)
}
return res, nil
}
// GetCgroupParamString reads a string from the specified cgroup file.
func GetCgroupParamString(path, file string) (string, error) {
contents, err := ReadFile(path, file)
if err != nil {
return "", err
}
return strings.TrimSpace(string(contents)), nil
return strings.TrimSpace(contents), nil
}
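A small stdlib sketch of the "max" convention these helpers implement: a cgroup file containing the literal string "max" means "no limit", so it is mapped to the maximum value of the integer type; anything else is parsed as a decimal number:

package main

import (
	"fmt"
	"math"
	"strconv"
	"strings"
)

// parseLimit mimics the GetCgroupParamUint behaviour on example content:
// "max" becomes math.MaxUint64, anything else is parsed as a decimal uint64.
func parseLimit(contents string) (uint64, error) {
	contents = strings.TrimSpace(contents)
	if contents == "max" {
		return math.MaxUint64, nil
	}
	return strconv.ParseUint(contents, 10, 64)
}

func main() {
	for _, s := range []string{"max\n", "4096\n"} {
		v, err := parseLimit(s)
		if err != nil {
			panic(err)
		}
		fmt.Println(v)
	}
}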

View File

@@ -39,6 +39,33 @@ type CpuStats struct {
ThrottlingData ThrottlingData `json:"throttling_data,omitempty"`
}
type CPUSetStats struct {
// List of the physical numbers of the CPUs on which processes
// in that cpuset are allowed to execute
CPUs []uint16 `json:"cpus,omitempty"`
// cpu_exclusive flag
CPUExclusive uint64 `json:"cpu_exclusive"`
// List of memory nodes on which processes in that cpuset
// are allowed to allocate memory
Mems []uint16 `json:"mems,omitempty"`
// mem_hardwall flag
MemHardwall uint64 `json:"mem_hardwall"`
// mem_exclusive flag
MemExclusive uint64 `json:"mem_exclusive"`
// memory_migrate flag
MemoryMigrate uint64 `json:"memory_migrate"`
// memory_spread page flag
MemorySpreadPage uint64 `json:"memory_spread_page"`
// memory_spread slab flag
MemorySpreadSlab uint64 `json:"memory_spread_slab"`
// memory_pressure
MemoryPressure uint64 `json:"memory_pressure"`
// sched_load balance flag
SchedLoadBalance uint64 `json:"sched_load_balance"`
// sched_relax_domain_level
SchedRelaxDomainLevel int64 `json:"sched_relax_domain_level"`
}
type MemoryData struct {
Usage uint64 `json:"usage,omitempty"`
MaxUsage uint64 `json:"max_usage,omitempty"`
@@ -121,6 +148,7 @@ type HugetlbStats struct {
type Stats struct {
CpuStats CpuStats `json:"cpu_stats,omitempty"`
CPUSetStats CPUSetStats `json:"cpuset_stats,omitempty"`
MemoryStats MemoryStats `json:"memory_stats,omitempty"`
PidsStats PidsStats `json:"pids_stats,omitempty"`
BlkioStats BlkioStats `json:"blkio_stats,omitempty"`

View File

@@ -13,12 +13,20 @@ import (
systemdDbus "github.com/coreos/go-systemd/v22/dbus"
dbus "github.com/godbus/dbus/v5"
"github.com/opencontainers/runc/libcontainer/cgroups/devices"
cgroupdevices "github.com/opencontainers/runc/libcontainer/cgroups/devices"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/devices"
"github.com/pkg/errors"
"github.com/sirupsen/logrus"
)
const (
// Default kernel value for cpu quota period is 100000 us (100 ms), same for v1 and v2.
// v1: https://www.kernel.org/doc/html/latest/scheduler/sched-bwc.html and
// v2: https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html
defCPUQuotaPeriod = uint64(100000)
)
var (
connOnce sync.Once
connDbus *systemdDbus.Conn
@@ -26,7 +34,6 @@ var (
versionOnce sync.Once
version int
versionErr error
isRunningSystemdOnce sync.Once
isRunningSystemd bool
@@ -81,11 +88,11 @@ func ExpandSlice(slice string) (string, error) {
return path, nil
}
func groupPrefix(ruleType configs.DeviceType) (string, error) {
func groupPrefix(ruleType devices.Type) (string, error) {
switch ruleType {
case configs.BlockDevice:
case devices.BlockDevice:
return "block-", nil
case configs.CharDevice:
case devices.CharDevice:
return "char-", nil
default:
return "", errors.Errorf("device type %v has no group prefix", ruleType)
@@ -93,10 +100,10 @@ func groupPrefix(ruleType configs.DeviceType) (string, error) {
}
// findDeviceGroup tries to find the device group name (as listed in
// /proc/devices) with the type prefixed as requried for DeviceAllow, for a
// /proc/devices) with the type prefixed as required for DeviceAllow, for a
// given (type, major) combination. If more than one device group exists, an
// arbitrary one is chosen.
func findDeviceGroup(ruleType configs.DeviceType, ruleMajor int64) (string, error) {
func findDeviceGroup(ruleType devices.Type, ruleMajor int64) (string, error) {
fh, err := os.Open("/proc/devices")
if err != nil {
return "", err
@@ -109,7 +116,7 @@ func findDeviceGroup(ruleType configs.DeviceType, ruleMajor int64) (string, erro
}
scanner := bufio.NewScanner(fh)
var currentType configs.DeviceType
var currentType devices.Type
for scanner.Scan() {
// We need to strip spaces because the first number is column-aligned.
line := strings.TrimSpace(scanner.Text())
@@ -117,10 +124,10 @@ func findDeviceGroup(ruleType configs.DeviceType, ruleMajor int64) (string, erro
// Handle the "header" lines.
switch line {
case "Block devices:":
currentType = configs.BlockDevice
currentType = devices.BlockDevice
continue
case "Character devices:":
currentType = configs.CharDevice
currentType = devices.CharDevice
continue
case "":
continue
@@ -156,7 +163,7 @@ func findDeviceGroup(ruleType configs.DeviceType, ruleMajor int64) (string, erro
// generateDeviceProperties takes the configured device rules and generates a
// corresponding set of systemd properties to configure the devices correctly.
func generateDeviceProperties(rules []*configs.DeviceRule) ([]systemdDbus.Property, error) {
func generateDeviceProperties(rules []*devices.Rule) ([]systemdDbus.Property, error) {
// DeviceAllow is the type "a(ss)" which means we need a temporary struct
// to represent it in Go.
type deviceAllowEntry struct {
@@ -172,7 +179,7 @@ func generateDeviceProperties(rules []*configs.DeviceRule) ([]systemdDbus.Proper
}
// Figure out the set of rules.
configEmu := &devices.Emulator{}
configEmu := &cgroupdevices.Emulator{}
for _, rule := range rules {
if err := configEmu.Apply(*rule); err != nil {
return nil, errors.Wrap(err, "apply rule for systemd")
@@ -199,7 +206,7 @@ func generateDeviceProperties(rules []*configs.DeviceRule) ([]systemdDbus.Proper
// Now generate the set of rules we actually need to apply. Unlike the
// normal devices cgroup, in "strict" mode systemd defaults to a deny-all
// whitelist which is the default for devices.Emulator.
baseEmu := &devices.Emulator{}
baseEmu := &cgroupdevices.Emulator{}
finalRules, err := baseEmu.Transition(configEmu)
if err != nil {
return nil, errors.Wrap(err, "get simplified rules for systemd")
@@ -211,7 +218,7 @@ func generateDeviceProperties(rules []*configs.DeviceRule) ([]systemdDbus.Proper
return nil, errors.Errorf("[internal error] cannot add deny rule to systemd DeviceAllow list: %v", *rule)
}
switch rule.Type {
case configs.BlockDevice, configs.CharDevice:
case devices.BlockDevice, devices.CharDevice:
default:
// Should never happen.
return nil, errors.Errorf("invalid device type for DeviceAllow: %v", rule.Type)
@@ -243,9 +250,9 @@ func generateDeviceProperties(rules []*configs.DeviceRule) ([]systemdDbus.Proper
// so we'll give a warning in that case (note that the fallback code
// will insert any rules systemd couldn't handle). What amazing fun.
if rule.Major == configs.Wildcard {
if rule.Major == devices.Wildcard {
// "_ *:n _" rules aren't supported by systemd.
if rule.Minor != configs.Wildcard {
if rule.Minor != devices.Wildcard {
logrus.Warnf("systemd doesn't support '*:n' device rules -- temporarily ignoring rule: %v", *rule)
continue
}
@@ -256,7 +263,7 @@ func generateDeviceProperties(rules []*configs.DeviceRule) ([]systemdDbus.Proper
return nil, err
}
entry.Path = prefix + "*"
} else if rule.Minor == configs.Wildcard {
} else if rule.Minor == devices.Wildcard {
// "_ n:* _" rules require a device group from /proc/devices.
group, err := findDeviceGroup(rule.Type, rule.Major)
if err != nil {
@@ -271,9 +278,9 @@ func generateDeviceProperties(rules []*configs.DeviceRule) ([]systemdDbus.Proper
} else {
// "_ n:m _" rules are just a path in /dev/{block,char}/.
switch rule.Type {
case configs.BlockDevice:
case devices.BlockDevice:
entry.Path = fmt.Sprintf("/dev/block/%d:%d", rule.Major, rule.Minor)
case configs.CharDevice:
case devices.CharDevice:
entry.Path = fmt.Sprintf("/dev/char/%d:%d", rule.Major, rule.Minor)
}
}
@@ -307,7 +314,7 @@ func newProp(name string, units interface{}) systemdDbus.Property {
func getUnitName(c *configs.Cgroup) string {
// by default, we create a scope unless the user explicitly asks for a slice.
if !strings.HasSuffix(c.Name, ".slice") {
return fmt.Sprintf("%s-%s.scope", c.ScopePrefix, c.Name)
return c.ScopePrefix + "-" + c.Name + ".scope"
}
return c.Name
}
@@ -325,6 +332,9 @@ func isUnitExists(err error) bool {
func startUnit(dbusConnection *systemdDbus.Conn, unitName string, properties []systemdDbus.Property) error {
statusChan := make(chan string, 1)
if _, err := dbusConnection.StartTransientUnit(unitName, "replace", properties, statusChan); err == nil {
timeout := time.NewTimer(30 * time.Second)
defer timeout.Stop()
select {
case s := <-statusChan:
close(statusChan)
@@ -333,8 +343,9 @@ func startUnit(dbusConnection *systemdDbus.Conn, unitName string, properties []s
dbusConnection.ResetFailedUnit(unitName)
return errors.Errorf("error creating systemd unit `%s`: got `%s`", unitName, s)
}
case <-time.After(time.Second):
logrus.Warnf("Timed out while waiting for StartTransientUnit(%s) completion signal from dbus. Continuing...", unitName)
case <-timeout.C:
dbusConnection.ResetFailedUnit(unitName)
return errors.New("Timeout waiting for systemd to create " + unitName)
}
} else if !isUnitExists(err) {
return err
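The new wait logic is the standard Go pattern of selecting on a result channel against a stoppable timer; a self-contained sketch of that pattern (the 30-second timeout comes from the change above, the rest is illustrative):

package main

import (
	"errors"
	"fmt"
	"time"
)

// waitForJob waits for a completion status on statusChan, giving up after
// the supplied timeout, in the same shape as the startUnit change above.
func waitForJob(statusChan <-chan string, timeout time.Duration) error {
	timer := time.NewTimer(timeout)
	defer timer.Stop()
	select {
	case s := <-statusChan:
		if s != "done" {
			return fmt.Errorf("job finished with status %q", s)
		}
		return nil
	case <-timer.C:
		return errors.New("timeout waiting for job")
	}
}

func main() {
	ch := make(chan string, 1)
	ch <- "done"
	fmt.Println(waitForJob(ch, 30*time.Second)) // <nil>
}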
@@ -360,20 +371,20 @@ func stopUnit(dbusConnection *systemdDbus.Conn, unitName string) error {
return nil
}
func systemdVersion(conn *systemdDbus.Conn) (int, error) {
func systemdVersion(conn *systemdDbus.Conn) int {
versionOnce.Do(func() {
version = -1
verStr, err := conn.GetManagerProperty("Version")
if err != nil {
versionErr = err
return
if err == nil {
version, err = systemdVersionAtoi(verStr)
}
version, versionErr = systemdVersionAtoi(verStr)
return
if err != nil {
logrus.WithError(err).Error("unable to get systemd version")
}
})
return version, versionErr
return version
}
func systemdVersionAtoi(verStr string) (int, error) {
@@ -394,12 +405,13 @@ func systemdVersionAtoi(verStr string) (int, error) {
func addCpuQuota(conn *systemdDbus.Conn, properties *[]systemdDbus.Property, quota int64, period uint64) {
if period != 0 {
// systemd only supports CPUQuotaPeriodUSec since v242
sdVer, err := systemdVersion(conn)
if err != nil {
logrus.Warnf("systemdVersion: %s", err)
} else if sdVer >= 242 {
sdVer := systemdVersion(conn)
if sdVer >= 242 {
*properties = append(*properties,
newProp("CPUQuotaPeriodUSec", period))
} else {
logrus.Debugf("systemd v%d is too old to support CPUQuotaPeriodSec "+
" (setting will still be applied to cgroupfs)", sdVer)
}
}
if quota != 0 || period != 0 {
@@ -407,10 +419,8 @@ func addCpuQuota(conn *systemdDbus.Conn, properties *[]systemdDbus.Property, quo
cpuQuotaPerSecUSec := uint64(math.MaxUint64)
if quota > 0 {
if period == 0 {
// assume the default kernel value of 100000 us (100 ms), same for v1 and v2.
// v1: https://www.kernel.org/doc/html/latest/scheduler/sched-bwc.html and
// v2: https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html
period = 100000
// assume the default
period = defCPUQuotaPeriod
}
// systemd converts CPUQuotaPerSecUSec (microseconds per CPU second) to CPUQuota
// (integer percentage of CPU) internally. This means that if a fractional percent of
@@ -425,3 +435,37 @@ func addCpuQuota(conn *systemdDbus.Conn, properties *[]systemdDbus.Property, quo
newProp("CPUQuotaPerSecUSec", cpuQuotaPerSecUSec))
}
}
func addCpuset(conn *systemdDbus.Conn, props *[]systemdDbus.Property, cpus, mems string) error {
if cpus == "" && mems == "" {
return nil
}
// systemd only supports AllowedCPUs/AllowedMemoryNodes since v244
sdVer := systemdVersion(conn)
if sdVer < 244 {
logrus.Debugf("systemd v%d is too old to support AllowedCPUs/AllowedMemoryNodes"+
" (settings will still be applied to cgroupfs)", sdVer)
return nil
}
if cpus != "" {
bits, err := rangeToBits(cpus)
if err != nil {
return fmt.Errorf("resources.CPU.Cpus=%q conversion error: %w",
cpus, err)
}
*props = append(*props,
newProp("AllowedCPUs", bits))
}
if mems != "" {
bits, err := rangeToBits(mems)
if err != nil {
return fmt.Errorf("resources.CPU.Mems=%q conversion error: %w",
mems, err)
}
*props = append(*props,
newProp("AllowedMemoryNodes", bits))
}
return nil
}

View File

@@ -0,0 +1,67 @@
package systemd
import (
"encoding/binary"
"strconv"
"strings"
"github.com/pkg/errors"
"github.com/willf/bitset"
)
// rangeToBits converts a text representation of a CPU mask (as written to
// or read from cgroups' cpuset.* files, e.g. "1,3-5") to a slice of bytes
// with the corresponding bits set (as consumed by systemd over dbus as
// AllowedCPUs/AllowedMemoryNodes unit property value).
func rangeToBits(str string) ([]byte, error) {
bits := &bitset.BitSet{}
for _, r := range strings.Split(str, ",") {
// allow extra spaces around
r = strings.TrimSpace(r)
// allow empty elements (extra commas)
if r == "" {
continue
}
ranges := strings.SplitN(r, "-", 2)
if len(ranges) > 1 {
start, err := strconv.ParseUint(ranges[0], 10, 32)
if err != nil {
return nil, err
}
end, err := strconv.ParseUint(ranges[1], 10, 32)
if err != nil {
return nil, err
}
if start > end {
return nil, errors.New("invalid range: " + r)
}
for i := uint(start); i <= uint(end); i++ {
bits.Set(i)
}
} else {
val, err := strconv.ParseUint(ranges[0], 10, 32)
if err != nil {
return nil, err
}
bits.Set(uint(val))
}
}
val := bits.Bytes()
if len(val) == 0 {
// do not allow empty values
return nil, errors.New("empty value")
}
ret := make([]byte, len(val)*8)
for i := range val {
// bitset uses BigEndian internally
binary.BigEndian.PutUint64(ret[i*8:], val[len(val)-1-i])
}
// remove upper all-zero bytes
for ret[0] == 0 {
ret = ret[1:]
}
return ret, nil
}
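A worked example of the conversion: "0-3,7" sets bits 0..3 and 7, i.e. the 64-bit value 0x8f; after big-endian packing and stripping the leading zero bytes, rangeToBits returns []byte{0x8f}. A simplified single-word sketch of the same math using only the standard library (it ignores masks wider than 64 CPUs, which the real code handles via the bitset package):

package main

import (
	"encoding/binary"
	"fmt"
	"strconv"
	"strings"
)

// singleWordRangeToBits handles CPU lists whose highest bit fits in one
// 64-bit word, e.g. "0-3,7" -> []byte{0x8f}.
func singleWordRangeToBits(str string) ([]byte, error) {
	var word uint64
	for _, r := range strings.Split(str, ",") {
		r = strings.TrimSpace(r)
		if r == "" {
			continue
		}
		bounds := strings.SplitN(r, "-", 2)
		start, err := strconv.ParseUint(bounds[0], 10, 32)
		if err != nil {
			return nil, err
		}
		end := start
		if len(bounds) == 2 {
			if end, err = strconv.ParseUint(bounds[1], 10, 32); err != nil {
				return nil, err
			}
		}
		for i := start; i <= end && i < 64; i++ {
			word |= 1 << i
		}
	}
	ret := make([]byte, 8)
	binary.BigEndian.PutUint64(ret, word)
	// strip leading all-zero bytes, as the systemd property expects
	for len(ret) > 1 && ret[0] == 0 {
		ret = ret[1:]
	}
	return ret, nil
}

func main() {
	b, err := singleWordRangeToBits("0-3,7")
	if err != nil {
		panic(err)
	}
	fmt.Printf("%#x\n", b) // 0x8f
}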

View File

@@ -57,7 +57,7 @@ func DetectUID() (int, error) {
}
b, err := exec.Command("busctl", "--user", "--no-pager", "status").CombinedOutput()
if err != nil {
return -1, errors.Wrap(err, "could not execute `busctl --user --no-pager status`")
return -1, errors.Wrapf(err, "could not execute `busctl --user --no-pager status`: %q", string(b))
}
scanner := bufio.NewScanner(bytes.NewReader(b))
for scanner.Scan() {
@@ -102,5 +102,5 @@ func DetectUserDbusSessionBusAddress() (string, error) {
return strings.TrimPrefix(s, "DBUS_SESSION_BUS_ADDRESS="), nil
}
}
return "", errors.New("could not detect DBUS_SESSION_BUS_ADDRESS from `systemctl --user --no-pager show-environment`")
return "", errors.New("could not detect DBUS_SESSION_BUS_ADDRESS from `systemctl --user --no-pager show-environment`. Make sure you have installed the dbus-user-session or dbus-daemon package and then run: `systemctl --user start dbus`")
}

View File

@@ -4,7 +4,6 @@ package systemd
import (
"errors"
"io/ioutil"
"os"
"path/filepath"
"strings"
@@ -13,6 +12,7 @@ import (
systemdDbus "github.com/coreos/go-systemd/v22/dbus"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fs"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/sirupsen/logrus"
)
@@ -90,6 +90,11 @@ func genV1ResourcesProperties(c *configs.Cgroup, conn *systemdDbus.Conn) ([]syst
newProp("TasksMax", uint64(r.PidsLimit)))
}
err = addCpuset(conn, &properties, r.CpusetCpus, r.CpusetMems)
if err != nil {
return nil, err
}
return properties, nil
}
@@ -101,20 +106,23 @@ func (m *legacyManager) Apply(pid int) error {
properties []systemdDbus.Property
)
if c.Resources.Unified != nil {
return cgroups.ErrV1NoUnified
}
m.mu.Lock()
defer m.mu.Unlock()
if c.Paths != nil {
paths := make(map[string]string)
cgMap, err := cgroups.ParseCgroupFile("/proc/self/cgroup")
if err != nil {
return err
}
// XXX(kolyshkin@): why this check is needed?
for name, path := range c.Paths {
_, err := getSubsystemPath(m.cgroups, name)
if err != nil {
// Don't fail if a cgroup hierarchy was not found, just skip this subsystem
if cgroups.IsNotFound(err) {
continue
}
return err
if _, ok := cgMap[name]; ok {
paths[name] = path
}
paths[name] = path
}
m.paths = paths
return cgroups.EnterPid(m.paths, pid)
@@ -179,14 +187,16 @@ func (m *legacyManager) Apply(pid int) error {
return err
}
if err := joinCgroups(c, pid); err != nil {
return err
}
paths := make(map[string]string)
for _, s := range legacySubsystems {
subsystemPath, err := getSubsystemPath(m.cgroups, s.Name())
if err != nil {
// Even if it's `not found` error, we'll return err
// because devices cgroup is hard requirement for
// container security.
if s.Name() == "devices" {
return err
}
// Don't fail if a cgroup hierarchy was not found, just skip this subsystem
if cgroups.IsNotFound(err) {
continue
@@ -196,6 +206,11 @@ func (m *legacyManager) Apply(pid int) error {
paths[s.Name()] = subsystemPath
}
m.paths = paths
if err := m.joinCgroups(pid); err != nil {
return err
}
return nil
}
@@ -212,17 +227,14 @@ func (m *legacyManager) Destroy() error {
}
unitName := getUnitName(m.cgroups)
err = stopUnit(dbusConnection, unitName)
stopErr := stopUnit(dbusConnection, unitName)
// Both on success and on error, cleanup all the cgroups we are aware of.
// Some of them were created directly by Apply() and are not managed by systemd.
if err := cgroups.RemovePaths(m.paths); err != nil {
return err
}
if err != nil {
return err
}
m.paths = make(map[string]string)
return nil
return stopErr
}
func (m *legacyManager) Path(subsys string) string {
@@ -231,48 +243,25 @@ func (m *legacyManager) Path(subsys string) string {
return m.paths[subsys]
}
func join(c *configs.Cgroup, subsystem string, pid int) (string, error) {
path, err := getSubsystemPath(c, subsystem)
if err != nil {
return "", err
}
if err := os.MkdirAll(path, 0755); err != nil {
return "", err
}
if err := cgroups.WriteCgroupProc(path, pid); err != nil {
return "", err
}
return path, nil
}
func joinCgroups(c *configs.Cgroup, pid int) error {
func (m *legacyManager) joinCgroups(pid int) error {
for _, sys := range legacySubsystems {
name := sys.Name()
switch name {
case "name=systemd":
// let systemd handle this
case "cpuset":
path, err := getSubsystemPath(c, name)
if err != nil && !cgroups.IsNotFound(err) {
return err
}
s := &fs.CpusetGroup{}
if err := s.ApplyDir(path, c, pid); err != nil {
return err
}
default:
_, err := join(c, name, pid)
if err != nil {
// Even if it's `not found` error, we'll return err
// because devices cgroup is hard requirement for
// container security.
if name == "devices" {
if path, ok := m.paths[name]; ok {
s := &fs.CpusetGroup{}
if err := s.ApplyDir(path, m.cgroups, pid); err != nil {
return err
}
// For other subsystems, omit the `not found` error
// because they are optional.
if !cgroups.IsNotFound(err) {
}
default:
if path, ok := m.paths[name]; ok {
if err := os.MkdirAll(path, 0755); err != nil {
return err
}
if err := cgroups.WriteCgroupProc(path, pid); err != nil {
return err
}
}
@@ -283,7 +272,7 @@ func joinCgroups(c *configs.Cgroup, pid int) error {
}
func getSubsystemPath(c *configs.Cgroup, subsystem string) (string, error) {
mountpoint, err := cgroups.FindCgroupMountpoint(c.Path, subsystem)
mountpoint, err := cgroups.FindCgroupMountpoint("", subsystem)
if err != nil {
return "", err
}
@@ -309,15 +298,14 @@ func getSubsystemPath(c *configs.Cgroup, subsystem string) (string, error) {
}
func (m *legacyManager) Freeze(state configs.FreezerState) error {
path, err := getSubsystemPath(m.cgroups, "freezer")
if err != nil {
return err
path, ok := m.paths["freezer"]
if !ok {
return errSubsystemDoesNotExist
}
prevState := m.cgroups.Resources.Freezer
m.cgroups.Resources.Freezer = state
freezer := &fs.FreezerGroup{}
err = freezer.Set(path, m.cgroups)
if err != nil {
if err := freezer.Set(path, m.cgroups); err != nil {
m.cgroups.Resources.Freezer = prevState
return err
}
@@ -325,17 +313,17 @@ func (m *legacyManager) Freeze(state configs.FreezerState) error {
}
func (m *legacyManager) GetPids() ([]int, error) {
path, err := getSubsystemPath(m.cgroups, "devices")
if err != nil {
return nil, err
path, ok := m.paths["devices"]
if !ok {
return nil, errSubsystemDoesNotExist
}
return cgroups.GetPids(path)
}
func (m *legacyManager) GetAllPids() ([]int, error) {
path, err := getSubsystemPath(m.cgroups, "devices")
if err != nil {
return nil, err
path, ok := m.paths["devices"]
if !ok {
return nil, errSubsystemDoesNotExist
}
return cgroups.GetAllPids(path)
}
@@ -363,6 +351,9 @@ func (m *legacyManager) Set(container *configs.Config) error {
if m.cgroups.Paths != nil {
return nil
}
if container.Cgroups.Resources.Unified != nil {
return cgroups.ErrV1NoUnified
}
dbusConnection, err := getDbusConnection(false)
if err != nil {
return err
@@ -406,9 +397,9 @@ func (m *legacyManager) Set(container *configs.Config) error {
for _, sys := range legacySubsystems {
// Get the subsystem path, but don't error out for not found cgroups.
path, err := getSubsystemPath(container.Cgroups, sys.Name())
if err != nil && !cgroups.IsNotFound(err) {
return err
path, ok := m.paths[sys.Name()]
if !ok {
continue
}
if err := sys.Set(path, container.Cgroups); err != nil {
return err
@@ -420,7 +411,10 @@ func (m *legacyManager) Set(container *configs.Config) error {
func enableKmem(c *configs.Cgroup) error {
path, err := getSubsystemPath(c, "memory")
if err != nil && !cgroups.IsNotFound(err) {
if err != nil {
if cgroups.IsNotFound(err) {
return nil
}
return err
}
@@ -429,7 +423,7 @@ func enableKmem(c *configs.Cgroup) error {
}
// do not try to enable the kernel memory if we already have
// tasks in the cgroup.
content, err := ioutil.ReadFile(filepath.Join(path, "tasks"))
content, err := fscommon.ReadFile(path, "tasks")
if err != nil {
return err
}
@@ -450,9 +444,9 @@ func (m *legacyManager) GetCgroups() (*configs.Cgroup, error) {
}
func (m *legacyManager) GetFreezerState() (configs.FreezerState, error) {
path, err := getSubsystemPath(m.cgroups, "freezer")
if err != nil && !cgroups.IsNotFound(err) {
return configs.Undefined, err
path, ok := m.paths["freezer"]
if !ok {
return configs.Undefined, nil
}
freezer := &fs.FreezerGroup{}
return freezer.GetState(path)

View File

@@ -3,6 +3,8 @@
package systemd
import (
"fmt"
"math"
"os"
"path/filepath"
"strconv"
@@ -34,6 +36,133 @@ func NewUnifiedManager(config *configs.Cgroup, path string, rootless bool) cgrou
}
}
// unifiedResToSystemdProps tries to convert from Cgroup.Resources.Unified
// key/value map (where key is cgroupfs file name) to systemd unit properties.
// This is on a best-effort basis, so the properties that are not known
// (to this function and/or systemd) are ignored (but logged with "debug"
// log level).
//
// For the list of keys, see https://www.kernel.org/doc/Documentation/cgroup-v2.txt
//
// For the list of systemd unit properties, see systemd.resource-control(5).
func unifiedResToSystemdProps(conn *systemdDbus.Conn, res map[string]string) (props []systemdDbus.Property, _ error) {
var err error
for k, v := range res {
if strings.Contains(k, "/") {
return nil, fmt.Errorf("unified resource %q must be a file name (no slashes)", k)
}
sk := strings.SplitN(k, ".", 2)
if len(sk) != 2 {
return nil, fmt.Errorf("unified resource %q must be in the form CONTROLLER.PARAMETER", k)
}
// Kernel is quite forgiving to extra whitespace
// around the value, and so should we.
v = strings.TrimSpace(v)
// Please keep cases in alphabetical order.
switch k {
case "cpu.max":
// value: quota [period]
quota := int64(0) // 0 means "unlimited" for addCpuQuota, if period is set
period := defCPUQuotaPeriod
sv := strings.Fields(v)
if len(sv) < 1 || len(sv) > 2 {
return nil, fmt.Errorf("unified resource %q value invalid: %q", k, v)
}
// quota
if sv[0] != "max" {
quota, err = strconv.ParseInt(sv[0], 10, 64)
if err != nil {
return nil, fmt.Errorf("unified resource %q period value conversion error: %w", k, err)
}
}
// period
if len(sv) == 2 {
period, err = strconv.ParseUint(sv[1], 10, 64)
if err != nil {
return nil, fmt.Errorf("unified resource %q quota value conversion error: %w", k, err)
}
}
addCpuQuota(conn, &props, quota, period)
case "cpu.weight":
num, err := strconv.ParseUint(v, 10, 64)
if err != nil {
return nil, fmt.Errorf("unified resource %q value conversion error: %w", k, err)
}
props = append(props,
newProp("CPUWeight", num))
case "cpuset.cpus", "cpuset.mems":
bits, err := rangeToBits(v)
if err != nil {
return nil, fmt.Errorf("unified resource %q=%q conversion error: %w", k, v, err)
}
m := map[string]string{
"cpuset.cpus": "AllowedCPUs",
"cpuset.mems": "AllowedMemoryNodes",
}
// systemd only supports these properties since v244
sdVer := systemdVersion(conn)
if sdVer >= 244 {
props = append(props,
newProp(m[k], bits))
} else {
logrus.Debugf("systemd v%d is too old to support %s"+
" (setting will still be applied to cgroupfs)",
sdVer, m[k])
}
case "memory.high", "memory.low", "memory.min", "memory.max", "memory.swap.max":
num := uint64(math.MaxUint64)
if v != "max" {
num, err = strconv.ParseUint(v, 10, 64)
if err != nil {
return nil, fmt.Errorf("unified resource %q value conversion error: %w", k, err)
}
}
m := map[string]string{
"memory.high": "MemoryHigh",
"memory.low": "MemoryLow",
"memory.min": "MemoryMin",
"memory.max": "MemoryMax",
"memory.swap.max": "MemorySwapMax",
}
props = append(props,
newProp(m[k], num))
case "pids.max":
num := uint64(math.MaxUint64)
if v != "max" {
var err error
num, err = strconv.ParseUint(v, 10, 64)
if err != nil {
return nil, fmt.Errorf("unified resource %q value conversion error: %w", k, err)
}
}
props = append(props,
newProp("TasksAccounting", true),
newProp("TasksMax", num))
case "memory.oom.group":
// Setting this to 1 is roughly equivalent to OOMPolicy=kill
// (as per systemd.service(5) and
// https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html),
// but it's not clear what to do if it is unset or set
// to 0 in runc update, as there are two other possible
// values for OOMPolicy (continue/stop).
fallthrough
default:
// Ignore the unknown resource here -- will still be
// applied in Set which calls fs2.Set.
logrus.Debugf("don't know how to convert unified resource %q=%q to systemd unit property; skipping (will still be applied to cgroupfs)", k, v)
}
}
return props, nil
}
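As a concrete illustration of the mapping (values are hypothetical, not from this change): the Unified map below would translate into the listed systemd properties, while the unrecognized key is skipped here and applied to cgroupfs by fs2.Set:

package main

import "fmt"

func main() {
	// Hypothetical Cgroups.Resources.Unified input for unifiedResToSystemdProps.
	unified := map[string]string{
		"cpu.weight":  "200",            // -> CPUWeight=200
		"cpuset.cpus": "0-3",            // -> AllowedCPUs (systemd >= 244)
		"memory.max":  "1073741824",     // -> MemoryMax=1073741824
		"pids.max":    "100",            // -> TasksAccounting=true, TasksMax=100
		"cpu.max":     "500000 1000000", // -> CPU quota/period via addCpuQuota
		"io.weight":   "50",             // not handled here; cgroupfs only
	}
	for k, v := range unified {
		fmt.Printf("%s=%s\n", k, v)
	}
}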
func genV2ResourcesProperties(c *configs.Cgroup, conn *systemdDbus.Conn) ([]systemdDbus.Property, error) {
var properties []systemdDbus.Property
r := c.Resources
@@ -80,8 +209,22 @@ func genV2ResourcesProperties(c *configs.Cgroup, conn *systemdDbus.Conn) ([]syst
newProp("TasksMax", uint64(r.PidsLimit)))
}
err = addCpuset(conn, &properties, r.CpusetCpus, r.CpusetMems)
if err != nil {
return nil, err
}
// ignore r.KernelMemory
// convert Resources.Unified map to systemd properties
if r.Unified != nil {
unifiedProps, err := unifiedResToSystemdProps(conn, r.Unified)
if err != nil {
return nil, err
}
properties = append(properties, unifiedProps...)
}
return properties, nil
}

View File

@@ -15,7 +15,9 @@ import (
"sync"
"time"
units "github.com/docker/go-units"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"github.com/opencontainers/runc/libcontainer/system"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
)
@@ -29,19 +31,19 @@ var (
isUnified bool
)
// HugePageSizeUnitList is a list of the units used by the linux kernel when
// naming the HugePage control files.
// https://www.kernel.org/doc/Documentation/cgroup-v1/hugetlb.txt
// TODO Since the kernel only use KB, MB and GB; TB and PB should be removed,
// depends on https://github.com/docker/go-units/commit/a09cd47f892041a4fac473133d181f5aea6fa393
var HugePageSizeUnitList = []string{"B", "KB", "MB", "GB", "TB", "PB"}
// IsCgroup2UnifiedMode returns whether we are running in cgroup v2 unified mode.
func IsCgroup2UnifiedMode() bool {
isUnifiedOnce.Do(func() {
var st unix.Statfs_t
if err := unix.Statfs(unifiedMountpoint, &st); err != nil {
panic("cannot statfs cgroup root")
err := unix.Statfs(unifiedMountpoint, &st)
if err != nil {
if os.IsNotExist(err) && system.RunningInUserNS() {
// ignore the "not found" error if running in userns
logrus.WithError(err).Debugf("%s missing, assuming cgroup v1", unifiedMountpoint)
isUnified = false
return
}
panic(fmt.Sprintf("cannot statfs cgroup root: %s", err))
}
isUnified = st.Type == unix.CGROUP2_SUPER_MAGIC
})
@@ -86,11 +88,11 @@ func GetAllSubsystems() ([]string, error) {
// - freezer: implemented in kernel 5.2
// We assume these are always available, as it is hard to detect availability.
pseudo := []string{"devices", "freezer"}
data, err := ioutil.ReadFile("/sys/fs/cgroup/cgroup.controllers")
data, err := fscommon.ReadFile("/sys/fs/cgroup", "cgroup.controllers")
if err != nil {
return nil, err
}
subsystems := append(pseudo, strings.Fields(string(data))...)
subsystems := append(pseudo, strings.Fields(data)...)
return subsystems, nil
}
f, err := os.Open("/proc/cgroups")
@@ -207,20 +209,66 @@ func EnterPid(cgroupPaths map[string]string, pid int) error {
return nil
}
func rmdir(path string) error {
err := unix.Rmdir(path)
if err == nil || err == unix.ENOENT {
return nil
}
return &os.PathError{Op: "rmdir", Path: path, Err: err}
}
// RemovePath aims to remove cgroup path. It does so recursively,
// by removing any subdirectories (sub-cgroups) first.
func RemovePath(path string) error {
// try the fast path first
if err := rmdir(path); err == nil {
return nil
}
infos, err := ioutil.ReadDir(path)
if err != nil {
if os.IsNotExist(err) {
err = nil
}
return err
}
for _, info := range infos {
if info.IsDir() {
// We should remove subcgroups dir first
if err = RemovePath(filepath.Join(path, info.Name())); err != nil {
break
}
}
}
if err == nil {
err = rmdir(path)
}
return err
}
// RemovePaths iterates over the provided paths removing them.
// We try to remove all paths five times, with an increasing delay between tries.
// If some cgroups are still not removed after all retries, an appropriate error
// is returned.
func RemovePaths(paths map[string]string) (err error) {
const retries = 5
delay := 10 * time.Millisecond
for i := 0; i < 5; i++ {
for i := 0; i < retries; i++ {
if i != 0 {
time.Sleep(delay)
delay *= 2
}
for s, p := range paths {
os.RemoveAll(p)
// TODO: here probably should be logging
if err := RemovePath(p); err != nil {
// do not log intermediate iterations
switch i {
case 0:
logrus.WithError(err).Warnf("Failed to remove cgroup (will retry)")
case retries - 1:
logrus.WithError(err).Error("Failed to remove cgroup")
}
}
_, err := os.Stat(p)
// We need this strange way of checking cgroups existence because
// RemoveAll almost always returns error, even on already removed
@@ -230,6 +278,8 @@ func RemovePaths(paths map[string]string) (err error) {
}
}
if len(paths) == 0 {
//nolint:ineffassign,staticcheck // done to help garbage collecting: opencontainers/runc#2506
paths = make(map[string]string)
return nil
}
}
@@ -237,27 +287,50 @@ func RemovePaths(paths map[string]string) (err error) {
}
func GetHugePageSize() ([]string, error) {
files, err := ioutil.ReadDir("/sys/kernel/mm/hugepages")
dir, err := os.OpenFile("/sys/kernel/mm/hugepages", unix.O_DIRECTORY|unix.O_RDONLY, 0)
if err != nil {
return []string{}, err
return nil, err
}
var fileNames []string
for _, st := range files {
fileNames = append(fileNames, st.Name())
files, err := dir.Readdirnames(0)
dir.Close()
if err != nil {
return nil, err
}
return getHugePageSizeFromFilenames(fileNames)
return getHugePageSizeFromFilenames(files)
}
func getHugePageSizeFromFilenames(fileNames []string) ([]string, error) {
var pageSizes []string
for _, fileName := range fileNames {
nameArray := strings.Split(fileName, "-")
pageSize, err := units.RAMInBytes(nameArray[1])
if err != nil {
return []string{}, err
pageSizes := make([]string, 0, len(fileNames))
for _, file := range fileNames {
// example: hugepages-1048576kB
val := strings.TrimPrefix(file, "hugepages-")
if len(val) == len(file) {
// unexpected file name: no prefix found
continue
}
sizeString := units.CustomSize("%g%s", float64(pageSize), 1024.0, HugePageSizeUnitList)
pageSizes = append(pageSizes, sizeString)
// The suffix is always "kB" (as of Linux 5.9)
eLen := len(val) - 2
val = strings.TrimSuffix(val, "kB")
if len(val) != eLen {
logrus.Warnf("GetHugePageSize: %s: invalid filename suffix (expected \"kB\")", file)
continue
}
size, err := strconv.Atoi(val)
if err != nil {
return nil, err
}
// Model after https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/mm/hugetlb_cgroup.c?id=eff48ddeab782e35e58ccc8853f7386bbae9dec4#n574
// but in our case the size is in KB already.
if size >= (1 << 20) {
val = strconv.Itoa(size>>20) + "GB"
} else if size >= (1 << 10) {
val = strconv.Itoa(size>>10) + "MB"
} else {
val += "KB"
}
pageSizes = append(pageSizes, val)
}
return pageSizes, nil
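Worked examples of the new conversion: "hugepages-64kB" -> "64KB", "hugepages-2048kB" -> "2MB", "hugepages-1048576kB" -> "1GB". A standalone sketch of the size-formatting step (the directory names are examples of the /sys/kernel/mm/hugepages layout; the real code also validates the prefix and suffix):

package main

import (
	"fmt"
	"strconv"
	"strings"
)

// hugePageSizeName converts a hugepages-<N>kB directory name into the
// "<N>KB"/"<N>MB"/"<N>GB" form used for cgroup hugetlb file names.
func hugePageSizeName(file string) (string, error) {
	val := strings.TrimPrefix(file, "hugepages-")
	val = strings.TrimSuffix(val, "kB")
	size, err := strconv.Atoi(val)
	if err != nil {
		return "", err
	}
	switch {
	case size >= 1<<20:
		return strconv.Itoa(size>>20) + "GB", nil
	case size >= 1<<10:
		return strconv.Itoa(size>>10) + "MB", nil
	default:
		return val + "KB", nil
	}
}

func main() {
	for _, f := range []string{"hugepages-64kB", "hugepages-2048kB", "hugepages-1048576kB"} {
		s, err := hugePageSizeName(f)
		if err != nil {
			panic(err)
		}
		fmt.Println(f, "->", s)
	}
}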
@@ -303,14 +376,14 @@ func WriteCgroupProc(dir string, pid int) error {
return nil
}
cgroupProcessesFile, err := os.OpenFile(filepath.Join(dir, CgroupProcesses), os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0700)
file, err := fscommon.OpenFile(dir, CgroupProcesses, os.O_WRONLY)
if err != nil {
return fmt.Errorf("failed to write %v to %v: %v", pid, CgroupProcesses, err)
}
defer cgroupProcessesFile.Close()
defer file.Close()
for i := 0; i < 5; i++ {
_, err = cgroupProcessesFile.WriteString(strconv.Itoa(pid))
_, err = file.WriteString(strconv.Itoa(pid))
if err == nil {
return nil
}

View File

@@ -1,16 +1,16 @@
package cgroups
import (
"bufio"
"errors"
"fmt"
"io"
"os"
"path/filepath"
"strings"
"sync"
"syscall"
securejoin "github.com/cyphar/filepath-securejoin"
"github.com/moby/sys/mountinfo"
"golang.org/x/sys/unix"
)
@@ -23,7 +23,12 @@ const (
)
var (
errUnified = errors.New("not implemented for cgroup v2 unified hierarchy")
errUnified = errors.New("not implemented for cgroup v2 unified hierarchy")
ErrV1NoUnified = errors.New("invalid configuration: cannot use unified on cgroup v1")
readMountinfoOnce sync.Once
readMountinfoErr error
cgroupMountinfo []*mountinfo.Info
)
type NotFoundError struct {
@@ -90,6 +95,21 @@ func tryDefaultPath(cgroupPath, subsystem string) string {
return path
}
// readCgroupMountinfo returns a list of cgroup v1 mounts (i.e. the ones
// with fstype of "cgroup") for the current running process.
//
// The results are cached (to avoid re-reading mountinfo which is relatively
// expensive), so it is assumed that cgroup mounts are not being changed.
func readCgroupMountinfo() ([]*mountinfo.Info, error) {
readMountinfoOnce.Do(func() {
cgroupMountinfo, readMountinfoErr = mountinfo.GetMounts(
mountinfo.FSTypeFilter("cgroup"),
)
})
return cgroupMountinfo, readMountinfoErr
}
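A minimal usage sketch of the mountinfo API relied on here (a one-off call, without the sync.Once caching above):

package main

import (
	"fmt"

	"github.com/moby/sys/mountinfo"
)

func main() {
	// List cgroup v1 mounts the same way readCgroupMountinfo does.
	mounts, err := mountinfo.GetMounts(mountinfo.FSTypeFilter("cgroup"))
	if err != nil {
		panic(err)
	}
	for _, m := range mounts {
		fmt.Println(m.Mountpoint, m.VFSOptions)
	}
}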
// https://www.kernel.org/doc/Documentation/cgroup-v1/cgroups.txt
func FindCgroupMountpoint(cgroupPath, subsystem string) (string, error) {
if IsCgroup2UnifiedMode() {
@@ -110,56 +130,28 @@ func FindCgroupMountpointAndRoot(cgroupPath, subsystem string) (string, string,
return "", "", errUnified
}
// Avoid parsing mountinfo by checking if subsystem is valid/available.
if !isSubsystemAvailable(subsystem) {
return "", "", NewNotFoundError(subsystem)
}
f, err := os.Open("/proc/self/mountinfo")
mi, err := readCgroupMountinfo()
if err != nil {
return "", "", err
}
defer f.Close()
return findCgroupMountpointAndRootFromReader(f, cgroupPath, subsystem)
return findCgroupMountpointAndRootFromMI(mi, cgroupPath, subsystem)
}
func findCgroupMountpointAndRootFromReader(reader io.Reader, cgroupPath, subsystem string) (string, string, error) {
scanner := bufio.NewScanner(reader)
for scanner.Scan() {
txt := scanner.Text()
fields := strings.Fields(txt)
if len(fields) < 9 {
continue
}
if strings.HasPrefix(fields[4], cgroupPath) {
for _, opt := range strings.Split(fields[len(fields)-1], ",") {
func findCgroupMountpointAndRootFromMI(mounts []*mountinfo.Info, cgroupPath, subsystem string) (string, string, error) {
for _, mi := range mounts {
if strings.HasPrefix(mi.Mountpoint, cgroupPath) {
for _, opt := range strings.Split(mi.VFSOptions, ",") {
if opt == subsystem {
return fields[4], fields[3], nil
return mi.Mountpoint, mi.Root, nil
}
}
}
}
if err := scanner.Err(); err != nil {
return "", "", err
}
return "", "", NewNotFoundError(subsystem)
}
func isSubsystemAvailable(subsystem string) bool {
if IsCgroup2UnifiedMode() {
panic("don't call isSubsystemAvailable from cgroupv2 code")
}
cgroups, err := ParseCgroupFile("/proc/self/cgroup")
if err != nil {
return false
}
_, avail := cgroups[subsystem]
return avail
}
func (m Mount) GetOwnCgroup(cgroups map[string]string) (string, error) {
if len(m.Subsystems) == 0 {
return "", fmt.Errorf("no subsystem for mount")
@@ -168,25 +160,15 @@ func (m Mount) GetOwnCgroup(cgroups map[string]string) (string, error) {
return getControllerPath(m.Subsystems[0], cgroups)
}
func getCgroupMountsHelper(ss map[string]bool, mi io.Reader, all bool) ([]Mount, error) {
func getCgroupMountsHelper(ss map[string]bool, mounts []*mountinfo.Info, all bool) ([]Mount, error) {
res := make([]Mount, 0, len(ss))
scanner := bufio.NewScanner(mi)
numFound := 0
for scanner.Scan() && numFound < len(ss) {
txt := scanner.Text()
sepIdx := strings.Index(txt, " - ")
if sepIdx == -1 {
return nil, fmt.Errorf("invalid mountinfo format")
}
if txt[sepIdx+3:sepIdx+10] == "cgroup2" || txt[sepIdx+3:sepIdx+9] != "cgroup" {
continue
}
fields := strings.Split(txt, " ")
for _, mi := range mounts {
m := Mount{
Mountpoint: fields[4],
Root: fields[3],
Mountpoint: mi.Mountpoint,
Root: mi.Root,
}
for _, opt := range strings.Split(fields[len(fields)-1], ",") {
for _, opt := range strings.Split(mi.VFSOptions, ",") {
seen, known := ss[opt]
if !known || (!all && seen) {
continue
@@ -199,19 +181,18 @@ func getCgroupMountsHelper(ss map[string]bool, mi io.Reader, all bool) ([]Mount,
if len(m.Subsystems) > 0 || all {
res = append(res, m)
}
}
if err := scanner.Err(); err != nil {
return nil, err
if !all && numFound >= len(ss) {
break
}
}
return res, nil
}
func getCgroupMountsV1(all bool) ([]Mount, error) {
f, err := os.Open("/proc/self/mountinfo")
mi, err := readCgroupMountinfo()
if err != nil {
return nil, err
}
defer f.Close()
allSubsystems, err := ParseCgroupFile("/proc/self/cgroup")
if err != nil {
@@ -222,7 +203,8 @@ func getCgroupMountsV1(all bool) ([]Mount, error) {
for s := range allSubsystems {
allMap[s] = false
}
return getCgroupMountsHelper(allMap, f, all)
return getCgroupMountsHelper(allMap, mi, all)
}
// GetOwnCgroup returns the relative path to the cgroup docker is running in.

View File

@@ -2,6 +2,7 @@ package configs
import (
systemdDbus "github.com/coreos/go-systemd/v22/dbus"
"github.com/opencontainers/runc/libcontainer/devices"
)
type FreezerState string
@@ -42,7 +43,7 @@ type Cgroup struct {
type Resources struct {
// Devices is the set of access rules for devices in the container.
Devices []*DeviceRule `json:"devices"`
Devices []*devices.Rule `json:"devices"`
// Memory limit (in bytes)
Memory int64 `json:"memory"`
@@ -127,6 +128,9 @@ type Resources struct {
// CpuWeight sets a proportional bandwidth limit.
CpuWeight uint64 `json:"cpu_weight"`
// Unified is cgroupv2-only key-value map.
Unified map[string]string `json:"unified"`
// SkipDevices allows to skip configuring device permissions.
// Used by e.g. kubelet while creating a parent cgroup (kubepods)
// common for many containers.

View File

@@ -7,6 +7,7 @@ import (
"os/exec"
"time"
"github.com/opencontainers/runc/libcontainer/devices"
"github.com/opencontainers/runtime-spec/specs-go"
"github.com/pkg/errors"
"github.com/sirupsen/logrus"
@@ -92,6 +93,9 @@ type Config struct {
// Path to a directory containing the container's root filesystem.
Rootfs string `json:"rootfs"`
// Umask is the umask to use inside of the container.
Umask *uint32 `json:"umask"`
// Readonlyfs will remount the container's rootfs as readonly where only externally mounted
// bind mounts are writable.
Readonlyfs bool `json:"readonlyfs"`
@@ -104,7 +108,7 @@ type Config struct {
Mounts []*Mount `json:"mounts"`
// The device nodes that should be automatically created within the container upon container start. Note, make sure that the node is marked as allowed in the cgroup as well!
Devices []*Device `json:"devices"`
Devices []*devices.Device `json:"devices"`
MountLabel string `json:"mount_label"`

View File

@@ -1,5 +0,0 @@
package configs
func (d *DeviceRule) Mkdev() (uint64, error) {
return 0, nil
}

View File

@@ -0,0 +1,17 @@
package configs
import "github.com/opencontainers/runc/libcontainer/devices"
type (
// Deprecated: use libcontainer/devices.Device
Device = devices.Device
// Deprecated: use libcontainer/devices.Rule
DeviceRule = devices.Rule
// Deprecated: use libcontainer/devices.Type
DeviceType = devices.Type
// Deprecated: use libcontainer/devices.Permissions
DevicePermissions = devices.Permissions
)

View File

@@ -56,7 +56,7 @@ func IsNamespaceSupported(ns NamespaceType) bool {
if nsFile == "" {
return false
}
_, err := os.Stat(fmt.Sprintf("/proc/self/ns/%s", nsFile))
_, err := os.Stat("/proc/self/ns/" + nsFile)
// a namespace is supported if it exists and we have permissions to read it
supported = err == nil
supportedNamespaces[ns] = supported

View File

@@ -6,10 +6,12 @@ import (
"os"
"path/filepath"
"strings"
"sync"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/intelrdt"
selinux "github.com/opencontainers/selinux/go-selinux"
"golang.org/x/sys/unix"
)
type Validator interface {
@@ -144,6 +146,12 @@ func (v *ConfigValidator) sysctl(config *configs.Config) error {
"kernel.shm_rmid_forced": true,
}
var (
netOnce sync.Once
hostnet bool
hostnetErr error
)
for s := range config.Sysctl {
if validSysctlMap[s] || strings.HasPrefix(s, "fs.mqueue.") {
if config.Namespaces.Contains(configs.NEWIPC) {
@@ -153,16 +161,27 @@ func (v *ConfigValidator) sysctl(config *configs.Config) error {
}
}
if strings.HasPrefix(s, "net.") {
if config.Namespaces.Contains(configs.NEWNET) {
if path := config.Namespaces.PathOf(configs.NEWNET); path != "" {
if err := checkHostNs(s, path); err != nil {
return err
}
// Is container using host netns?
// Here "host" means "current", not "initial".
netOnce.Do(func() {
if !config.Namespaces.Contains(configs.NEWNET) {
hostnet = true
return
}
continue
} else {
return fmt.Errorf("sysctl %q is not allowed in the hosts network namespace", s)
path := config.Namespaces.PathOf(configs.NEWNET)
if path == "" {
// own netns, so hostnet = false
return
}
hostnet, hostnetErr = isHostNetNS(path)
})
if hostnetErr != nil {
return hostnetErr
}
if hostnet {
return fmt.Errorf("sysctl %q not allowed in host network namespace", s)
}
continue
}
if config.Namespaces.Contains(configs.NEWUTS) {
switch s {
@@ -182,21 +201,21 @@ func (v *ConfigValidator) sysctl(config *configs.Config) error {
func (v *ConfigValidator) intelrdt(config *configs.Config) error {
if config.IntelRdt != nil {
if !intelrdt.IsCatEnabled() && !intelrdt.IsMbaEnabled() {
if !intelrdt.IsCATEnabled() && !intelrdt.IsMBAEnabled() {
return errors.New("intelRdt is specified in config, but Intel RDT is not supported or enabled")
}
if !intelrdt.IsCatEnabled() && config.IntelRdt.L3CacheSchema != "" {
if !intelrdt.IsCATEnabled() && config.IntelRdt.L3CacheSchema != "" {
return errors.New("intelRdt.l3CacheSchema is specified in config, but Intel RDT/CAT is not enabled")
}
if !intelrdt.IsMbaEnabled() && config.IntelRdt.MemBwSchema != "" {
if !intelrdt.IsMBAEnabled() && config.IntelRdt.MemBwSchema != "" {
return errors.New("intelRdt.memBwSchema is specified in config, but Intel RDT/MBA is not enabled")
}
if intelrdt.IsCatEnabled() && config.IntelRdt.L3CacheSchema == "" {
if intelrdt.IsCATEnabled() && config.IntelRdt.L3CacheSchema == "" {
return errors.New("Intel RDT/CAT is enabled and intelRdt is specified in config, but intelRdt.l3CacheSchema is empty")
}
if intelrdt.IsMbaEnabled() && config.IntelRdt.MemBwSchema == "" {
if intelrdt.IsMBAEnabled() && config.IntelRdt.MemBwSchema == "" {
return errors.New("Intel RDT/MBA is enabled and intelRdt is specified in config, but intelRdt.memBwSchema is empty")
}
}
@@ -204,43 +223,17 @@ func (v *ConfigValidator) intelrdt(config *configs.Config) error {
return nil
}
func isSymbolicLink(path string) (bool, error) {
fi, err := os.Lstat(path)
if err != nil {
return false, err
func isHostNetNS(path string) (bool, error) {
const currentProcessNetns = "/proc/self/ns/net"
var st1, st2 unix.Stat_t
if err := unix.Stat(currentProcessNetns, &st1); err != nil {
return false, fmt.Errorf("unable to stat %q: %s", currentProcessNetns, err)
}
if err := unix.Stat(path, &st2); err != nil {
return false, fmt.Errorf("unable to stat %q: %s", path, err)
}
return fi.Mode()&os.ModeSymlink == os.ModeSymlink, nil
}
// checkHostNs checks whether network sysctl is used in host namespace.
func checkHostNs(sysctlConfig string, path string) error {
var currentProcessNetns = "/proc/self/ns/net"
// readlink on the current processes network namespace
destOfCurrentProcess, err := os.Readlink(currentProcessNetns)
if err != nil {
return fmt.Errorf("read soft link %q error", currentProcessNetns)
}
// First check if the provided path is a symbolic link
symLink, err := isSymbolicLink(path)
if err != nil {
return fmt.Errorf("could not check that %q is a symlink: %v", path, err)
}
if symLink == false {
// The provided namespace is not a symbolic link,
// it is not the host namespace.
return nil
}
// readlink on the path provided in the struct
destOfContainer, err := os.Readlink(path)
if err != nil {
return fmt.Errorf("read soft link %q error", path)
}
if destOfContainer == destOfCurrentProcess {
return fmt.Errorf("sysctl %q is not allowed in the hosts network namespace", sysctlConfig)
}
return nil
return (st1.Dev == st2.Dev) && (st1.Ino == st2.Ino), nil
}
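The rewritten check above compares namespace identity by the (device, inode) pair instead of following symlinks. A standalone sketch of the same idea, assuming Linux and golang.org/x/sys/unix (sameNetNS is a hypothetical name, not part of the vendored code):

package main

import (
	"fmt"
	"os"

	"golang.org/x/sys/unix"
)

// sameNetNS reports whether path refers to the same network namespace
// as the current process. Two namespace files are the same namespace
// iff their (device, inode) pairs match.
func sameNetNS(path string) (bool, error) {
	var self, other unix.Stat_t
	if err := unix.Stat("/proc/self/ns/net", &self); err != nil {
		return false, fmt.Errorf("stat /proc/self/ns/net: %w", err)
	}
	if err := unix.Stat(path, &other); err != nil {
		return false, fmt.Errorf("stat %q: %w", path, err)
	}
	return self.Dev == other.Dev && self.Ino == other.Ino, nil
}

func main() {
	same, err := sameNetNS("/proc/1/ns/net")
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
	fmt.Println("running in the init (host) network namespace:", same)
}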

View File

@@ -14,6 +14,7 @@ import (
"os/exec"
"path/filepath"
"reflect"
"strconv"
"strings"
"sync"
"time"
@@ -363,24 +364,10 @@ func (c *linuxContainer) start(process *Process) error {
}
parent.forwardChildLogs()
if err := parent.start(); err != nil {
// terminate the process to ensure that it properly is reaped.
if err := ignoreTerminateErrors(parent.terminate()); err != nil {
logrus.Warn(err)
}
return newSystemErrorWithCause(err, "starting container process")
}
// generate a timestamp indicating when the container was started
c.created = time.Now().UTC()
if process.Init {
c.state = &createdState{
c: c,
}
state, err := c.updateState(parent)
if err != nil {
return err
}
c.initProcessStartTime = state.InitProcessStartTime
if process.Init {
if c.config.Hooks != nil {
s, err := c.currentOCIState()
if err != nil {
@@ -463,7 +450,7 @@ func (c *linuxContainer) includeExecFifo(cmd *exec.Cmd) error {
cmd.ExtraFiles = append(cmd.ExtraFiles, os.NewFile(uintptr(fifoFd), fifoName))
cmd.Env = append(cmd.Env,
fmt.Sprintf("_LIBCONTAINER_FIFOFD=%d", stdioFdCount+len(cmd.ExtraFiles)-1))
"_LIBCONTAINER_FIFOFD="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1))
return nil
}
@@ -506,24 +493,24 @@ func (c *linuxContainer) commandTemplate(p *Process, childInitPipe *os.File, chi
if cmd.SysProcAttr == nil {
cmd.SysProcAttr = &unix.SysProcAttr{}
}
cmd.Env = append(cmd.Env, fmt.Sprintf("GOMAXPROCS=%s", os.Getenv("GOMAXPROCS")))
cmd.Env = append(cmd.Env, "GOMAXPROCS="+os.Getenv("GOMAXPROCS"))
cmd.ExtraFiles = append(cmd.ExtraFiles, p.ExtraFiles...)
if p.ConsoleSocket != nil {
cmd.ExtraFiles = append(cmd.ExtraFiles, p.ConsoleSocket)
cmd.Env = append(cmd.Env,
fmt.Sprintf("_LIBCONTAINER_CONSOLE=%d", stdioFdCount+len(cmd.ExtraFiles)-1),
"_LIBCONTAINER_CONSOLE="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1),
)
}
cmd.ExtraFiles = append(cmd.ExtraFiles, childInitPipe)
cmd.Env = append(cmd.Env,
fmt.Sprintf("_LIBCONTAINER_INITPIPE=%d", stdioFdCount+len(cmd.ExtraFiles)-1),
fmt.Sprintf("_LIBCONTAINER_STATEDIR=%s", c.root),
"_LIBCONTAINER_INITPIPE="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1),
"_LIBCONTAINER_STATEDIR="+c.root,
)
cmd.ExtraFiles = append(cmd.ExtraFiles, childLogPipe)
cmd.Env = append(cmd.Env,
fmt.Sprintf("_LIBCONTAINER_LOGPIPE=%d", stdioFdCount+len(cmd.ExtraFiles)-1),
fmt.Sprintf("_LIBCONTAINER_LOGLEVEL=%s", p.LogLevel),
"_LIBCONTAINER_LOGPIPE="+strconv.Itoa(stdioFdCount+len(cmd.ExtraFiles)-1),
"_LIBCONTAINER_LOGLEVEL="+p.LogLevel,
)
// NOTE: when running a container with no PID namespace and the parent process spawning the container is
@@ -693,8 +680,7 @@ var criuFeatures *criurpc.CriuFeatures
func (c *linuxContainer) checkCriuFeatures(criuOpts *CriuOpts, rpcOpts *criurpc.CriuOpts, criuFeat *criurpc.CriuFeatures) error {
var t criurpc.CriuReqType
t = criurpc.CriuReqType_FEATURE_CHECK
t := criurpc.CriuReqType_FEATURE_CHECK
// make sure the features we are looking for are really not from
// some previous check
@@ -777,11 +763,7 @@ func (c *linuxContainer) checkCriuVersion(minVersion int) error {
const descriptorsFilename = "descriptors.json"
func (c *linuxContainer) addCriuDumpMount(req *criurpc.CriuReq, m *configs.Mount) {
mountDest := m.Destination
if strings.HasPrefix(mountDest, c.config.Rootfs) {
mountDest = mountDest[len(c.config.Rootfs):]
}
mountDest := strings.TrimPrefix(m.Destination, c.config.Rootfs)
extMnt := &criurpc.ExtMountMap{
Key: proto.String(mountDest),
Val: proto.String(mountDest),
@@ -853,7 +835,7 @@ func (c *linuxContainer) criuSupportsExtNS(t configs.NamespaceType) bool {
return c.checkCriuVersion(minVersion) == nil
}
func (c *linuxContainer) criuNsToKey(t configs.NamespaceType) string {
func criuNsToKey(t configs.NamespaceType) string {
return "extRoot" + strings.Title(configs.NsName(t)) + "NS"
}
@@ -873,12 +855,50 @@ func (c *linuxContainer) handleCheckpointingExternalNamespaces(rpcOpts *criurpc.
if err := unix.Stat(nsPath, &ns); err != nil {
return err
}
criuExternal := fmt.Sprintf("%s[%d]:%s", configs.NsName(t), ns.Ino, c.criuNsToKey(t))
criuExternal := fmt.Sprintf("%s[%d]:%s", configs.NsName(t), ns.Ino, criuNsToKey(t))
rpcOpts.External = append(rpcOpts.External, criuExternal)
return nil
}
func (c *linuxContainer) handleRestoringNamespaces(rpcOpts *criurpc.CriuOpts, extraFiles *[]*os.File) error {
for _, ns := range c.config.Namespaces {
switch ns.Type {
case configs.NEWNET, configs.NEWPID:
// If the container is running in a network or PID namespace and has
// a path to the network or PID namespace configured, we will dump
// that network or PID namespace as an external namespace and we
// will expect that the namespace exists during restore.
// This basically means that CRIU will ignore the namespace
// and expect it to be setup correctly.
if err := c.handleRestoringExternalNamespaces(rpcOpts, extraFiles, ns.Type); err != nil {
return err
}
default:
// For all other namespaces except NET and PID CRIU has
// a simpler way of joining the existing namespace if set
nsPath := c.config.Namespaces.PathOf(ns.Type)
if nsPath == "" {
continue
}
if ns.Type == configs.NEWCGROUP {
// CRIU has no code to handle NEWCGROUP
return fmt.Errorf("Do not know how to handle namespace %v", ns.Type)
}
// CRIU has code to handle NEWTIME, but it does not seem to be defined in runc
// CRIU will issue a warning for NEWUSER:
// criu/namespaces.c: 'join-ns with user-namespace is not fully tested and dangerous'
rpcOpts.JoinNs = append(rpcOpts.JoinNs, &criurpc.JoinNamespace{
Ns: proto.String(configs.NsName(ns.Type)),
NsFile: proto.String(nsPath),
})
}
}
return nil
}
func (c *linuxContainer) handleRestoringExternalNamespaces(rpcOpts *criurpc.CriuOpts, extraFiles *[]*os.File, t configs.NamespaceType) error {
if !c.criuSupportsExtNS(t) {
return nil
@@ -897,11 +917,12 @@ func (c *linuxContainer) handleRestoringExternalNamespaces(rpcOpts *criurpc.Criu
logrus.Errorf("If a specific network namespace is defined it must exist: %s", err)
return fmt.Errorf("Requested network namespace %v does not exist", nsPath)
}
inheritFd := new(criurpc.InheritFd)
inheritFd.Key = proto.String(c.criuNsToKey(t))
// The offset of four is necessary because 0, 1, 2 and 3 is already
// used by stdin, stdout, stderr, 'criu swrk' socket.
inheritFd.Fd = proto.Int32(int32(4 + len(*extraFiles)))
inheritFd := &criurpc.InheritFd{
Key: proto.String(criuNsToKey(t)),
// The offset of four is necessary because 0, 1, 2 and 3 are
// already used by stdin, stdout, stderr, 'criu swrk' socket.
Fd: proto.Int32(int32(4 + len(*extraFiles))),
}
rpcOpts.InheritFd = append(rpcOpts.InheritFd, inheritFd)
// All open FDs need to be transferred to CRIU via extraFiles
*extraFiles = append(*extraFiles, nsFd)
@@ -1120,11 +1141,7 @@ func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error {
}
func (c *linuxContainer) addCriuRestoreMount(req *criurpc.CriuReq, m *configs.Mount) {
mountDest := m.Destination
if strings.HasPrefix(mountDest, c.config.Rootfs) {
mountDest = mountDest[len(c.config.Rootfs):]
}
mountDest := strings.TrimPrefix(m.Destination, c.config.Rootfs)
extMnt := &criurpc.ExtMountMap{
Key: proto.String(mountDest),
Val: proto.String(m.Source),
@@ -1309,15 +1326,7 @@ func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
c.handleCriuConfigurationFile(req.Opts)
// Same as during checkpointing. If the container has a specific network namespace
// assigned to it, this now expects that the checkpoint will be restored in a
// already created network namespace.
if err := c.handleRestoringExternalNamespaces(req.Opts, &extraFiles, configs.NEWNET); err != nil {
return err
}
// Same for PID namespaces.
if err := c.handleRestoringExternalNamespaces(req.Opts, &extraFiles, configs.NEWPID); err != nil {
if err := c.handleRestoringNamespaces(req.Opts, &extraFiles); err != nil {
return err
}
@@ -1540,7 +1549,7 @@ func (c *linuxContainer) criuSwrk(process *Process, req *criurpc.CriuReq, opts *
buf := make([]byte, 10*4096)
oob := make([]byte, 4096)
for true {
for {
n, oobn, _, _, err := criuClientCon.ReadMsgUnix(buf, oob)
if req.Opts != nil && req.Opts.StatusFd != nil {
// Close status_fd as soon as we got something back from criu,
@@ -1792,10 +1801,6 @@ func (c *linuxContainer) saveState(s *State) (retErr error) {
return os.Rename(tmpFile.Name(), stateFilePath)
}
func (c *linuxContainer) deleteState() error {
return os.Remove(filepath.Join(c.root, stateFilename))
}
func (c *linuxContainer) currentStatus() (Status, error) {
if err := c.refreshState(); err != nil {
return -1, err
@@ -2042,7 +2047,7 @@ func (c *linuxContainer) bootstrapData(cloneFlags uintptr, nsMaps map[configs.Na
// write oom_score_adj
r.AddData(&Bytemsg{
Type: OomScoreAdjAttr,
Value: []byte(fmt.Sprintf("%d", *c.config.OomScoreAdj)),
Value: []byte(strconv.Itoa(*c.config.OomScoreAdj)),
})
}
@@ -2062,9 +2067,21 @@ func ignoreTerminateErrors(err error) error {
if err == nil {
return nil
}
// terminate() might return an error from either Kill or Wait.
// The (*Cmd).Wait documentation says: "If the command fails to run
// or doesn't complete successfully, the error is of type *ExitError".
// Filter out such errors (like "exit status 1" or "signal: killed").
var exitErr *exec.ExitError
if errors.As(err, &exitErr) {
return nil
}
// TODO: use errors.Is(err, os.ErrProcessDone) here and
// remove "process already finished" string comparison below
// once go 1.16 is minimally supported version.
s := err.Error()
switch {
case strings.Contains(s, "process already finished"), strings.Contains(s, "Wait was already called"):
if strings.Contains(s, "process already finished") ||
strings.Contains(s, "Wait was already called") {
return nil
}
return err
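ignoreTerminateErrors now recognizes exit failures structurally via errors.As instead of matching error strings. A small isolated sketch of that pattern (ignorableExit is a hypothetical helper, not from the vendored code):

package main

import (
	"errors"
	"fmt"
	"os/exec"
)

// ignorableExit reports whether err only says that the child exited
// (possibly with a non-zero status or after a signal), which callers
// that just want the process gone can safely ignore.
func ignorableExit(err error) bool {
	if err == nil {
		return true
	}
	var exitErr *exec.ExitError
	return errors.As(err, &exitErr)
}

func main() {
	err := exec.Command("false").Run() // exits with status 1
	fmt.Println(err, "ignorable:", ignorableExit(err))
}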

View File

@@ -1,14 +1,6 @@
package libcontainer
// cgroup restoring strategy provided by criu
type cgMode uint32
const (
CRIU_CG_MODE_SOFT cgMode = 3 + iota // restore cgroup properties if only dir created by criu
CRIU_CG_MODE_FULL // always restore all cgroups and their properties
CRIU_CG_MODE_STRICT // restore all, requiring them to not present in the system
CRIU_CG_MODE_DEFAULT // the same as CRIU_CG_MODE_SOFT
)
import criu "github.com/checkpoint-restore/go-criu/v4/rpc"
type CriuPageServerInfo struct {
Address string // IP address of CRIU page server
@@ -32,7 +24,7 @@ type CriuOpts struct {
PreDump bool // call criu predump to perform iterative checkpoint
PageServer CriuPageServerInfo // allow to dump to criu page server
VethPairs []VethPairName // pass the veth to criu when restore
ManageCgroupsMode cgMode // dump or restore cgroup mode
ManageCgroupsMode criu.CriuCgMode // dump or restore cgroup mode
EmptyNs uint32 // don't c/r properties for namespace from this mask
AutoDedup bool // auto deduplication for incremental dumps
LazyPages bool // restore memory pages lazily using userfaultfd

View File

@@ -1,4 +1,4 @@
package configs
package devices
import (
"fmt"
@@ -11,7 +11,7 @@ const (
)
type Device struct {
DeviceRule
Rule
// Path to the device.
Path string `json:"path"`
@@ -26,10 +26,10 @@ type Device struct {
Gid uint32 `json:"gid"`
}
// DevicePermissions is a cgroupv1-style string to represent device access. It
// Permissions is a cgroupv1-style string to represent device access. It
// has to be a string for backward compatibility reasons, hence why it has
// methods to do set operations.
type DevicePermissions string
type Permissions string
const (
deviceRead uint = (1 << iota)
@@ -37,7 +37,7 @@ const (
deviceMknod
)
func (p DevicePermissions) toSet() uint {
func (p Permissions) toSet() uint {
var set uint
for _, perm := range p {
switch perm {
@@ -52,7 +52,7 @@ func (p DevicePermissions) toSet() uint {
return set
}
func fromSet(set uint) DevicePermissions {
func fromSet(set uint) Permissions {
var perm string
if set&deviceRead == deviceRead {
perm += "r"
@@ -63,53 +63,53 @@ func fromSet(set uint) DevicePermissions {
if set&deviceMknod == deviceMknod {
perm += "m"
}
return DevicePermissions(perm)
return Permissions(perm)
}
// Union returns the union of the two sets of DevicePermissions.
func (p DevicePermissions) Union(o DevicePermissions) DevicePermissions {
// Union returns the union of the two sets of Permissions.
func (p Permissions) Union(o Permissions) Permissions {
lhs := p.toSet()
rhs := o.toSet()
return fromSet(lhs | rhs)
}
// Difference returns the set difference of the two sets of DevicePermissions.
// Difference returns the set difference of the two sets of Permissions.
// In set notation, A.Difference(B) gives you A\B.
func (p DevicePermissions) Difference(o DevicePermissions) DevicePermissions {
func (p Permissions) Difference(o Permissions) Permissions {
lhs := p.toSet()
rhs := o.toSet()
return fromSet(lhs &^ rhs)
}
// Intersection computes the intersection of the two sets of DevicePermissions.
func (p DevicePermissions) Intersection(o DevicePermissions) DevicePermissions {
// Intersection computes the intersection of the two sets of Permissions.
func (p Permissions) Intersection(o Permissions) Permissions {
lhs := p.toSet()
rhs := o.toSet()
return fromSet(lhs & rhs)
}
// IsEmpty returns whether the set of permissions in a DevicePermissions is
// IsEmpty returns whether the set of permissions in a Permissions is
// empty.
func (p DevicePermissions) IsEmpty() bool {
return p == DevicePermissions("")
func (p Permissions) IsEmpty() bool {
return p == Permissions("")
}
// IsValid returns whether the set of permissions is a subset of valid
// permissions (namely, {r,w,m}).
func (p DevicePermissions) IsValid() bool {
func (p Permissions) IsValid() bool {
return p == fromSet(p.toSet())
}
type DeviceType rune
type Type rune
const (
WildcardDevice DeviceType = 'a'
BlockDevice DeviceType = 'b'
CharDevice DeviceType = 'c' // or 'u'
FifoDevice DeviceType = 'p'
WildcardDevice Type = 'a'
BlockDevice Type = 'b'
CharDevice Type = 'c' // or 'u'
FifoDevice Type = 'p'
)
func (t DeviceType) IsValid() bool {
func (t Type) IsValid() bool {
switch t {
case WildcardDevice, BlockDevice, CharDevice, FifoDevice:
return true
@@ -118,7 +118,7 @@ func (t DeviceType) IsValid() bool {
}
}
func (t DeviceType) CanMknod() bool {
func (t Type) CanMknod() bool {
switch t {
case BlockDevice, CharDevice, FifoDevice:
return true
@@ -127,7 +127,7 @@ func (t DeviceType) CanMknod() bool {
}
}
func (t DeviceType) CanCgroup() bool {
func (t Type) CanCgroup() bool {
switch t {
case WildcardDevice, BlockDevice, CharDevice:
return true
@@ -136,10 +136,10 @@ func (t DeviceType) CanCgroup() bool {
}
}
type DeviceRule struct {
type Rule struct {
// Type of device ('c' for char, 'b' for block). If set to 'a', this rule
// acts as a wildcard and all fields other than Allow are ignored.
Type DeviceType `json:"type"`
Type Type `json:"type"`
// Major is the device's major number.
Major int64 `json:"major"`
@@ -149,13 +149,13 @@ type DeviceRule struct {
// Permissions is the set of permissions that this rule applies to (in the
// cgroupv1 format -- any combination of "rwm").
Permissions DevicePermissions `json:"permissions"`
Permissions Permissions `json:"permissions"`
// Allow specifies whether this rule is allowed.
Allow bool `json:"allow"`
}
func (d *DeviceRule) CgroupString() string {
func (d *Rule) CgroupString() string {
var (
major = strconv.FormatInt(d.Major, 10)
minor = strconv.FormatInt(d.Minor, 10)

View File

@@ -1,6 +1,6 @@
// +build !windows
package configs
package devices
import (
"errors"
@@ -8,7 +8,7 @@ import (
"golang.org/x/sys/unix"
)
func (d *DeviceRule) Mkdev() (uint64, error) {
func (d *Rule) Mkdev() (uint64, error) {
if d.Major == Wildcard || d.Minor == Wildcard {
return 0, errors.New("cannot mkdev() device with wildcards")
}

View File

@@ -0,0 +1,5 @@
package devices
func (d *Rule) Mkdev() (uint64, error) {
return 0, nil
}

View File

@@ -0,0 +1,112 @@
package devices
import (
"errors"
"io/ioutil"
"os"
"path/filepath"
"golang.org/x/sys/unix"
)
var (
// ErrNotADevice denotes that a file is not a valid linux device.
ErrNotADevice = errors.New("not a device node")
)
// Testing dependencies
var (
unixLstat = unix.Lstat
ioutilReadDir = ioutil.ReadDir
)
// Given the path to a device and its cgroup_permissions (which cannot be easily queried), look up the
// information about a linux device and return that information as a Device struct.
func DeviceFromPath(path, permissions string) (*Device, error) {
var stat unix.Stat_t
err := unixLstat(path, &stat)
if err != nil {
return nil, err
}
var (
devType Type
mode = stat.Mode
devNumber = uint64(stat.Rdev)
major = unix.Major(devNumber)
minor = unix.Minor(devNumber)
)
switch mode & unix.S_IFMT {
case unix.S_IFBLK:
devType = BlockDevice
case unix.S_IFCHR:
devType = CharDevice
case unix.S_IFIFO:
devType = FifoDevice
default:
return nil, ErrNotADevice
}
return &Device{
Rule: Rule{
Type: devType,
Major: int64(major),
Minor: int64(minor),
Permissions: Permissions(permissions),
},
Path: path,
FileMode: os.FileMode(mode),
Uid: stat.Uid,
Gid: stat.Gid,
}, nil
}
// HostDevices returns all devices that can be found under /dev directory.
func HostDevices() ([]*Device, error) {
return GetDevices("/dev")
}
// GetDevices recursively traverses a directory specified by path
// and returns all devices found there.
func GetDevices(path string) ([]*Device, error) {
files, err := ioutilReadDir(path)
if err != nil {
return nil, err
}
var out []*Device
for _, f := range files {
switch {
case f.IsDir():
switch f.Name() {
// ".lxc" & ".lxd-mounts" added to address https://github.com/lxc/lxd/issues/2825
// ".udev" added to address https://github.com/opencontainers/runc/issues/2093
case "pts", "shm", "fd", "mqueue", ".lxc", ".lxd-mounts", ".udev":
continue
default:
sub, err := GetDevices(filepath.Join(path, f.Name()))
if err != nil {
return nil, err
}
out = append(out, sub...)
continue
}
case f.Name() == "console":
continue
}
device, err := DeviceFromPath(filepath.Join(path, f.Name()), "rwm")
if err != nil {
if err == ErrNotADevice {
continue
}
if os.IsNotExist(err) {
continue
}
return nil, err
}
if device.Type == FifoDevice {
continue
}
out = append(out, device)
}
return out, nil
}
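The helpers added above can be exercised directly. A hedged usage sketch (requires Linux and read access to /dev; the output format is illustrative):

package main

import (
	"fmt"
	"log"

	"github.com/opencontainers/runc/libcontainer/devices"
)

func main() {
	// Enumerate every block/char device under /dev, skipping pts, shm,
	// fd, mqueue and the other special directories, as the vendored
	// HostDevices helper does.
	devs, err := devices.HostDevices()
	if err != nil {
		log.Fatal(err)
	}
	for _, d := range devs {
		fmt.Printf("%s %c %d:%d %s\n", d.Path, d.Type, d.Major, d.Minor, d.Permissions)
	}
}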

View File

@@ -148,11 +148,11 @@ func RootlessCgroupfs(l *LinuxFactory) error {
// containers that use the Intel RDT "resource control" filesystem to
// create and manage Intel RDT resources (e.g., L3 cache, memory bandwidth).
func IntelRdtFs(l *LinuxFactory) error {
l.NewIntelRdtManager = func(config *configs.Config, id string, path string) intelrdt.Manager {
return &intelrdt.IntelRdtManager{
Config: config,
Id: id,
Path: path,
if !intelrdt.IsCATEnabled() && !intelrdt.IsMBAEnabled() {
l.NewIntelRdtManager = nil
} else {
l.NewIntelRdtManager = func(config *configs.Config, id string, path string) intelrdt.Manager {
return intelrdt.NewManager(config, id, path)
}
}
return nil
@@ -276,7 +276,7 @@ func (l *LinuxFactory) Create(id string, config *configs.Config) (Container, err
newgidmapPath: l.NewgidmapPath,
cgroupManager: l.NewCgroupsManager(config.Cgroups, nil),
}
if intelrdt.IsCatEnabled() || intelrdt.IsMbaEnabled() {
if l.NewIntelRdtManager != nil {
c.intelRdtManager = l.NewIntelRdtManager(config, id, "")
}
c.state = &stoppedState{c: c}
@@ -318,13 +318,13 @@ func (l *LinuxFactory) Load(id string) (Container, error) {
root: containerRoot,
created: state.Created,
}
if l.NewIntelRdtManager != nil {
c.intelRdtManager = l.NewIntelRdtManager(&state.Config, id, state.IntelRdtPath)
}
c.state = &loadedState{c: c}
if err := c.refreshState(); err != nil {
return nil, err
}
if intelrdt.IsCatEnabled() || intelrdt.IsMbaEnabled() {
c.intelRdtManager = l.NewIntelRdtManager(&state.Config, id, state.IntelRdtPath)
}
return c, nil
}
@@ -335,35 +335,28 @@ func (l *LinuxFactory) Type() string {
// StartInitialization loads a container by opening the pipe fd from the parent to read the configuration and state
// This is a low level implementation detail of the reexec and should not be consumed externally
func (l *LinuxFactory) StartInitialization() (err error) {
var (
pipefd, fifofd int
consoleSocket *os.File
envInitPipe = os.Getenv("_LIBCONTAINER_INITPIPE")
envFifoFd = os.Getenv("_LIBCONTAINER_FIFOFD")
envConsole = os.Getenv("_LIBCONTAINER_CONSOLE")
)
// Get the INITPIPE.
pipefd, err = strconv.Atoi(envInitPipe)
envInitPipe := os.Getenv("_LIBCONTAINER_INITPIPE")
pipefd, err := strconv.Atoi(envInitPipe)
if err != nil {
return fmt.Errorf("unable to convert _LIBCONTAINER_INITPIPE=%s to int: %s", envInitPipe, err)
}
var (
pipe = os.NewFile(uintptr(pipefd), "pipe")
it = initType(os.Getenv("_LIBCONTAINER_INITTYPE"))
)
pipe := os.NewFile(uintptr(pipefd), "pipe")
defer pipe.Close()
// Only init processes have FIFOFD.
fifofd = -1
fifofd := -1
envInitType := os.Getenv("_LIBCONTAINER_INITTYPE")
it := initType(envInitType)
if it == initStandard {
envFifoFd := os.Getenv("_LIBCONTAINER_FIFOFD")
if fifofd, err = strconv.Atoi(envFifoFd); err != nil {
return fmt.Errorf("unable to convert _LIBCONTAINER_FIFOFD=%s to int: %s", envFifoFd, err)
}
}
if envConsole != "" {
var consoleSocket *os.File
if envConsole := os.Getenv("_LIBCONTAINER_CONSOLE"); envConsole != "" {
console, err := strconv.Atoi(envConsole)
if err != nil {
return fmt.Errorf("unable to convert _LIBCONTAINER_CONSOLE=%s to int: %s", envConsole, err)

View File

@@ -3,6 +3,7 @@
package libcontainer
import (
"bytes"
"encoding/json"
"fmt"
"io"
@@ -12,9 +13,8 @@ import (
"strings"
"unsafe"
"golang.org/x/sys/unix"
"github.com/containerd/console"
"github.com/opencontainers/runc/libcontainer/capabilities"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/system"
@@ -24,6 +24,7 @@ import (
"github.com/pkg/errors"
"github.com/sirupsen/logrus"
"github.com/vishvananda/netlink"
"golang.org/x/sys/unix"
)
type initType string
@@ -128,19 +129,13 @@ func finalizeNamespace(config *initConfig) error {
return errors.Wrap(err, "close exec fds")
}
if config.Cwd != "" {
if err := unix.Chdir(config.Cwd); err != nil {
return fmt.Errorf("chdir to cwd (%q) set in config.json failed: %v", config.Cwd, err)
}
}
capabilities := &configs.Capabilities{}
caps := &configs.Capabilities{}
if config.Capabilities != nil {
capabilities = config.Capabilities
caps = config.Capabilities
} else if config.Config.Capabilities != nil {
capabilities = config.Config.Capabilities
caps = config.Config.Capabilities
}
w, err := newContainerCapList(capabilities)
w, err := capabilities.New(caps)
if err != nil {
return err
}
@@ -155,6 +150,14 @@ func finalizeNamespace(config *initConfig) error {
if err := setupUser(config); err != nil {
return errors.Wrap(err, "setup user")
}
// Change working directory AFTER the user has been set up.
// Otherwise, if the cwd is also a volume that's been chowned to the container user (and not the user running runc),
// this command will EPERM.
if config.Cwd != "" {
if err := unix.Chdir(config.Cwd); err != nil {
return fmt.Errorf("chdir to cwd (%q) set in config.json failed: %v", config.Cwd, err)
}
}
if err := system.ClearKeepCaps(); err != nil {
return errors.Wrap(err, "clear keep caps")
}
@@ -304,7 +307,7 @@ func setupUser(config *initConfig) error {
// There's nothing we can do about /etc/group entries, so we silently
// ignore setting groups here (since the user didn't explicitly ask us to
// set the group).
allowSupGroups := !config.RootlessEUID && strings.TrimSpace(string(setgroups)) != "deny"
allowSupGroups := !config.RootlessEUID && string(bytes.TrimSpace(setgroups)) != "deny"
if allowSupGroups {
suppGroups := append(execUser.Sgids, addGroups...)
@@ -431,6 +434,7 @@ func setupRlimits(limits []configs.Rlimit, pid int) error {
const _P_PID = 1
//nolint:structcheck,unused
type siginfo struct {
si_signo int32
si_errno int32
@@ -480,7 +484,9 @@ func signalAllProcesses(m cgroups.Manager, s os.Signal) error {
}
pids, err := m.GetAllPids()
if err != nil {
m.Freeze(configs.Thawed)
if err := m.Freeze(configs.Thawed); err != nil {
logrus.Warn(err)
}
return err
}
for _, pid := range pids {

View File

@@ -6,17 +6,20 @@ var (
// Check if Intel RDT/CMT is enabled.
func IsCMTEnabled() bool {
featuresInit()
return cmtEnabled
}
func getCMTNumaNodeStats(numaPath string) (*CMTNumaNodeStats, error) {
stats := &CMTNumaNodeStats{}
llcOccupancy, err := getIntelRdtParamUint(numaPath, "llc_occupancy")
if err != nil {
return nil, err
if enabledMonFeatures.llcOccupancy {
llcOccupancy, err := getIntelRdtParamUint(numaPath, "llc_occupancy")
if err != nil {
return nil, err
}
stats.LLCOccupancy = llcOccupancy
}
stats.LLCOccupancy = llcOccupancy
return stats, nil
}

View File

@@ -4,7 +4,9 @@ package intelrdt
import (
"bufio"
"bytes"
"fmt"
"io"
"io/ioutil"
"os"
"path/filepath"
@@ -12,6 +14,7 @@ import (
"strings"
"sync"
"github.com/moby/sys/mountinfo"
"github.com/opencontainers/runc/libcontainer/configs"
)
@@ -162,11 +165,19 @@ type Manager interface {
}
// This implements interface Manager
type IntelRdtManager struct {
type intelRdtManager struct {
mu sync.Mutex
Config *configs.Config
Id string
Path string
config *configs.Config
id string
path string
}
func NewManager(config *configs.Config, id string, path string) Manager {
return &intelRdtManager{
config: config,
id: id,
path: path,
}
}
const (
@@ -179,11 +190,14 @@ var (
intelRdtRootLock sync.Mutex
// The flag to indicate if Intel RDT/CAT is enabled
isCatEnabled bool
catEnabled bool
// The flag to indicate if Intel RDT/MBA is enabled
isMbaEnabled bool
mbaEnabled bool
// The flag to indicate if Intel RDT/MBA Software Controller is enabled
isMbaScEnabled bool
mbaScEnabled bool
// For Intel RDT initialization
initOnce sync.Once
)
type intelRdtData struct {
@@ -192,94 +206,80 @@ type intelRdtData struct {
pid int
}
// Check if Intel RDT sub-features are enabled in init()
func init() {
// 1. Check if hardware and kernel support Intel RDT sub-features
flagsSet, err := parseCpuInfoFile("/proc/cpuinfo")
if err != nil {
return
}
// 2. Check if Intel RDT "resource control" filesystem is mounted
// The user guarantees to mount the filesystem
if !isIntelRdtMounted() {
return
}
// 3. Double check if Intel RDT sub-features are available in
// "resource control" filesystem. Intel RDT sub-features can be
// selectively disabled or enabled by kernel command line
// (e.g., rdt=!l3cat,mba) in 4.14 and newer kernel
if flagsSet.CAT {
if _, err := os.Stat(filepath.Join(intelRdtRoot, "info", "L3")); err == nil {
isCatEnabled = true
}
}
if isMbaScEnabled {
// We confirm MBA Software Controller is enabled in step 2,
// MBA should be enabled because MBA Software Controller
// depends on MBA
isMbaEnabled = true
} else if flagsSet.MBA {
if _, err := os.Stat(filepath.Join(intelRdtRoot, "info", "MB")); err == nil {
isMbaEnabled = true
}
}
if flagsSet.MBMTotal || flagsSet.MBMLocal {
if _, err := os.Stat(filepath.Join(intelRdtRoot, "info", "L3_MON")); err == nil {
mbmEnabled = true
cmtEnabled = true
}
enabledMonFeatures, err = getMonFeatures(intelRdtRoot)
// Check if Intel RDT sub-features are enabled in featuresInit()
func featuresInit() {
initOnce.Do(func() {
// 1. Check if hardware and kernel support Intel RDT sub-features
flagsSet, err := parseCpuInfoFile("/proc/cpuinfo")
if err != nil {
return
}
}
// 2. Check if Intel RDT "resource control" filesystem is mounted
// The user guarantees to mount the filesystem
if !isIntelRdtMounted() {
return
}
// 3. Double check if Intel RDT sub-features are available in
// "resource control" filesystem. Intel RDT sub-features can be
// selectively disabled or enabled by kernel command line
// (e.g., rdt=!l3cat,mba) in 4.14 and newer kernel
if flagsSet.CAT {
if _, err := os.Stat(filepath.Join(intelRdtRoot, "info", "L3")); err == nil {
catEnabled = true
}
}
if mbaScEnabled {
// We confirm MBA Software Controller is enabled in step 2,
// MBA should be enabled because MBA Software Controller
// depends on MBA
mbaEnabled = true
} else if flagsSet.MBA {
if _, err := os.Stat(filepath.Join(intelRdtRoot, "info", "MB")); err == nil {
mbaEnabled = true
}
}
if flagsSet.MBMTotal || flagsSet.MBMLocal || flagsSet.CMT {
if _, err := os.Stat(filepath.Join(intelRdtRoot, "info", "L3_MON")); err != nil {
return
}
enabledMonFeatures, err = getMonFeatures(intelRdtRoot)
if err != nil {
return
}
if enabledMonFeatures.mbmTotalBytes || enabledMonFeatures.mbmLocalBytes {
mbmEnabled = true
}
if enabledMonFeatures.llcOccupancy {
cmtEnabled = true
}
}
})
}
// Return the mount point path of Intel RDT "resource control" filesysem
func findIntelRdtMountpointDir() (string, error) {
f, err := os.Open("/proc/self/mountinfo")
func findIntelRdtMountpointDir(f io.Reader) (string, error) {
mi, err := mountinfo.GetMountsFromReader(f, func(m *mountinfo.Info) (bool, bool) {
// similar to mountinfo.FSTypeFilter but stops after the first match
if m.FSType == "resctrl" {
return false, true // don't skip, stop
}
return true, false // skip, keep going
})
if err != nil {
return "", err
}
defer f.Close()
s := bufio.NewScanner(f)
for s.Scan() {
text := s.Text()
fields := strings.Split(text, " ")
// Safe as mountinfo encodes mountpoints with spaces as \040.
index := strings.Index(text, " - ")
postSeparatorFields := strings.Fields(text[index+3:])
numPostFields := len(postSeparatorFields)
// This is an error as we can't detect if the mount is for "Intel RDT"
if numPostFields == 0 {
return "", fmt.Errorf("Found no fields post '-' in %q", text)
}
if postSeparatorFields[0] == "resctrl" {
// Check that the mount is properly formatted.
if numPostFields < 3 {
return "", fmt.Errorf("Error found less than 3 fields post '-' in %q", text)
}
// Check if MBA Software Controller is enabled through mount option "-o mba_MBps"
if strings.Contains(postSeparatorFields[2], "mba_MBps") {
isMbaScEnabled = true
}
return fields[4], nil
}
}
if err := s.Err(); err != nil {
return "", err
if len(mi) < 1 {
return "", NewNotFoundError("Intel RDT")
}
return "", NewNotFoundError("Intel RDT")
// Check if MBA Software Controller is enabled through mount option "-o mba_MBps"
if strings.Contains(","+mi[0].VFSOptions+",", ",mba_MBps,") {
mbaScEnabled = true
}
return mi[0].Mountpoint, nil
}
// Gets the root path of Intel RDT "resource control" filesystem
@@ -291,7 +291,12 @@ func getIntelRdtRoot() (string, error) {
return intelRdtRoot, nil
}
root, err := findIntelRdtMountpointDir()
f, err := os.Open("/proc/self/mountinfo")
if err != nil {
return "", err
}
root, err := findIntelRdtMountpointDir(f)
f.Close()
if err != nil {
return "", err
}
@@ -306,11 +311,7 @@ func getIntelRdtRoot() (string, error) {
func isIntelRdtMounted() bool {
_, err := getIntelRdtRoot()
if err != nil {
return false
}
return true
return err == nil
}
type cpuInfoFlags struct {
@@ -320,6 +321,8 @@ type cpuInfoFlags struct {
// Memory Bandwidth Monitoring related.
MBMTotal bool
MBMLocal bool
CMT bool // Cache Monitoring Technology
}
func parseCpuInfoFile(path string) (cpuInfoFlags, error) {
@@ -349,6 +352,8 @@ func parseCpuInfoFile(path string) (cpuInfoFlags, error) {
infoFlags.MBMTotal = true
case "cqm_mbm_local":
infoFlags.MBMLocal = true
case "cqm_occup_llc":
infoFlags.CMT = true
}
}
return infoFlags, nil
@@ -387,7 +392,7 @@ func getIntelRdtParamUint(path, file string) (uint64, error) {
return 0, err
}
res, err := parseUint(strings.TrimSpace(string(contents)), 10, 64)
res, err := parseUint(string(bytes.TrimSpace(contents)), 10, 64)
if err != nil {
return res, fmt.Errorf("unable to parse %q as a uint from file %q", string(contents), fileName)
}
@@ -401,14 +406,14 @@ func getIntelRdtParamString(path, file string) (string, error) {
return "", err
}
return strings.TrimSpace(string(contents)), nil
return string(bytes.TrimSpace(contents)), nil
}
func writeFile(dir, file, data string) error {
if dir == "" {
return fmt.Errorf("no such directory for %s", file)
}
if err := ioutil.WriteFile(filepath.Join(dir, file), []byte(data+"\n"), 0700); err != nil {
if err := ioutil.WriteFile(filepath.Join(dir, file), []byte(data+"\n"), 0o600); err != nil {
return fmt.Errorf("failed to write %v to %v: %v", data, file, err)
}
return nil
@@ -515,7 +520,7 @@ func WriteIntelRdtTasks(dir string, pid int) error {
// Don't attach any pid if -1 is specified as a pid
if pid != -1 {
if err := ioutil.WriteFile(filepath.Join(dir, IntelRdtTasks), []byte(strconv.Itoa(pid)), 0700); err != nil {
if err := ioutil.WriteFile(filepath.Join(dir, IntelRdtTasks), []byte(strconv.Itoa(pid)), 0o600); err != nil {
return fmt.Errorf("failed to write %v to %v: %v", pid, IntelRdtTasks, err)
}
}
@@ -523,18 +528,21 @@ func WriteIntelRdtTasks(dir string, pid int) error {
}
// Check if Intel RDT/CAT is enabled
func IsCatEnabled() bool {
return isCatEnabled
func IsCATEnabled() bool {
featuresInit()
return catEnabled
}
// Check if Intel RDT/MBA is enabled
func IsMbaEnabled() bool {
return isMbaEnabled
func IsMBAEnabled() bool {
featuresInit()
return mbaEnabled
}
// Check if Intel RDT/MBA Software Controller is enabled
func IsMbaScEnabled() bool {
return isMbaScEnabled
func IsMBAScEnabled() bool {
featuresInit()
return mbaScEnabled
}
// Get the 'container_id' path in Intel RDT "resource control" filesystem
@@ -549,51 +557,51 @@ func GetIntelRdtPath(id string) (string, error) {
}
// Applies Intel RDT configuration to the process with the specified pid
func (m *IntelRdtManager) Apply(pid int) (err error) {
func (m *intelRdtManager) Apply(pid int) (err error) {
// If intelRdt is not specified in config, we do nothing
if m.Config.IntelRdt == nil {
if m.config.IntelRdt == nil {
return nil
}
d, err := getIntelRdtData(m.Config, pid)
d, err := getIntelRdtData(m.config, pid)
if err != nil && !IsNotFound(err) {
return err
}
m.mu.Lock()
defer m.mu.Unlock()
path, err := d.join(m.Id)
path, err := d.join(m.id)
if err != nil {
return err
}
m.Path = path
m.path = path
return nil
}
// Destroys the Intel RDT 'container_id' group
func (m *IntelRdtManager) Destroy() error {
func (m *intelRdtManager) Destroy() error {
m.mu.Lock()
defer m.mu.Unlock()
if err := os.RemoveAll(m.GetPath()); err != nil {
return err
}
m.Path = ""
m.path = ""
return nil
}
// Returns Intel RDT path to save in a state file and to be able to
// restore the object later
func (m *IntelRdtManager) GetPath() string {
if m.Path == "" {
m.Path, _ = GetIntelRdtPath(m.Id)
func (m *intelRdtManager) GetPath() string {
if m.path == "" {
m.path, _ = GetIntelRdtPath(m.id)
}
return m.Path
return m.path
}
// Returns statistics for Intel RDT
func (m *IntelRdtManager) GetStats() (*Stats, error) {
func (m *intelRdtManager) GetStats() (*Stats, error) {
// If intelRdt is not specified in config
if m.Config.IntelRdt == nil {
if m.config.IntelRdt == nil {
return nil, nil
}
@@ -620,7 +628,7 @@ func (m *IntelRdtManager) GetStats() (*Stats, error) {
}
schemaStrings := strings.Split(tmpStrings, "\n")
if IsCatEnabled() {
if IsCATEnabled() {
// The read-only L3 cache information
l3CacheInfo, err := getL3CacheInfo()
if err != nil {
@@ -643,7 +651,7 @@ func (m *IntelRdtManager) GetStats() (*Stats, error) {
}
}
if IsMbaEnabled() {
if IsMBAEnabled() {
// The read-only memory bandwidth information
memBwInfo, err := getMemBwInfo()
if err != nil {
@@ -666,16 +674,18 @@ func (m *IntelRdtManager) GetStats() (*Stats, error) {
}
}
err = getMonitoringStats(containerPath, stats)
if err != nil {
return nil, err
if IsMBMEnabled() || IsCMTEnabled() {
err = getMonitoringStats(containerPath, stats)
if err != nil {
return nil, err
}
}
return stats, nil
}
// Set Intel RDT "resource control" filesystem as configured.
func (m *IntelRdtManager) Set(container *configs.Config) error {
func (m *intelRdtManager) Set(container *configs.Config) error {
// About L3 cache schema:
// It has allocation bitmasks/values for L3 cache on each socket,
// which contains L3 cache id and capacity bitmask (CBM).
@@ -753,7 +763,7 @@ func (m *IntelRdtManager) Set(container *configs.Config) error {
func (raw *intelRdtData) join(id string) (string, error) {
path := filepath.Join(raw.root, id)
if err := os.MkdirAll(path, 0755); err != nil {
if err := os.MkdirAll(path, 0o755); err != nil {
return "", NewLastCmdError(err)
}
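The resctrl lookup above now goes through github.com/moby/sys/mountinfo instead of hand-parsing /proc/self/mountinfo; the early-stop FilterFunc is the interesting part. A standalone sketch of the same filter (output formatting is illustrative; it assumes the resctrl filesystem is mounted):

package main

import (
	"fmt"
	"log"
	"strings"

	"github.com/moby/sys/mountinfo"
)

func main() {
	// Keep only the first resctrl mount and stop scanning there.
	mounts, err := mountinfo.GetMounts(func(m *mountinfo.Info) (skip, stop bool) {
		if m.FSType == "resctrl" {
			return false, true // keep this entry, stop scanning
		}
		return true, false // skip, keep going
	})
	if err != nil {
		log.Fatal(err)
	}
	if len(mounts) == 0 {
		log.Fatal("Intel RDT resource control filesystem is not mounted")
	}
	m := mounts[0]
	mbaSc := strings.Contains(","+m.VFSOptions+",", ",mba_MBps,")
	fmt.Println("resctrl mounted at", m.Mountpoint, "MBA Software Controller:", mbaSc)
}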

View File

@@ -9,6 +9,7 @@ var (
// Check if Intel RDT/MBM is enabled.
func IsMBMEnabled() bool {
featuresInit()
return mbmEnabled
}

View File

@@ -2,11 +2,12 @@ package intelrdt
import (
"bufio"
"github.com/sirupsen/logrus"
"io"
"io/ioutil"
"os"
"path/filepath"
"github.com/sirupsen/logrus"
)
var (
@@ -21,10 +22,10 @@ type monFeatures struct {
func getMonFeatures(intelRdtRoot string) (monFeatures, error) {
file, err := os.Open(filepath.Join(intelRdtRoot, "info", "L3_MON", "mon_features"))
defer file.Close()
if err != nil {
return monFeatures{}, err
}
defer file.Close()
return parseMonFeatures(file)
}
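The one-line move above is the usual os.Open idiom: defer Close only after the error has been checked, since the returned *os.File is nil on failure. For reference, a minimal hedged sketch (readHead is a hypothetical helper):

package main

import (
	"fmt"
	"log"
	"os"
)

// readHead returns up to 128 bytes from the start of path.
func readHead(path string) (string, error) {
	f, err := os.Open(path)
	if err != nil {
		return "", err // nothing to close: f is nil on error
	}
	defer f.Close() // deferred only after the error check

	buf := make([]byte, 128)
	n, err := f.Read(buf)
	if err != nil {
		return "", err
	}
	return string(buf[:n]), nil
}

func main() {
	s, err := readHead("/proc/version")
	if err != nil {
		log.Fatal(err)
	}
	fmt.Println(s)
}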

View File

@@ -17,15 +17,15 @@ type MemBwInfo struct {
type MBMNumaNodeStats struct {
// The 'mbm_total_bytes' in 'container_id' group.
MBMTotalBytes uint64 `json:"mbm_total_bytes,omitempty"`
MBMTotalBytes uint64 `json:"mbm_total_bytes"`
// The 'mbm_local_bytes' in 'container_id' group.
MBMLocalBytes uint64 `json:"mbm_local_bytes,omitempty"`
MBMLocalBytes uint64 `json:"mbm_local_bytes"`
}
type CMTNumaNodeStats struct {
// The 'llc_occupancy' in 'container_id' group.
LLCOccupancy uint64 `json:"llc_occupancy,omitempty"`
LLCOccupancy uint64 `json:"llc_occupancy"`
}
type Stats struct {

View File

@@ -3,11 +3,11 @@
package libcontainer
import (
"bytes"
"fmt"
"io/ioutil"
"path/filepath"
"strconv"
"strings"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/types"
@@ -79,7 +79,7 @@ func readSysfsNetworkStats(ethInterface, statsFile string) (uint64, error) {
if err != nil {
return 0, err
}
return strconv.ParseUint(strings.TrimSpace(string(data)), 10, 64)
return strconv.ParseUint(string(bytes.TrimSpace(data)), 10, 64)
}
// loopback is a network strategy that provides a basic loopback device

View File

@@ -11,6 +11,7 @@ import (
"os/exec"
"path/filepath"
"strconv"
"time"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/cgroups/fs2"
@@ -85,21 +86,29 @@ func (p *setnsProcess) signal(sig os.Signal) error {
return unix.Kill(p.pid(), s)
}
func (p *setnsProcess) start() (err error) {
func (p *setnsProcess) start() (retErr error) {
defer p.messageSockPair.parent.Close()
err = p.cmd.Start()
err := p.cmd.Start()
// close the write-side of the pipes (controlled by child)
p.messageSockPair.child.Close()
p.logFilePair.child.Close()
if err != nil {
return newSystemErrorWithCause(err, "starting setns process")
}
defer func() {
if retErr != nil {
err := ignoreTerminateErrors(p.terminate())
if err != nil {
logrus.WithError(err).Warn("unable to terminate setnsProcess")
}
}
}()
if p.bootstrapData != nil {
if _, err := io.Copy(p.messageSockPair.parent, p.bootstrapData); err != nil {
return newSystemErrorWithCause(err, "copying bootstrap data to pipe")
}
}
if err = p.execSetns(); err != nil {
if err := p.execSetns(); err != nil {
return newSystemErrorWithCause(err, "executing setns process")
}
if len(p.cgroupPaths) > 0 {
@@ -312,6 +321,11 @@ func (p *initProcess) start() (retErr error) {
}
defer func() {
if retErr != nil {
// terminate the process to ensure we can remove cgroups
if err := ignoreTerminateErrors(p.terminate()); err != nil {
logrus.WithError(err).Warn("unable to terminate initProcess")
}
p.manager.Destroy()
if p.intelRdtManager != nil {
p.intelRdtManager.Destroy()
@@ -411,6 +425,28 @@ func (p *initProcess) start() (retErr error) {
}
}
}
// generate a timestamp indicating when the container was started
p.container.created = time.Now().UTC()
p.container.state = &createdState{
c: p.container,
}
// NOTE: If the procRun state has been synced and the
// runc-create process has been killed for some reason,
// the runc-init[2:stage] process will be leaky. And
// the runc command also fails to parse root directory
// because the container doesn't have state.json.
//
// In order to cleanup the runc-init[2:stage] by
// runc-delete/stop, we should store the status before
// procRun sync.
state, uerr := p.container.updateState(p)
if uerr != nil {
return newSystemErrorWithCause(err, "store init state")
}
p.container.initProcessStartTime = state.InitProcessStartTime
// Sync with child.
if err := writeSync(p.messageSockPair.parent, procRun); err != nil {
return newSystemErrorWithCause(err, "writing syncT 'run'")
@@ -475,14 +511,11 @@ func (p *initProcess) start() (retErr error) {
func (p *initProcess) wait() (*os.ProcessState, error) {
err := p.cmd.Wait()
if err != nil {
return p.cmd.ProcessState, err
}
// we should kill all processes in cgroup when init is died if we use host PID namespace
if p.sharePidns {
signalAllProcesses(p.manager, unix.SIGKILL)
}
return p.cmd.ProcessState, nil
return p.cmd.ProcessState, err
}
func (p *initProcess) terminate() error {
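Both start() paths above now share the same shape: a named return value (retErr) inspected from a deferred block, so a half-started child is terminated and reaped on any later failure. A minimal hedged sketch of that pattern (startWorker and the injected setup step are hypothetical):

package main

import (
	"fmt"
	"os/exec"
)

// startWorker starts cmd and then performs follow-up setup steps.
// If any step after the successful Start fails, the deferred block
// sees the non-nil named return and kills the child so it is reaped.
func startWorker(cmd *exec.Cmd, setup func() error) (retErr error) {
	if err := cmd.Start(); err != nil {
		return fmt.Errorf("starting worker: %w", err)
	}
	defer func() {
		if retErr != nil {
			_ = cmd.Process.Kill()
			_, _ = cmd.Process.Wait()
		}
	}()

	if err := setup(); err != nil {
		return fmt.Errorf("post-start setup: %w", err)
	}
	return nil
}

func main() {
	cmd := exec.Command("sleep", "60")
	err := startWorker(cmd, func() error { return fmt.Errorf("simulated failure") })
	fmt.Println("startWorker:", err)
}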

View File

@@ -18,11 +18,12 @@ import (
"github.com/mrunalp/fileutils"
"github.com/opencontainers/runc/libcontainer/cgroups"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/devices"
"github.com/opencontainers/runc/libcontainer/system"
"github.com/opencontainers/runc/libcontainer/utils"
libcontainerUtils "github.com/opencontainers/runc/libcontainer/utils"
"github.com/opencontainers/runtime-spec/specs-go"
"github.com/opencontainers/selinux/go-selinux/label"
"golang.org/x/sys/unix"
)
@@ -156,7 +157,11 @@ func finalizeRootfs(config *configs.Config) (err error) {
}
}
unix.Umask(0022)
if config.Umask != nil {
unix.Umask(int(*config.Umask))
} else {
unix.Umask(0022)
}
return nil
}
@@ -328,17 +333,20 @@ func mountToRootfs(m *configs.Mount, rootfs, mountLabel string, enableCgroupns b
if err := os.MkdirAll(dest, 0755); err != nil {
return err
}
if err := mountPropagate(m, rootfs, mountLabel); err != nil {
// older kernels do not support labeling of /dev/mqueue
if err := mountPropagate(m, rootfs, ""); err != nil {
return err
}
return label.SetFileLabel(dest, mountLabel)
if err := mountPropagate(m, rootfs, ""); err != nil {
return err
}
return nil
return label.SetFileLabel(dest, mountLabel)
case "tmpfs":
copyUp := m.Extensions&configs.EXT_COPYUP == configs.EXT_COPYUP
tmpDir := ""
// dest might be an absolute symlink, so it needs
// to be resolved under rootfs.
dest, err := securejoin.SecureJoin(rootfs, m.Destination)
if err != nil {
return err
}
m.Destination = dest
stat, err := os.Stat(dest)
if err != nil {
if err := os.MkdirAll(dest, 0755); err != nil {
@@ -382,6 +390,12 @@ func mountToRootfs(m *configs.Mount, rootfs, mountLabel string, enableCgroupns b
return err
}
}
// Initially mounted rw in mountPropagate, remount to ro if flag set.
if m.Flags&unix.MS_RDONLY != 0 {
if err := remount(m, rootfs); err != nil {
return err
}
}
return nil
case "bind":
if err := prepareBindMount(m, rootfs); err != nil {
@@ -475,28 +489,6 @@ func getCgroupMounts(m *configs.Mount) ([]*configs.Mount, error) {
// if source is nil, don't stat the filesystem. This is used for restore of a checkpoint.
func checkProcMount(rootfs, dest, source string) error {
const procPath = "/proc"
// White list, it should be sub directories of invalid destinations
validDestinations := []string{
// These entries can be bind mounted by files emulated by fuse,
// so commands like top, free displays stats in container.
"/proc/cpuinfo",
"/proc/diskstats",
"/proc/meminfo",
"/proc/stat",
"/proc/swaps",
"/proc/uptime",
"/proc/loadavg",
"/proc/net/dev",
}
for _, valid := range validDestinations {
path, err := filepath.Rel(filepath.Join(rootfs, valid), dest)
if err != nil {
return err
}
if path == "." {
return nil
}
}
path, err := filepath.Rel(filepath.Join(rootfs, procPath), dest)
if err != nil {
return err
@@ -522,6 +514,30 @@ func checkProcMount(rootfs, dest, source string) error {
}
return fmt.Errorf("%q cannot be mounted because it is not of type proc", dest)
}
// Here dest is definitely under /proc. Do not allow those,
// except for a few specific entries emulated by lxcfs.
validProcMounts := []string{
"/proc/cpuinfo",
"/proc/diskstats",
"/proc/meminfo",
"/proc/stat",
"/proc/swaps",
"/proc/uptime",
"/proc/loadavg",
"/proc/slabinfo",
"/proc/net/dev",
}
for _, valid := range validProcMounts {
path, err := filepath.Rel(filepath.Join(rootfs, valid), dest)
if err != nil {
return err
}
if path == "." {
return nil
}
}
return fmt.Errorf("%q cannot be mounted because it is inside /proc", dest)
}
@@ -590,6 +606,12 @@ func createDevices(config *configs.Config) error {
useBindMount := system.RunningInUserNS() || config.Namespaces.Contains(configs.NEWUSER)
oldMask := unix.Umask(0000)
for _, node := range config.Devices {
// The /dev/ptmx device is setup by setupPtmx()
if utils.CleanPath(node.Path) == "/dev/ptmx" {
continue
}
// containers running in a user namespace are not allowed to mknod
// devices so we can just bind mount it from the host.
if err := createDeviceNode(config.Rootfs, node, useBindMount); err != nil {
@@ -601,7 +623,7 @@ func createDevices(config *configs.Config) error {
return nil
}
func bindMountDeviceNode(dest string, node *configs.Device) error {
func bindMountDeviceNode(dest string, node *devices.Device) error {
f, err := os.Create(dest)
if err != nil && !os.IsExist(err) {
return err
@@ -613,7 +635,7 @@ func bindMountDeviceNode(dest string, node *configs.Device) error {
}
// Creates the device node in the rootfs of the container.
func createDeviceNode(rootfs string, node *configs.Device, bind bool) error {
func createDeviceNode(rootfs string, node *devices.Device, bind bool) error {
if node.Path == "" {
// The node only exists for cgroup reasons, ignore it here.
return nil
@@ -636,14 +658,14 @@ func createDeviceNode(rootfs string, node *configs.Device, bind bool) error {
return nil
}
func mknodDevice(dest string, node *configs.Device) error {
func mknodDevice(dest string, node *devices.Device) error {
fileMode := node.FileMode
switch node.Type {
case configs.BlockDevice:
case devices.BlockDevice:
fileMode |= unix.S_IFBLK
case configs.CharDevice:
case devices.CharDevice:
fileMode |= unix.S_IFCHR
case configs.FifoDevice:
case devices.FifoDevice:
fileMode |= unix.S_IFIFO
default:
return fmt.Errorf("%c is not a valid device type for device %s", node.Type, node.Path)
@@ -728,7 +750,19 @@ func prepareRoot(config *configs.Config) error {
}
func setReadonly() error {
return unix.Mount("/", "/", "bind", unix.MS_BIND|unix.MS_REMOUNT|unix.MS_RDONLY|unix.MS_REC, "")
flags := uintptr(unix.MS_BIND | unix.MS_REMOUNT | unix.MS_RDONLY)
err := unix.Mount("", "/", "", flags, "")
if err == nil {
return nil
}
var s unix.Statfs_t
if err := unix.Statfs("/", &s); err != nil {
return &os.PathError{Op: "statfs", Path: "/", Err: err}
}
flags |= uintptr(s.Flags)
return unix.Mount("", "/", "", flags, "")
}
func setupPtmx(config *configs.Config) error {
@@ -802,24 +836,46 @@ func pivotRoot(rootfs string) error {
}
func msMoveRoot(rootfs string) error {
// Before we move the root and chroot we have to mask all "full" sysfs and
// procfs mounts which exist on the host. This is because while the kernel
// has protections against mounting procfs if it has masks, when using
// chroot(2) the *host* procfs mount is still reachable in the mount
// namespace and the kernel permits procfs mounts inside --no-pivot
// containers.
//
// Users shouldn't be using --no-pivot except in exceptional circumstances,
// but to avoid such a trivial security flaw we apply a best-effort
// protection here. The kernel only allows a mount of a pseudo-filesystem
// like procfs or sysfs if there is a *full* mount (the root of the
// filesystem is mounted) without any other locked mount points covering a
// subtree of the mount.
//
// So we try to unmount (or mount tmpfs on top of) any mountpoint which is
// a full mount of either sysfs or procfs (since those are the most
// concerning filesystems to us).
mountinfos, err := mountinfo.GetMounts(func(info *mountinfo.Info) (skip, stop bool) {
skip = false
stop = false
// Collect every sysfs and proc file systems, except those under the container rootfs
if (info.Fstype != "proc" && info.Fstype != "sysfs") || strings.HasPrefix(info.Mountpoint, rootfs) {
// Collect every sysfs and procfs filesystem, except for those which
// are non-full mounts or are inside the rootfs of the container.
if info.Root != "/" ||
(info.FSType != "proc" && info.FSType != "sysfs") ||
strings.HasPrefix(info.Mountpoint, rootfs) {
skip = true
return
}
return
})
if err != nil {
return err
}
for _, info := range mountinfos {
p := info.Mountpoint
// Be sure umount events are not propagated to the host.
if err := unix.Mount("", p, "", unix.MS_SLAVE|unix.MS_REC, ""); err != nil {
if err == unix.ENOENT {
// If the mountpoint doesn't exist that means that we've
// already blasted away some parent directory of the mountpoint
// and so we don't care about this error.
continue
}
return err
}
if err := unix.Unmount(p, unix.MNT_DETACH); err != nil {
@@ -834,6 +890,8 @@ func msMoveRoot(rootfs string) error {
}
}
}
// Move the rootfs on top of "/" in our mount namespace.
if err := unix.Mount(rootfs, "/", "", unix.MS_MOVE, ""); err != nil {
return err
}
@@ -950,6 +1008,12 @@ func mountPropagate(m *configs.Mount, rootfs string, mountLabel string) error {
flags &= ^unix.MS_RDONLY
}
// Mount it rw to allow chmod operation. A remount will be performed
// later to make it ro if set.
if m.Device == "tmpfs" {
flags &= ^unix.MS_RDONLY
}
copyUp := m.Extensions&configs.EXT_COPYUP == configs.EXT_COPYUP
if !(copyUp || strings.HasPrefix(dest, rootfs)) {
dest = filepath.Join(rootfs, dest)
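Further up in this file's diff, checkProcMount keeps a short list of /proc entries that may be masked and matches the destination with filepath.Rel, treating a result of "." as "same path". The same trick in isolation (isAllowedProcPath and the trimmed-down list are hypothetical; paths are assumed to be already cleaned and rooted under rootfs as in the diff):

package main

import (
	"fmt"
	"path/filepath"
)

var allowedProcPaths = []string{
	"/proc/cpuinfo",
	"/proc/meminfo",
	"/proc/stat",
	"/proc/net/dev",
}

// isAllowedProcPath reports whether dest is exactly one of the allowed
// entries once both are joined under rootfs. filepath.Rel(a, b)
// returning "." means a and b refer to the same path.
func isAllowedProcPath(rootfs, dest string) bool {
	for _, allowed := range allowedProcPaths {
		rel, err := filepath.Rel(filepath.Join(rootfs, allowed), dest)
		if err == nil && rel == "." {
			return true
		}
	}
	return false
}

func main() {
	fmt.Println(isAllowedProcPath("/rootfs", "/rootfs/proc/meminfo")) // true
	fmt.Println(isAllowedProcPath("/rootfs", "/rootfs/proc/kcore"))   // false
}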

View File

@@ -49,7 +49,7 @@ var archs = map[string]string{
// Attempting to convert a string that is not a valid operator results in an
// error.
func ConvertStringToOperator(in string) (configs.Operator, error) {
if op, ok := operators[in]; ok == true {
if op, ok := operators[in]; ok {
return op, nil
}
return 0, fmt.Errorf("string %s is not a valid operator for seccomp", in)
@@ -62,7 +62,7 @@ func ConvertStringToOperator(in string) (configs.Operator, error) {
// Attempting to convert a string that is not a valid action results in an
// error.
func ConvertStringToAction(in string) (configs.Action, error) {
if act, ok := actions[in]; ok == true {
if act, ok := actions[in]; ok {
return act, nil
}
return 0, fmt.Errorf("string %s is not a valid action for seccomp", in)
@@ -70,7 +70,7 @@ func ConvertStringToAction(in string) (configs.Action, error) {
// ConvertStringToArch converts a string into a Seccomp comparison arch.
func ConvertStringToArch(in string) (string, error) {
if arch, ok := archs[in]; ok == true {
if arch, ok := archs[in]; ok {
return arch, nil
}
return "", fmt.Errorf("string %s is not a valid arch for seccomp", in)

View File

@@ -0,0 +1,628 @@
// +build linux,cgo,seccomp
package patchbpf
import (
"encoding/binary"
"io"
"os"
"runtime"
"unsafe"
"github.com/opencontainers/runc/libcontainer/configs"
"github.com/opencontainers/runc/libcontainer/utils"
"github.com/pkg/errors"
libseccomp "github.com/seccomp/libseccomp-golang"
"github.com/sirupsen/logrus"
"golang.org/x/net/bpf"
"golang.org/x/sys/unix"
)
// #cgo pkg-config: libseccomp
/*
#include <errno.h>
#include <stdint.h>
#include <seccomp.h>
#include <linux/seccomp.h>
const uint32_t C_ACT_ERRNO_ENOSYS = SCMP_ACT_ERRNO(ENOSYS);
// Copied from <linux/seccomp.h>.
#ifndef SECCOMP_SET_MODE_FILTER
# define SECCOMP_SET_MODE_FILTER 1
#endif
const uintptr_t C_SET_MODE_FILTER = SECCOMP_SET_MODE_FILTER;
#ifndef SECCOMP_FILTER_FLAG_LOG
# define SECCOMP_FILTER_FLAG_LOG (1UL << 1)
#endif
const uintptr_t C_FILTER_FLAG_LOG = SECCOMP_FILTER_FLAG_LOG;
// We use the AUDIT_ARCH_* values because those are the ones used by the kernel
// and SCMP_ARCH_* sometimes has fake values (such as SCMP_ARCH_X32). But we
// use <seccomp.h> so we get libseccomp's fallback definitions of AUDIT_ARCH_*.
const uint32_t C_AUDIT_ARCH_I386 = AUDIT_ARCH_I386;
const uint32_t C_AUDIT_ARCH_X86_64 = AUDIT_ARCH_X86_64;
const uint32_t C_AUDIT_ARCH_ARM = AUDIT_ARCH_ARM;
const uint32_t C_AUDIT_ARCH_AARCH64 = AUDIT_ARCH_AARCH64;
const uint32_t C_AUDIT_ARCH_MIPS = AUDIT_ARCH_MIPS;
const uint32_t C_AUDIT_ARCH_MIPS64 = AUDIT_ARCH_MIPS64;
const uint32_t C_AUDIT_ARCH_MIPS64N32 = AUDIT_ARCH_MIPS64N32;
const uint32_t C_AUDIT_ARCH_MIPSEL = AUDIT_ARCH_MIPSEL;
const uint32_t C_AUDIT_ARCH_MIPSEL64 = AUDIT_ARCH_MIPSEL64;
const uint32_t C_AUDIT_ARCH_MIPSEL64N32 = AUDIT_ARCH_MIPSEL64N32;
const uint32_t C_AUDIT_ARCH_PPC = AUDIT_ARCH_PPC;
const uint32_t C_AUDIT_ARCH_PPC64 = AUDIT_ARCH_PPC64;
const uint32_t C_AUDIT_ARCH_PPC64LE = AUDIT_ARCH_PPC64LE;
const uint32_t C_AUDIT_ARCH_S390 = AUDIT_ARCH_S390;
const uint32_t C_AUDIT_ARCH_S390X = AUDIT_ARCH_S390X;
*/
import "C"
var retErrnoEnosys = uint32(C.C_ACT_ERRNO_ENOSYS)
func isAllowAction(action configs.Action) bool {
switch action {
// Trace is considered an "allow" action because a good tracer should
// support future syscalls (by handling -ENOSYS on its own), and giving
// -ENOSYS will be disruptive for emulation.
case configs.Allow, configs.Log, configs.Trace:
return true
default:
return false
}
}
func parseProgram(rdr io.Reader) ([]bpf.RawInstruction, error) {
var program []bpf.RawInstruction
loop:
for {
// Read the next instruction. We have to use NativeEndian because
// seccomp_export_bpf outputs the program in *host* endian-ness.
var insn unix.SockFilter
if err := binary.Read(rdr, utils.NativeEndian, &insn); err != nil {
switch err {
case io.EOF:
// Parsing complete.
break loop
case io.ErrUnexpectedEOF:
// Parsing stopped mid-instruction.
return nil, errors.Wrap(err, "program parsing halted mid-instruction")
default:
// All other errors.
return nil, errors.Wrap(err, "parsing instructions")
}
}
program = append(program, bpf.RawInstruction{
Op: insn.Code,
Jt: insn.Jt,
Jf: insn.Jf,
K: insn.K,
})
}
return program, nil
}
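// Illustrative note (not part of the vendored file): each raw cBPF
// instruction decoded above is a fixed-size struct sock_filter
// { __u16 code; __u8 jt; __u8 jf; __u32 k; }, so binary.Read consumes
// exactly 8 bytes per instruction, in host byte order.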
func disassembleFilter(filter *libseccomp.ScmpFilter) ([]bpf.Instruction, error) {
rdr, wtr, err := os.Pipe()
if err != nil {
return nil, errors.Wrap(err, "creating scratch pipe")
}
defer wtr.Close()
defer rdr.Close()
if err := filter.ExportBPF(wtr); err != nil {
return nil, errors.Wrap(err, "exporting BPF")
}
// Close so that the reader actually gets EOF.
_ = wtr.Close()
// Parse the instructions.
rawProgram, err := parseProgram(rdr)
if err != nil {
return nil, errors.Wrap(err, "parsing generated BPF filter")
}
program, ok := bpf.Disassemble(rawProgram)
if !ok {
return nil, errors.Errorf("could not disassemble entire BPF filter")
}
return program, nil
}
type nativeArch uint32
const invalidArch nativeArch = 0
func archToNative(arch libseccomp.ScmpArch) (nativeArch, error) {
switch arch {
case libseccomp.ArchNative:
// Convert to actual native architecture.
arch, err := libseccomp.GetNativeArch()
if err != nil {
return invalidArch, errors.Wrap(err, "get native arch")
}
return archToNative(arch)
case libseccomp.ArchX86:
return nativeArch(C.C_AUDIT_ARCH_I386), nil
case libseccomp.ArchAMD64, libseccomp.ArchX32:
// NOTE: x32 is treated like x86_64 except all x32 syscalls have the
// 30th bit of the syscall number set to indicate that it's not a
// normal x86_64 syscall.
return nativeArch(C.C_AUDIT_ARCH_X86_64), nil
case libseccomp.ArchARM:
return nativeArch(C.C_AUDIT_ARCH_ARM), nil
case libseccomp.ArchARM64:
return nativeArch(C.C_AUDIT_ARCH_AARCH64), nil
case libseccomp.ArchMIPS:
return nativeArch(C.C_AUDIT_ARCH_MIPS), nil
case libseccomp.ArchMIPS64:
return nativeArch(C.C_AUDIT_ARCH_MIPS64), nil
case libseccomp.ArchMIPS64N32:
return nativeArch(C.C_AUDIT_ARCH_MIPS64N32), nil
case libseccomp.ArchMIPSEL:
return nativeArch(C.C_AUDIT_ARCH_MIPSEL), nil
case libseccomp.ArchMIPSEL64:
return nativeArch(C.C_AUDIT_ARCH_MIPSEL64), nil
case libseccomp.ArchMIPSEL64N32:
return nativeArch(C.C_AUDIT_ARCH_MIPSEL64N32), nil
case libseccomp.ArchPPC:
return nativeArch(C.C_AUDIT_ARCH_PPC), nil
case libseccomp.ArchPPC64:
return nativeArch(C.C_AUDIT_ARCH_PPC64), nil
case libseccomp.ArchPPC64LE:
return nativeArch(C.C_AUDIT_ARCH_PPC64LE), nil
case libseccomp.ArchS390:
return nativeArch(C.C_AUDIT_ARCH_S390), nil
case libseccomp.ArchS390X:
return nativeArch(C.C_AUDIT_ARCH_S390X), nil
default:
return invalidArch, errors.Errorf("unknown architecture: %v", arch)
}
}
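// Illustrative sketch (not part of the vendored file): archToNative is
// typically called with libseccomp.ArchNative to resolve the host's
// AUDIT_ARCH_* value, which is what the kernel reports in seccomp_data.arch:
//
//	hostArch, err := archToNative(libseccomp.ArchNative)
//	if err != nil {
//		return err
//	}
//	// On an amd64 host, hostArch == nativeArch(C.C_AUDIT_ARCH_X86_64).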
type lastSyscallMap map[nativeArch]map[libseccomp.ScmpArch]libseccomp.ScmpSyscall
// Figure out largest syscall number referenced in the filter for each
// architecture. We will be generating code based on the native architecture
// representation, but SCMP_ARCH_X32 means we have to track cases where the
// same architecture has different largest syscalls based on the mode.
func findLastSyscalls(config *configs.Seccomp) (lastSyscallMap, error) {
lastSyscalls := make(lastSyscallMap)
// Only loop over architectures which are present in the filter. Any other
// architectures will get the libseccomp bad architecture action anyway.
for _, ociArch := range config.Architectures {
arch, err := libseccomp.GetArchFromString(ociArch)
if err != nil {
return nil, errors.Wrap(err, "validating seccomp architecture")
}
// Map native architecture to a real architecture value to avoid
// doubling-up the lastSyscall mapping.
if arch == libseccomp.ArchNative {
nativeArch, err := libseccomp.GetNativeArch()
if err != nil {
return nil, errors.Wrap(err, "get native arch")
}
arch = nativeArch
}
// Figure out native architecture representation of the architecture.
nativeArch, err := archToNative(arch)
if err != nil {
return nil, errors.Wrapf(err, "cannot map architecture %v to AUDIT_ARCH_ constant", arch)
}
if _, ok := lastSyscalls[nativeArch]; !ok {
lastSyscalls[nativeArch] = map[libseccomp.ScmpArch]libseccomp.ScmpSyscall{}
}
if _, ok := lastSyscalls[nativeArch][arch]; ok {
// Because of ArchNative we may hit the same entry multiple times.
// Just skip it if we've seen this (nativeArch, ScmpArch)
// combination before.
continue
}
// Find the largest syscall in the filter for this architecture.
var largestSyscall libseccomp.ScmpSyscall
for _, rule := range config.Syscalls {
sysno, err := libseccomp.GetSyscallFromNameByArch(rule.Name, arch)
if err != nil {
// Ignore unknown syscalls.
continue
}
if sysno > largestSyscall {
largestSyscall = sysno
}
}
if largestSyscall != 0 {
lastSyscalls[nativeArch][arch] = largestSyscall
} else {
logrus.Warnf("could not find any syscalls for arch %s", ociArch)
delete(lastSyscalls[nativeArch], arch)
}
}
return lastSyscalls, nil
}
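// Illustrative sketch (hypothetical values, not part of the vendored file):
// for a config whose Architectures list amd64 and x32, findLastSyscalls
// returns one native-arch key with a per-ABI entry for each mode, roughly:
//
//	lastSyscallMap{
//		nativeArch(C.C_AUDIT_ARCH_X86_64): {
//			libseccomp.ArchAMD64: largestAmd64Sysno, // hypothetical
//			libseccomp.ArchX32:   largestX32Sysno,   // hypothetical
//		},
//	}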
// FIXME FIXME FIXME
//
// This solution is less than ideal. In the future it would be great to have
// per-arch information about which syscalls were added in which kernel
// versions so we can create far more accurate filter rules (handling holes in
// the syscall table and determining -ENOSYS requirements based on kernel
// minimum version alone).
//
// This implementation can in principle cause issues with syscalls like
// close_range(2) which were added out-of-order in the syscall table between
// kernel releases.
func generateEnosysStub(lastSyscalls lastSyscallMap) ([]bpf.Instruction, error) {
// A jump-table for each nativeArch used to generate the initial
// conditional jumps -- measured from the *END* of the program so they
// remain valid after prepending to the tail.
archJumpTable := map[nativeArch]uint32{}
// Generate our own -ENOSYS rules for each architecture. They have to be
// generated in reverse (prepended to the tail of the program) because the
// JumpIf jumps need to be computed from the end of the program.
programTail := []bpf.Instruction{
// Fall-through rules jump into the filter.
bpf.Jump{Skip: 1},
// Rules which jump to here get -ENOSYS.
bpf.RetConstant{Val: retErrnoEnosys},
}
// Generate the syscall -ENOSYS rules.
for nativeArch, maxSyscalls := range lastSyscalls {
// The number of instructions from the tail of this section which need
// to be jumped in order to reach the -ENOSYS return. If the section
// does not jump, it will fall through to the actual filter.
baseJumpEnosys := uint32(len(programTail) - 1)
baseJumpFilter := baseJumpEnosys + 1
// Add the load instruction for the syscall number -- we jump here
// directly from the arch code so we need to do it here. Sadly we can't
// share this code between architecture branches.
section := []bpf.Instruction{
// load [0]
bpf.LoadAbsolute{Off: 0, Size: 4}, // NOTE: We assume sizeof(int) == 4.
}
switch len(maxSyscalls) {
case 0:
// No syscalls found for this arch -- skip it and move on.
continue
case 1:
// Get the only syscall in the map.
var sysno libseccomp.ScmpSyscall
for _, no := range maxSyscalls {
sysno = no
}
// The simplest case just boils down to a single jgt instruction,
// with special handling if baseJumpEnosys is larger than 255 (and
// thus a long jump is required).
var sectionTail []bpf.Instruction
if baseJumpEnosys+1 <= 255 {
sectionTail = []bpf.Instruction{
// jgt [syscall],[baseJumpEnosys+1]
bpf.JumpIf{
Cond: bpf.JumpGreaterThan,
Val: uint32(sysno),
SkipTrue: uint8(baseJumpEnosys + 1)},
// ja [baseJumpFilter]
bpf.Jump{Skip: baseJumpFilter},
}
} else {
sectionTail = []bpf.Instruction{
// jle [syscall],1
bpf.JumpIf{Cond: bpf.JumpLessOrEqual, Val: uint32(sysno), SkipTrue: 1},
// ja [baseJumpEnosys+1]
bpf.Jump{Skip: baseJumpEnosys + 1},
// ja [baseJumpFilter]
bpf.Jump{Skip: baseJumpFilter},
}
}
// If we're on x86 we need to add a check for x32 and if we're in
// the wrong mode we jump over the section.
if uint32(nativeArch) == uint32(C.C_AUDIT_ARCH_X86_64) {
// Grab the only architecture in the map.
var scmpArch libseccomp.ScmpArch
for arch := range maxSyscalls {
scmpArch = arch
}
// Generate a prefix to check the mode.
switch scmpArch {
case libseccomp.ArchAMD64:
sectionTail = append([]bpf.Instruction{
// jset (1<<30),[len(tail)-1]
bpf.JumpIf{Cond: bpf.JumpBitsSet,
Val: 1 << 30,
SkipTrue: uint8(len(sectionTail) - 1)},
}, sectionTail...)
case libseccomp.ArchX32:
sectionTail = append([]bpf.Instruction{
// jset (1<<30),0,[len(tail)-1]
bpf.JumpIf{Cond: bpf.JumpBitsNotSet,
Val: 1 << 30,
SkipTrue: uint8(len(sectionTail) - 1)},
}, sectionTail...)
default:
return nil, errors.Errorf("unknown amd64 native architecture %#x", scmpArch)
}
}
section = append(section, sectionTail...)
case 2:
// x32 and x86_64 are a unique case, we can't handle any others.
if uint32(nativeArch) != uint32(C.C_AUDIT_ARCH_X86_64) {
return nil, errors.Errorf("unknown architecture overlap on native arch %#x", nativeArch)
}
x32sysno, ok := maxSyscalls[libseccomp.ArchX32]
if !ok {
return nil, errors.Errorf("missing %v in overlapping x86_64 arch: %v", libseccomp.ArchX32, maxSyscalls)
}
x86sysno, ok := maxSyscalls[libseccomp.ArchAMD64]
if !ok {
return nil, errors.Errorf("missing %v in overlapping x86_64 arch: %v", libseccomp.ArchAMD64, maxSyscalls)
}
// The x32 ABI indicates that a syscall is being made by an x32
// process by setting the 30th bit of the syscall number, but we
// need to do some special-casing depending on whether we need to
// do long jumps.
if baseJumpEnosys+2 <= 255 {
// For the simple case we want to have something like:
// jset (1<<30),1
// jgt [x86 syscall],[baseJumpEnosys+2],1
// jgt [x32 syscall],[baseJumpEnosys+1]
// ja [baseJumpFilter]
section = append(section, []bpf.Instruction{
// jset (1<<30),1
bpf.JumpIf{Cond: bpf.JumpBitsSet, Val: 1 << 30, SkipTrue: 1},
// jgt [x86 syscall],[baseJumpEnosys+2],1
bpf.JumpIf{
Cond: bpf.JumpGreaterThan,
Val: uint32(x86sysno),
SkipTrue: uint8(baseJumpEnosys + 2), SkipFalse: 1},
// jgt [x32 syscall],[baseJumpEnosys+1]
bpf.JumpIf{
Cond: bpf.JumpGreaterThan,
Val: uint32(x32sysno),
SkipTrue: uint8(baseJumpEnosys + 1)},
// ja [baseJumpFilter]
bpf.Jump{Skip: baseJumpFilter},
}...)
} else {
// But if the [baseJumpEnosys+2] jump is larger than 255 we
// need to do a long jump like so:
// jset (1<<30),1
// jgt [x86 syscall],1,2
// jle [x32 syscall],1
// ja [baseJumpEnosys+1]
// ja [baseJumpFilter]
section = append(section, []bpf.Instruction{
// jset (1<<30),1
bpf.JumpIf{Cond: bpf.JumpBitsSet, Val: 1 << 30, SkipTrue: 1},
// jgt [x86 syscall],1,2
bpf.JumpIf{
Cond: bpf.JumpGreaterThan,
Val: uint32(x86sysno),
SkipTrue: 1, SkipFalse: 2},
// jle [x32 syscall],1
bpf.JumpIf{
Cond: bpf.JumpLessOrEqual,
Val: uint32(x32sysno),
SkipTrue: 1},
// ja [baseJumpEnosys+1]
bpf.Jump{Skip: baseJumpEnosys + 1},
// ja [baseJumpFilter]
bpf.Jump{Skip: baseJumpFilter},
}...)
}
default:
return nil, errors.Errorf("invalid number of architecture overlaps: %v", len(maxSyscalls))
}
// Prepend this section to the tail.
programTail = append(section, programTail...)
// Update jump table.
archJumpTable[nativeArch] = uint32(len(programTail))
}
// Add a dummy "jump to filter" for any architecture we might miss below.
// Such architectures will probably get the BadArch action of the filter
// regardless.
programTail = append([]bpf.Instruction{
// ja [end of stub and start of filter]
bpf.Jump{Skip: uint32(len(programTail))},
}, programTail...)
// Generate the jump rules for each architecture. This has to be done in
// reverse as well for the same reason as above. We add to programTail
// directly because the jumps are impacted by each architecture rule we add
// as well.
//
// TODO: Maybe we want to optimise to avoid long jumps here? So sort the
// architectures based on how large the jumps are going to be, or
// re-sort the candidate architectures each time to make sure that we
// pick the largest jump which is going to be smaller than 255.
for nativeArch := range lastSyscalls {
// We jump forwards but the jump table is calculated from the *END*.
jump := uint32(len(programTail)) - archJumpTable[nativeArch]
// Same routine as above -- this is a basic jeq check, complicated
// slightly if it turns out that we need to do a long jump.
if jump <= 255 {
programTail = append([]bpf.Instruction{
// jeq [arch],[jump]
bpf.JumpIf{
Cond: bpf.JumpEqual,
Val: uint32(nativeArch),
SkipTrue: uint8(jump)},
}, programTail...)
} else {
programTail = append([]bpf.Instruction{
// jne [arch],1
bpf.JumpIf{
Cond: bpf.JumpNotEqual,
Val: uint32(nativeArch),
SkipTrue: 1},
// ja [jump]
bpf.Jump{Skip: jump},
}, programTail...)
}
}
// Prepend the load instruction for the architecture.
programTail = append([]bpf.Instruction{
// load [4]
bpf.LoadAbsolute{Off: 4, Size: 4}, // NOTE: We assume sizeof(int) == 4.
}, programTail...)
// And that's all folks!
return programTail, nil
}
func assemble(program []bpf.Instruction) ([]unix.SockFilter, error) {
rawProgram, err := bpf.Assemble(program)
if err != nil {
return nil, errors.Wrap(err, "assembling program")
}
// Convert to []unix.SockFilter for use with unix.SockFprog.
var filter []unix.SockFilter
for _, insn := range rawProgram {
filter = append(filter, unix.SockFilter{
Code: insn.Op,
Jt: insn.Jt,
Jf: insn.Jf,
K: insn.K,
})
}
return filter, nil
}
func generatePatch(config *configs.Seccomp) ([]bpf.Instruction, error) {
// We only add the stub if the default action is not permissive.
if isAllowAction(config.DefaultAction) {
logrus.Debugf("seccomp: skipping -ENOSYS stub filter generation")
return nil, nil
}
lastSyscalls, err := findLastSyscalls(config)
if err != nil {
return nil, errors.Wrap(err, "finding last syscalls for -ENOSYS stub")
}
stubProgram, err := generateEnosysStub(lastSyscalls)
if err != nil {
return nil, errors.Wrap(err, "generating -ENOSYS stub")
}
return stubProgram, nil
}
func enosysPatchFilter(config *configs.Seccomp, filter *libseccomp.ScmpFilter) ([]unix.SockFilter, error) {
program, err := disassembleFilter(filter)
if err != nil {
return nil, errors.Wrap(err, "disassembling original filter")
}
patch, err := generatePatch(config)
if err != nil {
return nil, errors.Wrap(err, "generating patch for filter")
}
fullProgram := append(patch, program...)
logrus.Debugf("seccomp: prepending -ENOSYS stub filter to user filter...")
for idx, insn := range patch {
logrus.Debugf(" [%4.1d] %s", idx, insn)
}
logrus.Debugf(" [....] --- original filter ---")
fprog, err := assemble(fullProgram)
if err != nil {
return nil, errors.Wrap(err, "assembling modified filter")
}
return fprog, nil
}
func filterFlags(filter *libseccomp.ScmpFilter) (flags uint, noNewPrivs bool, err error) {
// Ignore the error since pre-2.4 libseccomp is treated as API level 0.
apiLevel, _ := libseccomp.GetApi()
noNewPrivs, err = filter.GetNoNewPrivsBit()
if err != nil {
return 0, false, errors.Wrap(err, "fetch no_new_privs filter bit")
}
if apiLevel >= 3 {
if logBit, err := filter.GetLogBit(); err != nil {
return 0, false, errors.Wrap(err, "fetch SECCOMP_FILTER_FLAG_LOG bit")
} else if logBit {
flags |= uint(C.C_FILTER_FLAG_LOG)
}
}
// TODO: Support seccomp flags not yet added to libseccomp-golang...
return
}
func sysSeccompSetFilter(flags uint, filter []unix.SockFilter) (err error) {
fprog := unix.SockFprog{
Len: uint16(len(filter)),
Filter: &filter[0],
}
// If no seccomp flags were requested we can use the old-school prctl(2).
if flags == 0 {
err = unix.Prctl(unix.PR_SET_SECCOMP,
unix.SECCOMP_MODE_FILTER,
uintptr(unsafe.Pointer(&fprog)), 0, 0)
} else {
// Only a non-zero errno indicates failure; assigning the raw return value
// directly would turn Errno(0) into a non-nil error.
_, _, errno := unix.RawSyscall(unix.SYS_SECCOMP,
uintptr(C.C_SET_MODE_FILTER),
uintptr(flags), uintptr(unsafe.Pointer(&fprog)))
if errno != 0 {
err = errno
}
}
runtime.KeepAlive(filter)
runtime.KeepAlive(fprog)
return
}
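// Illustrative note (not part of the vendored file): the RawSyscall branch is
// the moral equivalent of the C call
//
//	seccomp(SECCOMP_SET_MODE_FILTER, flags, &fprog);
//
// which is needed whenever filter flags such as SECCOMP_FILTER_FLAG_LOG are
// requested, because prctl(PR_SET_SECCOMP) has no way to pass flags.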
// PatchAndLoad takes a seccomp configuration and a libseccomp filter which has
// been pre-configured with the set of rules in the seccomp config. It then
// patches said filter to handle -ENOSYS in a much nicer manner than
// libseccomp's default-action behaviour would, and loads the patched filter
// into the kernel for the current process.
func PatchAndLoad(config *configs.Seccomp, filter *libseccomp.ScmpFilter) error {
// Generate a patched filter.
fprog, err := enosysPatchFilter(config, filter)
if err != nil {
return errors.Wrap(err, "patching filter")
}
// Get the set of libseccomp flags set.
seccompFlags, noNewPrivs, err := filterFlags(filter)
if err != nil {
return errors.Wrap(err, "fetch seccomp filter flags")
}
// Set no_new_privs if it was requested, though in runc we handle
// no_new_privs separately so warn if we hit this path.
if noNewPrivs {
logrus.Warnf("potentially misconfigured filter -- setting no_new_privs in seccomp path")
if err := unix.Prctl(unix.PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); err != nil {
return errors.Wrap(err, "enable no_new_privs bit")
}
}
// Finally, load the filter.
if err := sysSeccompSetFilter(seccompFlags, fprog); err != nil {
return errors.Wrap(err, "loading seccomp filter")
}
return nil
}
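// Usage sketch (mirrors the seccomp_linux.go hunk further below, shown here
// for context): the caller builds the libseccomp filter from the config and
// then hands both to PatchAndLoad instead of calling filter.Load() directly:
//
//	if err := patchbpf.PatchAndLoad(config, filter); err != nil {
//		return fmt.Errorf("error loading seccomp filter into kernel: %s", err)
//	}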

View File

@@ -0,0 +1,3 @@
// +build !linux !cgo !seccomp
package patchbpf

View File

@@ -10,8 +10,9 @@ import (
"strings"
"github.com/opencontainers/runc/libcontainer/configs"
libseccomp "github.com/seccomp/libseccomp-golang"
"github.com/opencontainers/runc/libcontainer/seccomp/patchbpf"
libseccomp "github.com/seccomp/libseccomp-golang"
"golang.org/x/sys/unix"
)
@@ -54,7 +55,6 @@ func InitSeccomp(config *configs.Seccomp) error {
if err != nil {
return fmt.Errorf("error validating Seccomp architecture: %s", err)
}
if err := filter.AddArch(scmpArch); err != nil {
return fmt.Errorf("error adding architecture to seccomp filter: %s", err)
}
@@ -70,16 +70,13 @@ func InitSeccomp(config *configs.Seccomp) error {
if call == nil {
return errors.New("encountered nil syscall while initializing Seccomp")
}
if err = matchCall(filter, call); err != nil {
if err := matchCall(filter, call); err != nil {
return err
}
}
if err = filter.Load(); err != nil {
if err := patchbpf.PatchAndLoad(config, filter); err != nil {
return fmt.Errorf("error loading seccomp filter into kernel: %s", err)
}
return nil
}
@@ -190,7 +187,7 @@ func matchCall(filter *libseccomp.ScmpFilter, call *configs.Syscall) error {
// Unconditional match - just add the rule
if len(call.Args) == 0 {
if err = filter.AddRule(callNum, callAct); err != nil {
if err := filter.AddRule(callNum, callAct); err != nil {
return fmt.Errorf("error adding seccomp filter rule for syscall %s: %s", call.Name, err)
}
} else {
@@ -224,14 +221,14 @@ func matchCall(filter *libseccomp.ScmpFilter, call *configs.Syscall) error {
for _, cond := range conditions {
condArr := []libseccomp.ScmpCondition{cond}
if err = filter.AddRuleConditional(callNum, callAct, condArr); err != nil {
if err := filter.AddRuleConditional(callNum, callAct, condArr); err != nil {
return fmt.Errorf("error adding seccomp rule for syscall %s: %s", call.Name, err)
}
}
} else {
// No conditions share same argument
// Use new, proper behavior
if err = filter.AddRuleConditional(callNum, callAct, conditions); err != nil {
if err := filter.AddRuleConditional(callNum, callAct, conditions); err != nil {
return fmt.Errorf("error adding seccomp rule for syscall %s: %s", call.Name, err)
}
}
@@ -266,3 +263,8 @@ func parseStatusFile(path string) (map[string]string, error) {
return status, nil
}
// Version returns major, minor, and micro.
func Version() (uint, uint, uint) {
return libseccomp.GetLibraryVersion()
}

View File

@@ -22,3 +22,8 @@ func InitSeccomp(config *configs.Seccomp) error {
func IsEnabled() bool {
return false
}
// Version returns major, minor, and micro.
func Version() (uint, uint, uint) {
return 0, 0, 0
}

View File

@@ -3,7 +3,6 @@
package libcontainer
import (
"fmt"
"os"
"runtime"
@@ -25,7 +24,7 @@ type linuxSetnsInit struct {
}
func (l *linuxSetnsInit) getSessionRingName() string {
return fmt.Sprintf("_ses.%s", l.config.ContainerId)
return "_ses." + l.config.ContainerId
}
func (l *linuxSetnsInit) Init() error {

View File

@@ -3,10 +3,10 @@
package libcontainer
import (
"fmt"
"os"
"os/exec"
"runtime"
"strconv"
"github.com/opencontainers/runc/libcontainer/apparmor"
"github.com/opencontainers/runc/libcontainer/configs"
@@ -40,7 +40,7 @@ func (l *linuxStandardInit) getSessionRingParams() (string, uint32, uint32) {
// Create a unique per session container name that we can join in setns;
// However, other containers can also join it.
return fmt.Sprintf("_ses.%s", l.config.ContainerId), 0xffffffff, newperms
return "_ses." + l.config.ContainerId, 0xffffffff, newperms
}
func (l *linuxStandardInit) Init() error {
@@ -185,7 +185,7 @@ func (l *linuxStandardInit) Init() error {
// user process. We open it through /proc/self/fd/$fd, because the fd that
// was given to us was an O_PATH fd to the fifo itself. Linux allows us to
// re-open an O_PATH fd through /proc.
fd, err := unix.Open(fmt.Sprintf("/proc/self/fd/%d", l.fifoFd), unix.O_WRONLY|unix.O_CLOEXEC, 0)
fd, err := unix.Open("/proc/self/fd/"+strconv.Itoa(l.fifoFd), unix.O_WRONLY|unix.O_CLOEXEC, 0)
if err != nil {
return newSystemErrorWithCause(err, "open exec fifo")
}

View File

@@ -38,7 +38,8 @@ type containerState interface {
}
func destroy(c *linuxContainer) error {
if !c.config.Namespaces.Contains(configs.NEWPID) {
if !c.config.Namespaces.Contains(configs.NEWPID) ||
c.config.Namespaces.PathOf(configs.NEWPID) != "" {
if err := signalAllProcesses(c.cgroupManager, unix.SIGKILL); err != nil {
logrus.Warn(err)
}

View File

@@ -71,16 +71,6 @@ func Stat(pid int) (stat Stat_t, err error) {
return parseStat(string(bytes))
}
// GetProcessStartTime is deprecated. Use Stat(pid) and
// Stat_t.StartTime instead.
func GetProcessStartTime(pid int) (string, error) {
stat, err := Stat(pid)
if err != nil {
return "", err
}
return fmt.Sprintf("%d", stat.StartTime), nil
}
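// Illustrative sketch (not part of the diff): callers of the removed helper
// can use Stat directly and format the start time themselves:
//
//	stat, err := Stat(pid)
//	if err != nil {
//		return "", err
//	}
//	startTime := fmt.Sprintf("%d", stat.StartTime)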
func parseStat(data string) (stat Stat_t, err error) {
// From proc(5), field 2 could contain space and is inside `(` and `)`.
// The following is an example:

View File

@@ -3,8 +3,8 @@
package user
import (
"fmt"
"os/user"
"strconv"
)
func lookupUser(username string) (User, error) {
@@ -16,7 +16,7 @@ func lookupUser(username string) (User, error) {
}
func lookupUid(uid int) (User, error) {
u, err := user.LookupId(fmt.Sprintf("%d", uid))
u, err := user.LookupId(strconv.Itoa(uid))
if err != nil {
return User{}, err
}
@@ -32,7 +32,7 @@ func lookupGroup(groupname string) (Group, error) {
}
func lookupGid(gid int) (Group, error) {
g, err := user.LookupGroupId(fmt.Sprintf("%d", gid))
g, err := user.LookupGroupId(strconv.Itoa(gid))
if err != nil {
return Group{}, err
}

View File

@@ -466,7 +466,7 @@ func GetAdditionalGroups(additionalGroups []string, group io.Reader) ([]int, err
// we asked for a group but didn't find it. let's check to see
// if we wanted a numeric group
if !found {
gid, err := strconv.Atoi(ag)
gid, err := strconv.ParseInt(ag, 10, 64)
if err != nil {
return nil, fmt.Errorf("Unable to find group %s", ag)
}
@@ -474,7 +474,7 @@ func GetAdditionalGroups(additionalGroups []string, group io.Reader) ([]int, err
if gid < minId || gid > maxId {
return nil, ErrRange
}
gidMap[gid] = struct{}{}
gidMap[int(gid)] = struct{}{}
}
}
gids := []int{}
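// Illustrative note (assumption about the motivation, not part of the diff):
// parsing with an explicit 64-bit size keeps the result identical on 32-bit
// and 64-bit builds, so the minId/maxId check above is what rejects
// out-of-range values rather than a platform-dependent Atoi overflow:
//
//	gid, err := strconv.ParseInt("4294967295", 10, 64) // ok on any platform
//	_, err2 := strconv.Atoi("4294967295")              // range error where int is 32-bit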

View File

@@ -1,6 +1,7 @@
package utils
import (
"encoding/binary"
"encoding/json"
"io"
"os"
@@ -15,6 +16,20 @@ const (
exitSignalOffset = 128
)
// NativeEndian is the native byte order of the host system.
var NativeEndian binary.ByteOrder
func init() {
// Copied from <golang.org/x/net/internal/socket/sys.go>.
i := uint32(1)
b := (*[4]byte)(unsafe.Pointer(&i))
if b[0] == 1 {
NativeEndian = binary.LittleEndian
} else {
NativeEndian = binary.BigEndian
}
}
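// Illustrative note (not part of the diff): NativeEndian is what allows the
// seccomp patchbpf code to decode the host-endian BPF program emitted by
// seccomp_export_bpf, e.g. reading one raw instruction:
//
//	var insn unix.SockFilter
//	err := binary.Read(rdr, utils.NativeEndian, &insn)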
// ResolveRootfs ensures that the current working directory is
// not a symlink and returns the absolute path to the rootfs
func ResolveRootfs(uncleanRootfs string) (string, error) {
@@ -106,7 +121,3 @@ func Annotations(labels []string) (bundle string, userAnnotations map[string]str
}
return
}
func GetIntSize() int {
return int(unsafe.Sizeof(1))
}