Have separate spec builder for each platform

Signed-off-by: Maksym Pavlenko <pavlenko.maksym@gmail.com>
2023-01-11 13:12:25 -08:00 · 2023-01-11 13:12:25 -08:00 · 40be96efa9
commit 40be96efa9
parent fdfa3519a3
21 changed files with 1086 additions and 746 deletions
--- a/oci/spec_opts_windows.go
+++ b/oci/spec_opts_windows.go
@ -45,3 +45,10 @@ func WithHostDevices(_ context.Context, _ Client, _ *containers.Container, s *Sp
 func DeviceFromPath(path string) (*specs.LinuxDevice, error) {
 	return nil, errors.New("device from path not supported on Windows")
 }
+
+// WithDevices is a no-op on Windows: the returned SpecOpts ignores
+// devicePath, containerPath and permissions and leaves the spec untouched.
+func WithDevices(devicePath, containerPath, permissions string) SpecOpts {
+	return func(_ context.Context, _ Client, _ *containers.Container, _ *Spec) error {
+		return nil
+	}
+}
--- a/pkg/cri/opts/container.go
+++ b/pkg/cri/opts/container.go
@ -25,13 +25,14 @@ import (
 	goruntime "runtime"
 	"strings"

+	"github.com/containerd/continuity/fs"
+
 	"github.com/containerd/containerd"
 	"github.com/containerd/containerd/containers"
 	"github.com/containerd/containerd/errdefs"
 	"github.com/containerd/containerd/log"
 	"github.com/containerd/containerd/mount"
 	"github.com/containerd/containerd/snapshots"
-	"github.com/containerd/continuity/fs"
 )

 // WithNewSnapshot wraps `containerd.WithNewSnapshot` so that if creating the
--- a/pkg/cri/opts/spec_linux.go
+++ b/pkg/cri/opts/spec_linux.go
@ -22,8 +22,6 @@ import (
 	"fmt"
 	"os"
 	"path/filepath"
-	"sort"
-	"strconv"
 	"strings"
 	"sync"
 	"syscall"
@ -31,255 +29,15 @@ import (
 	"github.com/container-orchestrated-devices/container-device-interface/pkg/cdi"
 	"github.com/containerd/cgroups/v3"
 	"github.com/containerd/cgroups/v3/cgroup1"
-	runtimespec "github.com/opencontainers/runtime-spec/specs-go"
-	"github.com/opencontainers/selinux/go-selinux/label"
 	"github.com/sirupsen/logrus"
 	"golang.org/x/sys/unix"
-	runtime "k8s.io/cri-api/pkg/apis/runtime/v1"

 	"github.com/containerd/containerd/containers"
 	"github.com/containerd/containerd/log"
-	"github.com/containerd/containerd/mount"
 	"github.com/containerd/containerd/oci"
-	osinterface "github.com/containerd/containerd/pkg/os"
 )

-// WithMounts sorts and adds runtime and CRI mounts to the spec
-func WithMounts(osi osinterface.OS, config *runtime.ContainerConfig, extra []*runtime.Mount, mountLabel string) oci.SpecOpts {
-	return func(ctx context.Context, client oci.Client, _ *containers.Container, s *runtimespec.Spec) (err error) {
-		// mergeMounts merge CRI mounts with extra mounts. If a mount destination
-		// is mounted by both a CRI mount and an extra mount, the CRI mount will
-		// be kept.
-		var (
-			criMounts = config.GetMounts()
-			mounts    = append([]*runtime.Mount{}, criMounts...)
-		)
-		// Copy all mounts from extra mounts, except for mounts overridden by CRI.
-		for _, e := range extra {
-			found := false
-			for _, c := range criMounts {
-				if filepath.Clean(e.ContainerPath) == filepath.Clean(c.ContainerPath) {
-					found = true
-					break
-				}
-			}
-			if !found {
-				mounts = append(mounts, e)
-			}
-		}
-
-		// Sort mounts in number of parts. This ensures that high level mounts don't
-		// shadow other mounts.
-		sort.Sort(orderedMounts(mounts))
-
-		// Mount cgroup into the container as readonly, which inherits docker's behavior.
-		s.Mounts = append(s.Mounts, runtimespec.Mount{
-			Source:      "cgroup",
-			Destination: "/sys/fs/cgroup",
-			Type:        "cgroup",
-			Options:     []string{"nosuid", "noexec", "nodev", "relatime", "ro"},
-		})
-
-		// Copy all mounts from default mounts, except for
-		// - mounts overridden by supplied mount;
-		// - all mounts under /dev if a supplied /dev is present.
-		mountSet := make(map[string]struct{})
-		for _, m := range mounts {
-			mountSet[filepath.Clean(m.ContainerPath)] = struct{}{}
-		}
-
-		defaultMounts := s.Mounts
-		s.Mounts = nil
-
-		for _, m := range defaultMounts {
-			dst := filepath.Clean(m.Destination)
-			if _, ok := mountSet[dst]; ok {
-				// filter out mount overridden by a supplied mount
-				continue
-			}
-			if _, mountDev := mountSet["/dev"]; mountDev && strings.HasPrefix(dst, "/dev/") {
-				// filter out everything under /dev if /dev is a supplied mount
-				continue
-			}
-			s.Mounts = append(s.Mounts, m)
-		}
-
-		for _, mount := range mounts {
-			var (
-				dst = mount.GetContainerPath()
-				src = mount.GetHostPath()
-			)
-			// Create the host path if it doesn't exist.
-			// TODO(random-liu): Add CRI validation test for this case.
-			if _, err := osi.Stat(src); err != nil {
-				if !os.IsNotExist(err) {
-					return fmt.Errorf("failed to stat %q: %w", src, err)
-				}
-				if err := osi.MkdirAll(src, 0755); err != nil {
-					return fmt.Errorf("failed to mkdir %q: %w", src, err)
-				}
-			}
-			// TODO(random-liu): Add cri-containerd integration test or cri validation test
-			// for this.
-			src, err := osi.ResolveSymbolicLink(src)
-			if err != nil {
-				return fmt.Errorf("failed to resolve symlink %q: %w", src, err)
-			}
-			if s.Linux == nil {
-				s.Linux = &runtimespec.Linux{}
-			}
-			options := []string{"rbind"}
-			switch mount.GetPropagation() {
-			case runtime.MountPropagation_PROPAGATION_PRIVATE:
-				options = append(options, "rprivate")
-				// Since default root propagation in runc is rprivate ignore
-				// setting the root propagation
-			case runtime.MountPropagation_PROPAGATION_BIDIRECTIONAL:
-				if err := ensureShared(src, osi.LookupMount); err != nil {
-					return err
-				}
-				options = append(options, "rshared")
-				s.Linux.RootfsPropagation = "rshared"
-			case runtime.MountPropagation_PROPAGATION_HOST_TO_CONTAINER:
-				if err := ensureSharedOrSlave(src, osi.LookupMount); err != nil {
-					return err
-				}
-				options = append(options, "rslave")
-				if s.Linux.RootfsPropagation != "rshared" &&
-					s.Linux.RootfsPropagation != "rslave" {
-					s.Linux.RootfsPropagation = "rslave"
-				}
-			default:
-				log.G(ctx).Warnf("Unknown propagation mode for hostPath %q", mount.HostPath)
-				options = append(options, "rprivate")
-			}
-
-			// NOTE(random-liu): we don't change all mounts to `ro` when root filesystem
-			// is readonly. This is different from docker's behavior, but make more sense.
-			if mount.GetReadonly() {
-				options = append(options, "ro")
-			} else {
-				options = append(options, "rw")
-			}
-
-			if mount.GetSelinuxRelabel() {
-				if err := label.Relabel(src, mountLabel, false); err != nil && err != unix.ENOTSUP {
-					return fmt.Errorf("relabel %q with %q failed: %w", src, mountLabel, err)
-				}
-			}
-			s.Mounts = append(s.Mounts, runtimespec.Mount{
-				Source:      src,
-				Destination: dst,
-				Type:        "bind",
-				Options:     options,
-			})
-		}
-		return nil
-	}
-}
-
-// Ensure mount point on which path is mounted, is shared.
-func ensureShared(path string, lookupMount func(string) (mount.Info, error)) error {
-	mountInfo, err := lookupMount(path)
-	if err != nil {
-		return err
-	}
-
-	// Make sure source mount point is shared.
-	optsSplit := strings.Split(mountInfo.Optional, " ")
-	for _, opt := range optsSplit {
-		if strings.HasPrefix(opt, "shared:") {
-			return nil
-		}
-	}
-
-	return fmt.Errorf("path %q is mounted on %q but it is not a shared mount", path, mountInfo.Mountpoint)
-}
-
-// ensure mount point on which path is mounted, is either shared or slave.
-func ensureSharedOrSlave(path string, lookupMount func(string) (mount.Info, error)) error {
-	mountInfo, err := lookupMount(path)
-	if err != nil {
-		return err
-	}
-	// Make sure source mount point is shared.
-	optsSplit := strings.Split(mountInfo.Optional, " ")
-	for _, opt := range optsSplit {
-		if strings.HasPrefix(opt, "shared:") {
-			return nil
-		} else if strings.HasPrefix(opt, "master:") {
-			return nil
-		}
-	}
-	return fmt.Errorf("path %q is mounted on %q but it is not a shared or slave mount", path, mountInfo.Mountpoint)
-}
-
-// getDeviceUserGroupID() is used to find the right uid/gid
-// value for the device node created in the container namespace.
-// The runtime executes mknod() and chmod()s the created
-// device with the values returned here.
-//
-// On Linux, uid and gid are sufficient and the user/groupname do not
-// need to be resolved.
-//
-// TODO(mythi): In case of user namespaces, the runtime simply bind
-// mounts the devices from the host. Additional logic is needed
-// to check that the runtimes effective UID/GID on the host has the
-// permissions to access the device node and/or the right user namespace
-// mappings are created.
-//
-// Ref: https://github.com/kubernetes/kubernetes/issues/92211
-func getDeviceUserGroupID(runAsVal *runtime.Int64Value) uint32 {
-	if runAsVal != nil {
-		return uint32(runAsVal.GetValue())
-	}
-	return 0
-}
-
-// WithDevices sets the provided devices onto the container spec
-func WithDevices(osi osinterface.OS, config *runtime.ContainerConfig, enableDeviceOwnershipFromSecurityContext bool) oci.SpecOpts {
-	return func(ctx context.Context, client oci.Client, c *containers.Container, s *runtimespec.Spec) (err error) {
-		if s.Linux == nil {
-			s.Linux = &runtimespec.Linux{}
-		}
-		if s.Linux.Resources == nil {
-			s.Linux.Resources = &runtimespec.LinuxResources{}
-		}
-
-		oldDevices := len(s.Linux.Devices)
-
-		for _, device := range config.GetDevices() {
-			path, err := osi.ResolveSymbolicLink(device.HostPath)
-			if err != nil {
-				return err
-			}
-
-			o := oci.WithDevices(path, device.ContainerPath, device.Permissions)
-			if err := o(ctx, client, c, s); err != nil {
-				return err
-			}
-		}
-
-		if enableDeviceOwnershipFromSecurityContext {
-			UID := getDeviceUserGroupID(config.GetLinux().GetSecurityContext().GetRunAsUser())
-			GID := getDeviceUserGroupID(config.GetLinux().GetSecurityContext().GetRunAsGroup())
-			// Loop all new devices added by oci.WithDevices() to update their
-			// dev.UID/dev.GID.
-			//
-			// non-zero UID/GID from SecurityContext is used to override host's
-			// device UID/GID for the container.
-			for idx := oldDevices; idx < len(s.Linux.Devices); idx++ {
-				if UID != 0 {
-					*s.Linux.Devices[idx].UID = UID
-				}
-				if GID != 0 {
-					*s.Linux.Devices[idx].GID = GID
-				}
-			}
-		}
-		return nil
-	}
-}
+// Linux dependent OCI spec opts.

 var (
 	swapControllerAvailability     bool
@ -312,88 +70,6 @@ func SwapControllerAvailable() bool {
 	return swapControllerAvailability
 }

-// WithResources sets the provided resource restrictions
-func WithResources(resources *runtime.LinuxContainerResources, tolerateMissingHugetlbController, disableHugetlbController bool) oci.SpecOpts {
-	return func(ctx context.Context, client oci.Client, c *containers.Container, s *runtimespec.Spec) (err error) {
-		if resources == nil {
-			return nil
-		}
-		if s.Linux == nil {
-			s.Linux = &runtimespec.Linux{}
-		}
-		if s.Linux.Resources == nil {
-			s.Linux.Resources = &runtimespec.LinuxResources{}
-		}
-		if s.Linux.Resources.CPU == nil {
-			s.Linux.Resources.CPU = &runtimespec.LinuxCPU{}
-		}
-		if s.Linux.Resources.Memory == nil {
-			s.Linux.Resources.Memory = &runtimespec.LinuxMemory{}
-		}
-		var (
-			p         = uint64(resources.GetCpuPeriod())
-			q         = resources.GetCpuQuota()
-			shares    = uint64(resources.GetCpuShares())
-			limit     = resources.GetMemoryLimitInBytes()
-			swapLimit = resources.GetMemorySwapLimitInBytes()
-			hugepages = resources.GetHugepageLimits()
-		)
-
-		if p != 0 {
-			s.Linux.Resources.CPU.Period = &p
-		}
-		if q != 0 {
-			s.Linux.Resources.CPU.Quota = &q
-		}
-		if shares != 0 {
-			s.Linux.Resources.CPU.Shares = &shares
-		}
-		if cpus := resources.GetCpusetCpus(); cpus != "" {
-			s.Linux.Resources.CPU.Cpus = cpus
-		}
-		if mems := resources.GetCpusetMems(); mems != "" {
-			s.Linux.Resources.CPU.Mems = resources.GetCpusetMems()
-		}
-		if limit != 0 {
-			s.Linux.Resources.Memory.Limit = &limit
-			// swap/memory limit should be equal to prevent container from swapping by default
-			if swapLimit == 0 && SwapControllerAvailable() {
-				s.Linux.Resources.Memory.Swap = &limit
-			}
-		}
-		if swapLimit != 0 {
-			s.Linux.Resources.Memory.Swap = &swapLimit
-		}
-
-		if !disableHugetlbController {
-			if isHugetlbControllerPresent() {
-				for _, limit := range hugepages {
-					s.Linux.Resources.HugepageLimits = append(s.Linux.Resources.HugepageLimits, runtimespec.LinuxHugepageLimit{
-						Pagesize: limit.PageSize,
-						Limit:    limit.Limit,
-					})
-				}
-			} else {
-				if !tolerateMissingHugetlbController {
-					return errors.New("huge pages limits are specified but hugetlb cgroup controller is missing. " +
-						"Please set tolerate_missing_hugetlb_controller to `true` to ignore this error")
-				}
-				logrus.Warn("hugetlb cgroup controller is absent. skipping huge pages limits")
-			}
-		}
-
-		if unified := resources.GetUnified(); unified != nil {
-			if s.Linux.Resources.Unified == nil {
-				s.Linux.Resources.Unified = make(map[string]string)
-			}
-			for k, v := range unified {
-				s.Linux.Resources.Unified[k] = v
-			}
-		}
-		return nil
-	}
-}
-
 var (
 	supportsHugetlbOnce sync.Once
 	supportsHugetlb     bool
@ -463,72 +139,6 @@ func IsCgroup2UnifiedMode() bool {
 	return isUnified
 }

-// WithOOMScoreAdj sets the oom score
-func WithOOMScoreAdj(config *runtime.ContainerConfig, restrict bool) oci.SpecOpts {
-	return func(ctx context.Context, client oci.Client, c *containers.Container, s *runtimespec.Spec) error {
-		if s.Process == nil {
-			s.Process = &runtimespec.Process{}
-		}
-
-		resources := config.GetLinux().GetResources()
-		if resources == nil {
-			return nil
-		}
-		adj := int(resources.GetOomScoreAdj())
-		if restrict {
-			var err error
-			adj, err = restrictOOMScoreAdj(adj)
-			if err != nil {
-				return err
-			}
-		}
-		s.Process.OOMScoreAdj = &adj
-		return nil
-	}
-}
-
-// WithPodOOMScoreAdj sets the oom score for the pod sandbox
-func WithPodOOMScoreAdj(adj int, restrict bool) oci.SpecOpts {
-	return func(ctx context.Context, client oci.Client, c *containers.Container, s *runtimespec.Spec) error {
-		if s.Process == nil {
-			s.Process = &runtimespec.Process{}
-		}
-		if restrict {
-			var err error
-			adj, err = restrictOOMScoreAdj(adj)
-			if err != nil {
-				return err
-			}
-		}
-		s.Process.OOMScoreAdj = &adj
-		return nil
-	}
-}
-
-func getCurrentOOMScoreAdj() (int, error) {
-	b, err := os.ReadFile("/proc/self/oom_score_adj")
-	if err != nil {
-		return 0, fmt.Errorf("could not get the daemon oom_score_adj: %w", err)
-	}
-	s := strings.TrimSpace(string(b))
-	i, err := strconv.Atoi(s)
-	if err != nil {
-		return 0, fmt.Errorf("could not get the daemon oom_score_adj: %w", err)
-	}
-	return i, nil
-}
-
-func restrictOOMScoreAdj(preferredOOMScoreAdj int) (int, error) {
-	currentOOMScoreAdj, err := getCurrentOOMScoreAdj()
-	if err != nil {
-		return preferredOOMScoreAdj, err
-	}
-	if preferredOOMScoreAdj < currentOOMScoreAdj {
-		return currentOOMScoreAdj, nil
-	}
-	return preferredOOMScoreAdj, nil
-}
-
 // WithCDI updates OCI spec with CDI content
 func WithCDI(annotations map[string]string) oci.SpecOpts {
 	return func(ctx context.Context, _ oci.Client, c *containers.Container, s *oci.Spec) error {
--- a/pkg/cri/opts/spec_linux_opts.go
+++ b/pkg/cri/opts/spec_linux_opts.go
@ -0,0 +1,426 @@
+/*
+   Copyright The containerd Authors.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+*/
+
+package opts
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"os"
+	"path/filepath"
+	"sort"
+	"strconv"
+	"strings"
+	"syscall"
+
+	runtimespec "github.com/opencontainers/runtime-spec/specs-go"
+	"github.com/opencontainers/selinux/go-selinux/label"
+	"github.com/sirupsen/logrus"
+	runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
+
+	"github.com/containerd/containerd/containers"
+	"github.com/containerd/containerd/log"
+	"github.com/containerd/containerd/mount"
+	"github.com/containerd/containerd/oci"
+	osinterface "github.com/containerd/containerd/pkg/os"
+)
+
+// WithMounts sorts and adds runtime and CRI mounts to the spec.
+//
+// CRI mounts from config take precedence over extra mounts that target the
+// same (cleaned) container path. Default spec mounts overridden by a supplied
+// mount are dropped, and every supplied mount is appended as a bind mount.
+// Host paths that do not exist are created, and mountLabel is applied to the
+// host path of every mount that requests SELinux relabeling.
+func WithMounts(osi osinterface.OS, config *runtime.ContainerConfig, extra []*runtime.Mount, mountLabel string) oci.SpecOpts {
+	return func(ctx context.Context, client oci.Client, _ *containers.Container, s *runtimespec.Spec) (err error) {
+		// Merge CRI mounts with extra mounts. If a mount destination is
+		// mounted by both a CRI mount and an extra mount, the CRI mount will
+		// be kept.
+		var (
+			criMounts = config.GetMounts()
+			mounts    = append([]*runtime.Mount{}, criMounts...)
+		)
+		// Copy all mounts from extra mounts, except for mounts overridden by CRI.
+		for _, e := range extra {
+			found := false
+			for _, c := range criMounts {
+				if filepath.Clean(e.ContainerPath) == filepath.Clean(c.ContainerPath) {
+					found = true
+					break
+				}
+			}
+			if !found {
+				mounts = append(mounts, e)
+			}
+		}
+
+		// Sort mounts in number of parts. This ensures that high level mounts don't
+		// shadow other mounts.
+		sort.Sort(orderedMounts(mounts))
+
+		// Mount cgroup into the container as readonly, which inherits docker's behavior.
+		s.Mounts = append(s.Mounts, runtimespec.Mount{
+			Source:      "cgroup",
+			Destination: "/sys/fs/cgroup",
+			Type:        "cgroup",
+			Options:     []string{"nosuid", "noexec", "nodev", "relatime", "ro"},
+		})
+
+		// Copy all mounts from default mounts, except for
+		// - mounts overridden by supplied mount;
+		// - all mounts under /dev if a supplied /dev is present.
+		mountSet := make(map[string]struct{})
+		for _, m := range mounts {
+			mountSet[filepath.Clean(m.ContainerPath)] = struct{}{}
+		}
+
+		defaultMounts := s.Mounts
+		s.Mounts = nil
+
+		for _, m := range defaultMounts {
+			dst := filepath.Clean(m.Destination)
+			if _, ok := mountSet[dst]; ok {
+				// filter out mount overridden by a supplied mount
+				continue
+			}
+			if _, mountDev := mountSet["/dev"]; mountDev && strings.HasPrefix(dst, "/dev/") {
+				// filter out everything under /dev if /dev is a supplied mount
+				continue
+			}
+			s.Mounts = append(s.Mounts, m)
+		}
+
+		for _, mount := range mounts {
+			var (
+				dst = mount.GetContainerPath()
+				src = mount.GetHostPath()
+			)
+			// Create the host path if it doesn't exist.
+			// TODO(random-liu): Add CRI validation test for this case.
+			if _, err := osi.Stat(src); err != nil {
+				if !os.IsNotExist(err) {
+					return fmt.Errorf("failed to stat %q: %w", src, err)
+				}
+				if err := osi.MkdirAll(src, 0755); err != nil {
+					return fmt.Errorf("failed to mkdir %q: %w", src, err)
+				}
+			}
+			// TODO(random-liu): Add cri-containerd integration test or cri validation test
+			// for this.
+			//
+			// Resolve into a separate variable so that a failure is reported
+			// against the host path that was looked up, not against the
+			// value returned by the failed call.
+			resolved, err := osi.ResolveSymbolicLink(src)
+			if err != nil {
+				return fmt.Errorf("failed to resolve symlink %q: %w", src, err)
+			}
+			src = resolved
+			if s.Linux == nil {
+				s.Linux = &runtimespec.Linux{}
+			}
+			options := []string{"rbind"}
+			switch mount.GetPropagation() {
+			case runtime.MountPropagation_PROPAGATION_PRIVATE:
+				options = append(options, "rprivate")
+				// Since default root propagation in runc is rprivate ignore
+				// setting the root propagation
+			case runtime.MountPropagation_PROPAGATION_BIDIRECTIONAL:
+				if err := ensureShared(src, osi.LookupMount); err != nil {
+					return err
+				}
+				options = append(options, "rshared")
+				s.Linux.RootfsPropagation = "rshared"
+			case runtime.MountPropagation_PROPAGATION_HOST_TO_CONTAINER:
+				if err := ensureSharedOrSlave(src, osi.LookupMount); err != nil {
+					return err
+				}
+				options = append(options, "rslave")
+				// Never downgrade a root propagation that an earlier mount
+				// already raised to rshared.
+				if s.Linux.RootfsPropagation != "rshared" &&
+					s.Linux.RootfsPropagation != "rslave" {
+					s.Linux.RootfsPropagation = "rslave"
+				}
+			default:
+				log.G(ctx).Warnf("Unknown propagation mode for hostPath %q", mount.HostPath)
+				options = append(options, "rprivate")
+			}
+
+			// NOTE(random-liu): we don't change all mounts to `ro` when root filesystem
+			// is readonly. This is different from docker's behavior, but make more sense.
+			if mount.GetReadonly() {
+				options = append(options, "ro")
+			} else {
+				options = append(options, "rw")
+			}
+
+			if mount.GetSelinuxRelabel() {
+				ENOTSUP := syscall.Errno(0x5f) // Linux specific error code, this branch will not execute on non Linux platforms.
+				// errors.Is also matches ENOTSUP when it arrives wrapped in
+				// another error, which a plain comparison would miss.
+				if err := label.Relabel(src, mountLabel, false); err != nil && !errors.Is(err, ENOTSUP) {
+					return fmt.Errorf("relabel %q with %q failed: %w", src, mountLabel, err)
+				}
+			}
+			s.Mounts = append(s.Mounts, runtimespec.Mount{
+				Source:      src,
+				Destination: dst,
+				Type:        "bind",
+				Options:     options,
+			})
+		}
+		return nil
+	}
+}
+
+// ensureShared verifies that the mount point on which path is mounted is a
+// shared mount, i.e. that one of the space-separated optional fields reported
+// by lookupMount starts with "shared:". Lookup errors are returned unchanged.
+func ensureShared(path string, lookupMount func(string) (mount.Info, error)) error {
+	info, err := lookupMount(path)
+	if err != nil {
+		return err
+	}
+
+	for _, field := range strings.Split(info.Optional, " ") {
+		if strings.HasPrefix(field, "shared:") {
+			// The source mount point is shared.
+			return nil
+		}
+	}
+
+	return fmt.Errorf("path %q is mounted on %q but it is not a shared mount", path, info.Mountpoint)
+}
+
+// ensureSharedOrSlave verifies that the mount point on which path is mounted
+// is either a shared or a slave mount, i.e. that one of the space-separated
+// optional fields reported by lookupMount starts with "shared:" or "master:".
+// Lookup errors are returned unchanged.
+func ensureSharedOrSlave(path string, lookupMount func(string) (mount.Info, error)) error {
+	info, err := lookupMount(path)
+	if err != nil {
+		return err
+	}
+	for _, field := range strings.Split(info.Optional, " ") {
+		// Either marker is sufficient for the source mount point.
+		if strings.HasPrefix(field, "shared:") || strings.HasPrefix(field, "master:") {
+			return nil
+		}
+	}
+	return fmt.Errorf("path %q is mounted on %q but it is not a shared or slave mount", path, info.Mountpoint)
+}
+
+// getDeviceUserGroupID returns the uid/gid value to use for a device node
+// created in the container namespace: the value carried by runAsVal converted
+// to uint32, or 0 when runAsVal is nil.
+//
+// On Linux, uid and gid are sufficient and the user/groupname do not
+// need to be resolved.
+//
+// TODO(mythi): In case of user namespaces, the runtime simply bind
+// mounts the devices from the host. Additional logic is needed
+// to check that the runtimes effective UID/GID on the host has the
+// permissions to access the device node and/or the right user namespace
+// mappings are created.
+//
+// Ref: https://github.com/kubernetes/kubernetes/issues/92211
+func getDeviceUserGroupID(runAsVal *runtime.Int64Value) uint32 {
+	if runAsVal == nil {
+		// Nothing requested: fall back to 0.
+		return 0
+	}
+	return uint32(runAsVal.GetValue())
+}
+
+// WithDevices sets the provided devices onto the container spec.
+//
+// The host path of every CRI device in config is resolved through
+// osi.ResolveSymbolicLink and handed to oci.WithDevices together with the
+// device's container path and permissions. When
+// enableDeviceOwnershipFromSecurityContext is true, non-zero RunAsUser and
+// RunAsGroup values from the Linux security context override the UID/GID of
+// the devices added by this call; devices already present in the spec are
+// left untouched.
+func WithDevices(osi osinterface.OS, config *runtime.ContainerConfig, enableDeviceOwnershipFromSecurityContext bool) oci.SpecOpts {
+	return func(ctx context.Context, client oci.Client, c *containers.Container, s *runtimespec.Spec) (err error) {
+		if s.Linux == nil {
+			s.Linux = &runtimespec.Linux{}
+		}
+		if s.Linux.Resources == nil {
+			s.Linux.Resources = &runtimespec.LinuxResources{}
+		}
+
+		// Remember how many devices existed before this call so that only
+		// the newly added ones get their ownership rewritten below.
+		oldDevices := len(s.Linux.Devices)
+
+		for _, device := range config.GetDevices() {
+			path, err := osi.ResolveSymbolicLink(device.HostPath)
+			if err != nil {
+				return fmt.Errorf("failed to resolve symlink %q: %w", device.HostPath, err)
+			}
+
+			o := oci.WithDevices(path, device.ContainerPath, device.Permissions)
+			if err := o(ctx, client, c, s); err != nil {
+				return err
+			}
+		}
+
+		if !enableDeviceOwnershipFromSecurityContext {
+			return nil
+		}
+
+		securityContext := config.GetLinux().GetSecurityContext()
+		uid := getDeviceUserGroupID(securityContext.GetRunAsUser())
+		gid := getDeviceUserGroupID(securityContext.GetRunAsGroup())
+		// Loop all new devices added by oci.WithDevices() to update their
+		// dev.UID/dev.GID.
+		//
+		// non-zero UID/GID from SecurityContext is used to override host's
+		// device UID/GID for the container.
+		for idx := oldDevices; idx < len(s.Linux.Devices); idx++ {
+			if uid != 0 {
+				// Allocate the field when it is unset instead of
+				// dereferencing a nil pointer.
+				if s.Linux.Devices[idx].UID == nil {
+					s.Linux.Devices[idx].UID = new(uint32)
+				}
+				*s.Linux.Devices[idx].UID = uid
+			}
+			if gid != 0 {
+				if s.Linux.Devices[idx].GID == nil {
+					s.Linux.Devices[idx].GID = new(uint32)
+				}
+				*s.Linux.Devices[idx].GID = gid
+			}
+		}
+		return nil
+	}
+}
+
+// WithResources sets the provided resource restrictions.
+//
+// A nil resources value leaves the spec untouched. Zero-valued CPU and memory
+// settings and empty cpuset strings are skipped, so existing spec values are
+// kept for them. Huge page limits are appended unless
+// disableHugetlbController is set; when the hugetlb controller is absent the
+// call fails unless tolerateMissingHugetlbController is set, in which case a
+// warning is logged instead. Entries of the unified map are copied into the
+// spec, overwriting existing keys.
+func WithResources(resources *runtime.LinuxContainerResources, tolerateMissingHugetlbController, disableHugetlbController bool) oci.SpecOpts {
+	return func(ctx context.Context, client oci.Client, c *containers.Container, s *runtimespec.Spec) (err error) {
+		if resources == nil {
+			return nil
+		}
+		if s.Linux == nil {
+			s.Linux = &runtimespec.Linux{}
+		}
+		if s.Linux.Resources == nil {
+			s.Linux.Resources = &runtimespec.LinuxResources{}
+		}
+		if s.Linux.Resources.CPU == nil {
+			s.Linux.Resources.CPU = &runtimespec.LinuxCPU{}
+		}
+		if s.Linux.Resources.Memory == nil {
+			s.Linux.Resources.Memory = &runtimespec.LinuxMemory{}
+		}
+		var (
+			p         = uint64(resources.GetCpuPeriod())
+			q         = resources.GetCpuQuota()
+			shares    = uint64(resources.GetCpuShares())
+			limit     = resources.GetMemoryLimitInBytes()
+			swapLimit = resources.GetMemorySwapLimitInBytes()
+			hugepages = resources.GetHugepageLimits()
+		)
+
+		if p != 0 {
+			s.Linux.Resources.CPU.Period = &p
+		}
+		if q != 0 {
+			s.Linux.Resources.CPU.Quota = &q
+		}
+		if shares != 0 {
+			s.Linux.Resources.CPU.Shares = &shares
+		}
+		if cpus := resources.GetCpusetCpus(); cpus != "" {
+			s.Linux.Resources.CPU.Cpus = cpus
+		}
+		if mems := resources.GetCpusetMems(); mems != "" {
+			s.Linux.Resources.CPU.Mems = mems
+		}
+		if limit != 0 {
+			s.Linux.Resources.Memory.Limit = &limit
+			// swap/memory limit should be equal to prevent container from swapping by default
+			if swapLimit == 0 && SwapControllerAvailable() {
+				s.Linux.Resources.Memory.Swap = &limit
+			}
+		}
+		if swapLimit != 0 {
+			s.Linux.Resources.Memory.Swap = &swapLimit
+		}
+
+		if !disableHugetlbController {
+			if isHugetlbControllerPresent() {
+				for _, hp := range hugepages {
+					s.Linux.Resources.HugepageLimits = append(s.Linux.Resources.HugepageLimits, runtimespec.LinuxHugepageLimit{
+						Pagesize: hp.PageSize,
+						Limit:    hp.Limit,
+					})
+				}
+			} else {
+				// NOTE(review): this branch is taken whenever the controller
+				// is missing, including when hugepages is empty — confirm
+				// that rejecting such requests is intended.
+				if !tolerateMissingHugetlbController {
+					return errors.New("huge pages limits are specified but hugetlb cgroup controller is missing. " +
+						"Please set tolerate_missing_hugetlb_controller to `true` to ignore this error")
+				}
+				logrus.Warn("hugetlb cgroup controller is absent. skipping huge pages limits")
+			}
+		}
+
+		if unified := resources.GetUnified(); unified != nil {
+			if s.Linux.Resources.Unified == nil {
+				s.Linux.Resources.Unified = make(map[string]string)
+			}
+			for k, v := range unified {
+				s.Linux.Resources.Unified[k] = v
+			}
+		}
+		return nil
+	}
+}
+
+// WithOOMScoreAdj sets the oom score
+func WithOOMScoreAdj(config *runtime.ContainerConfig, restrict bool) oci.SpecOpts {
+	return func(ctx context.Context, client oci.Client, c *containers.Container, s *runtimespec.Spec) error {
+		if s.Process == nil {
+			s.Process = &runtimespec.Process{}
+		}
+
+		resources := config.GetLinux().GetResources()
+		if resources == nil {
+			return nil
+		}
+		adj := int(resources.GetOomScoreAdj())
+		if restrict {
+			var err error
+			adj, err = restrictOOMScoreAdj(adj)
+			if err != nil {
+				return err
+			}
+		}
+		s.Process.OOMScoreAdj = &adj
+		return nil
+	}
+}
+
+// WithPodOOMScoreAdj sets the oom score for the pod sandbox
+func WithPodOOMScoreAdj(adj int, restrict bool) oci.SpecOpts {
+	return func(ctx context.Context, client oci.Client, c *containers.Container, s *runtimespec.Spec) error {
+		if s.Process == nil {
+			s.Process = &runtimespec.Process{}
+		}
+		if restrict {
+			var err error
+			adj, err = restrictOOMScoreAdj(adj)
+			if err != nil {
+				return err
+			}
+		}
+		s.Process.OOMScoreAdj = &adj
+		return nil
+	}
+}
+
+// getCurrentOOMScoreAdj reads /proc/self/oom_score_adj and returns its
+// content, with surrounding whitespace trimmed, parsed as an int. Read and
+// parse failures are both reported as wrapped errors with a zero value.
+func getCurrentOOMScoreAdj() (int, error) {
+	raw, err := os.ReadFile("/proc/self/oom_score_adj")
+	if err != nil {
+		return 0, fmt.Errorf("could not get the daemon oom_score_adj: %w", err)
+	}
+	score, err := strconv.Atoi(strings.TrimSpace(string(raw)))
+	if err != nil {
+		return 0, fmt.Errorf("could not get the daemon oom_score_adj: %w", err)
+	}
+	return score, nil
+}
+
+// restrictOOMScoreAdj returns the larger of preferredOOMScoreAdj and the
+// value reported by getCurrentOOMScoreAdj. When the current value cannot be
+// read, the preferred value is returned together with the error.
+func restrictOOMScoreAdj(preferredOOMScoreAdj int) (int, error) {
+	daemonScore, err := getCurrentOOMScoreAdj()
+	switch {
+	case err != nil:
+		return preferredOOMScoreAdj, err
+	case preferredOOMScoreAdj < daemonScore:
+		// Never go below the current score.
+		return daemonScore, nil
+	default:
+		return preferredOOMScoreAdj, nil
+	}
+}
--- a/pkg/cri/opts/spec_nonlinux.go
+++ b/pkg/cri/opts/spec_nonlinux.go
@ -0,0 +1,41 @@
+//go:build !linux
+
+/*
+   Copyright The containerd Authors.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+*/
+
+package opts
+
+import (
+	"context"
+
+	"github.com/containerd/containerd/containers"
+	"github.com/containerd/containerd/oci"
+)
+
+// isHugetlbControllerPresent always reports false in this non-Linux build.
+func isHugetlbControllerPresent() bool {
+	return false
+}
+
+// SwapControllerAvailable always reports false in this non-Linux build.
+func SwapControllerAvailable() bool {
+	return false
+}
+
+// WithCDI is a no-op on non Linux platforms: the returned SpecOpts ignores
+// the annotations and leaves the spec untouched.
+func WithCDI(_ map[string]string) oci.SpecOpts {
+	return func(_ context.Context, _ oci.Client, _ *containers.Container, _ *oci.Spec) error {
+		return nil
+	}
+}
--- a/pkg/cri/opts/spec_opts.go
+++ b/pkg/cri/opts/spec_opts.go
--- a/pkg/cri/opts/spec_opts_test.go
+++ b/pkg/cri/opts/spec_opts_test.go
--- a/pkg/cri/opts/spec_windows_opts.go
+++ b/pkg/cri/opts/spec_windows_opts.go
@ -24,11 +24,11 @@ import (
 	"sort"
 	"strings"

-	"github.com/containerd/containerd/containers"
-	"github.com/containerd/containerd/oci"
 	runtimespec "github.com/opencontainers/runtime-spec/specs-go"
 	runtime "k8s.io/cri-api/pkg/apis/runtime/v1"

+	"github.com/containerd/containerd/containers"
+	"github.com/containerd/containerd/oci"
 	osinterface "github.com/containerd/containerd/pkg/os"
 )

@ -229,8 +229,8 @@ func WithWindowsCredentialSpec(credentialSpec string) oci.SpecOpts {
 	}
 }

-// WithDevices sets the provided devices onto the container spec
-func WithDevices(config *runtime.ContainerConfig) oci.SpecOpts {
+// WithWindowsDevices sets the provided devices onto the container spec
+func WithWindowsDevices(config *runtime.ContainerConfig) oci.SpecOpts {
 	return func(ctx context.Context, client oci.Client, c *containers.Container, s *runtimespec.Spec) (err error) {
 		for _, device := range config.GetDevices() {
 			if device.ContainerPath != "" {
--- a/pkg/cri/opts/spec_windows_test.go
+++ b/pkg/cri/opts/spec_windows_test.go
@ -22,14 +22,15 @@ import (
 	"strings"
 	"testing"

-	"github.com/containerd/containerd/containers"
-	"github.com/containerd/containerd/namespaces"
-	"github.com/containerd/containerd/oci"
-	osinterface "github.com/containerd/containerd/pkg/os"
 	"github.com/opencontainers/runtime-spec/specs-go"
 	"github.com/stretchr/testify/assert"
 	"github.com/stretchr/testify/require"
 	runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
+
+	"github.com/containerd/containerd/containers"
+	"github.com/containerd/containerd/namespaces"
+	"github.com/containerd/containerd/oci"
+	osinterface "github.com/containerd/containerd/pkg/os"
 )

 func TestWithDevices(t *testing.T) {
@ -183,7 +184,7 @@ func TestWithDevices(t *testing.T) {
 			config := runtime.ContainerConfig{}
 			config.Devices = tc.devices

-			specOpts := []oci.SpecOpts{WithDevices(&config)}
+			specOpts := []oci.SpecOpts{WithWindowsDevices(&config)}

 			platform := "windows"
 			if tc.isLCOW {
--- a/pkg/cri/sbserver/blockio_stub_linux.go
+++ b/pkg/cri/sbserver/blockio_stub_linux.go
--- a/pkg/cri/sbserver/container_create.go
+++ b/pkg/cri/sbserver/container_create.go
@ -24,6 +24,14 @@ import (
 	"strconv"
 	"time"

+	"github.com/containerd/typeurl"
+	"github.com/davecgh/go-spew/spew"
+	imagespec "github.com/opencontainers/image-spec/specs-go/v1"
+	runtimespec "github.com/opencontainers/runtime-spec/specs-go"
+	"github.com/opencontainers/selinux/go-selinux"
+	"github.com/opencontainers/selinux/go-selinux/label"
+	runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
+
 	"github.com/containerd/containerd"
 	"github.com/containerd/containerd/api/types"
 	"github.com/containerd/containerd/containers"
@ -37,12 +45,6 @@ import (
 	containerstore "github.com/containerd/containerd/pkg/cri/store/container"
 	"github.com/containerd/containerd/pkg/cri/util"
 	ctrdutil "github.com/containerd/containerd/pkg/cri/util"
-	"github.com/containerd/typeurl"
-	"github.com/davecgh/go-spew/spew"
-	imagespec "github.com/opencontainers/image-spec/specs-go/v1"
-	runtimespec "github.com/opencontainers/runtime-spec/specs-go"
-	"github.com/opencontainers/selinux/go-selinux"
-	runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
 )

 func init() {
@ -419,44 +421,317 @@ func (c *criService) buildContainerSpec(
 	ociRuntime config.Runtime,
 ) (_ *runtimespec.Spec, retErr error) {
 	var (
-		specOpts []oci.SpecOpts
-
-		// Platform helpers
 		isLinux   = platform.OS == "linux"
 		isWindows = platform.OS == "windows"
+		isDarwin  = platform.OS == "darwin"
 	)

-	if isLinux {
-		specOpts = append(specOpts, oci.WithoutRunMount)
+	switch {
+	case isLinux:
+		return c.buildLinuxSpec(
+			id,
+			sandboxID,
+			sandboxPid,
+			netNSPath,
+			containerName,
+			imageName,
+			config,
+			sandboxConfig,
+			imageConfig,
+			extraMounts,
+			ociRuntime,
+		)
+	case isWindows:
+		return c.buildWindowsSpec(
+			id,
+			sandboxID,
+			sandboxPid,
+			netNSPath,
+			containerName,
+			imageName,
+			config,
+			sandboxConfig,
+			imageConfig,
+			extraMounts,
+			ociRuntime,
+		)
+	case isDarwin:
+		return c.buildDarwinSpec(
+			id,
+			sandboxID,
+			containerName,
+			imageName,
+			config,
+			sandboxConfig,
+			imageConfig,
+			extraMounts,
+			ociRuntime,
+		)
+	default:
+		return nil, fmt.Errorf("unsupported spec platform: %s", platform.OS)
+	}
+}

-		// Only clear the default security settings if the runtime does not have a custom
-		// base runtime spec. Admins can use this functionality to define
-		// default ulimits, seccomp, or other default settings.
-		if ociRuntime.BaseRuntimeSpec == "" {
-			specOpts = append(specOpts, customopts.WithoutDefaultSecuritySettings)
+func (c *criService) buildLinuxSpec(
+	id string,
+	sandboxID string,
+	sandboxPid uint32,
+	netNSPath string,
+	containerName string,
+	imageName string,
+	config *runtime.ContainerConfig,
+	sandboxConfig *runtime.PodSandboxConfig,
+	imageConfig *imagespec.ImageConfig,
+	extraMounts []*runtime.Mount,
+	ociRuntime config.Runtime,
+) (_ *runtimespec.Spec, retErr error) {
+	specOpts := []oci.SpecOpts{
+		oci.WithoutRunMount,
+	}
+	// only clear the default security settings if the runtime does not have a custom
+	// base runtime spec. Admins can use this functionality to define
+	// default ulimits, seccomp, or other default settings.
+	if ociRuntime.BaseRuntimeSpec == "" {
+		specOpts = append(specOpts, customopts.WithoutDefaultSecuritySettings)
+	}
+
+	specOpts = append(specOpts,
+		customopts.WithRelativeRoot(relativeRootfsPath),
+		customopts.WithProcessArgs(config, imageConfig),
+		oci.WithDefaultPathEnv,
+		// this will be set based on the security context below
+		oci.WithNewPrivileges,
+	)
+
+	if config.GetWorkingDir() != "" {
+		specOpts = append(specOpts, oci.WithProcessCwd(config.GetWorkingDir()))
+	} else if imageConfig.WorkingDir != "" {
+		specOpts = append(specOpts, oci.WithProcessCwd(imageConfig.WorkingDir))
+	}
+
+	if config.GetTty() {
+		specOpts = append(specOpts, oci.WithTTY)
+	}
+
+	// Add HOSTNAME env.
+	var (
+		err      error
+		hostname = sandboxConfig.GetHostname()
+	)
+	if hostname == "" {
+		if hostname, err = c.os.Hostname(); err != nil {
+			return nil, err
 		}
+	}
+	specOpts = append(specOpts, oci.WithEnv([]string{hostnameEnv + "=" + hostname}))

-		specOpts = append(specOpts,
-			customopts.WithRelativeRoot(relativeRootfsPath),
-			oci.WithDefaultPathEnv,
-			// this will be set based on the security context below
-			oci.WithNewPrivileges,
-		)
+	// Apply envs from image config first, so that envs from container config
+	// can override them.
+	env := append([]string{}, imageConfig.Env...)
+	for _, e := range config.GetEnvs() {
+		env = append(env, e.GetKey()+"="+e.GetValue())
+	}
+	specOpts = append(specOpts, oci.WithEnv(env))

-		// Add HOSTNAME env.
-		var (
-			err      error
-			hostname = sandboxConfig.GetHostname()
-		)
-		if hostname == "" {
-			if hostname, err = c.os.Hostname(); err != nil {
+	securityContext := config.GetLinux().GetSecurityContext()
+	labelOptions, err := toLabel(securityContext.GetSelinuxOptions())
+	if err != nil {
+		return nil, err
+	}
+	if len(labelOptions) == 0 {
+		// Use pod level SELinux config
+		if sandbox, err := c.sandboxStore.Get(sandboxID); err == nil {
+			labelOptions, err = selinux.DupSecOpt(sandbox.ProcessLabel)
+			if err != nil {
 				return nil, err
 			}
 		}
-		specOpts = append(specOpts, oci.WithEnv([]string{hostnameEnv + "=" + hostname}))
 	}

-	specOpts = append(specOpts, customopts.WithProcessArgs(config, imageConfig))
+	processLabel, mountLabel, err := label.InitLabels(labelOptions)
+	if err != nil {
+		return nil, fmt.Errorf("failed to init selinux options %+v: %w", securityContext.GetSelinuxOptions(), err)
+	}
+	defer func() {
+		if retErr != nil {
+			selinux.ReleaseLabel(processLabel)
+		}
+	}()
+
+	specOpts = append(specOpts, customopts.WithMounts(c.os, config, extraMounts, mountLabel))
+
+	if !c.config.DisableProcMount {
+		// Change the default masked/readonly paths to empty slices
+		// See https://github.com/containerd/containerd/issues/5029
+		// TODO: Provide an option to set default paths to the ones in oci.populateDefaultUnixSpec()
+		specOpts = append(specOpts, oci.WithMaskedPaths([]string{}), oci.WithReadonlyPaths([]string{}))
+
+		// Apply masked paths if specified.
+		// If the container is privileged, this will be cleared later on.
+		if maskedPaths := securityContext.GetMaskedPaths(); maskedPaths != nil {
+			specOpts = append(specOpts, oci.WithMaskedPaths(maskedPaths))
+		}
+
+		// Apply readonly paths if specified.
+		// If the container is privileged, this will be cleared later on.
+		if readonlyPaths := securityContext.GetReadonlyPaths(); readonlyPaths != nil {
+			specOpts = append(specOpts, oci.WithReadonlyPaths(readonlyPaths))
+		}
+	}
+
+	specOpts = append(specOpts, customopts.WithDevices(c.os, config, c.config.DeviceOwnershipFromSecurityContext),
+		customopts.WithCapabilities(securityContext, c.allCaps))
+
+	if securityContext.GetPrivileged() {
+		if !sandboxConfig.GetLinux().GetSecurityContext().GetPrivileged() {
+			return nil, errors.New("no privileged container allowed in sandbox")
+		}
+		specOpts = append(specOpts, oci.WithPrivileged)
+		if !ociRuntime.PrivilegedWithoutHostDevices {
+			specOpts = append(specOpts, oci.WithHostDevices, oci.WithAllDevicesAllowed)
+		} else if ociRuntime.PrivilegedWithoutHostDevicesAllDevicesAllowed {
+			// allow rwm on all devices for the container
+			specOpts = append(specOpts, oci.WithAllDevicesAllowed)
+		}
+	}
+
+	// Clear all ambient capabilities. The implication of non-root + caps
+	// is not clearly defined in Kubernetes.
+	// See https://github.com/kubernetes/kubernetes/issues/56374
+	// Keep docker's behavior for now.
+	specOpts = append(specOpts,
+		customopts.WithoutAmbientCaps,
+		customopts.WithSelinuxLabels(processLabel, mountLabel),
+	)
+
+	// TODO: Figure out whether we should set no new privilege for sandbox container by default
+	if securityContext.GetNoNewPrivs() {
+		specOpts = append(specOpts, oci.WithNoNewPrivileges)
+	}
+	// TODO(random-liu): [P1] Set selinux options (privileged or not).
+	if securityContext.GetReadonlyRootfs() {
+		specOpts = append(specOpts, oci.WithRootFSReadonly())
+	}
+
+	if c.config.DisableCgroup {
+		specOpts = append(specOpts, customopts.WithDisabledCgroups)
+	} else {
+		specOpts = append(specOpts, customopts.WithResources(config.GetLinux().GetResources(), c.config.TolerateMissingHugetlbController, c.config.DisableHugetlbController))
+		if sandboxConfig.GetLinux().GetCgroupParent() != "" {
+			cgroupsPath := getCgroupsPath(sandboxConfig.GetLinux().GetCgroupParent(), id)
+			specOpts = append(specOpts, oci.WithCgroup(cgroupsPath))
+		}
+	}
+
+	supplementalGroups := securityContext.GetSupplementalGroups()
+
+	// Get blockio class
+	blockIOClass, err := c.blockIOClassFromAnnotations(config.GetMetadata().GetName(), config.Annotations, sandboxConfig.Annotations)
+	if err != nil {
+		return nil, fmt.Errorf("failed to set blockio class: %w", err)
+	}
+	if blockIOClass != "" {
+		if linuxBlockIO, err := blockIOToLinuxOci(blockIOClass); err == nil {
+			specOpts = append(specOpts, oci.WithBlockIO(linuxBlockIO))
+		} else {
+			return nil, err
+		}
+	}
+
+	// Get RDT class
+	rdtClass, err := c.rdtClassFromAnnotations(config.GetMetadata().GetName(), config.Annotations, sandboxConfig.Annotations)
+	if err != nil {
+		return nil, fmt.Errorf("failed to set RDT class: %w", err)
+	}
+	if rdtClass != "" {
+		specOpts = append(specOpts, oci.WithRdt(rdtClass, "", ""))
+	}
+
+	for pKey, pValue := range getPassthroughAnnotations(sandboxConfig.Annotations,
+		ociRuntime.PodAnnotations) {
+		specOpts = append(specOpts, customopts.WithAnnotation(pKey, pValue))
+	}
+
+	for pKey, pValue := range getPassthroughAnnotations(config.Annotations,
+		ociRuntime.ContainerAnnotations) {
+		specOpts = append(specOpts, customopts.WithAnnotation(pKey, pValue))
+	}
+
+	// Default target PID namespace is the sandbox PID.
+	targetPid := sandboxPid
+	// If the container targets another container's PID namespace,
+	// set targetPid to the PID of that container.
+	nsOpts := securityContext.GetNamespaceOptions()
+	if nsOpts.GetPid() == runtime.NamespaceMode_TARGET {
+		targetContainer, err := c.validateTargetContainer(sandboxID, nsOpts.TargetId)
+		if err != nil {
+			return nil, fmt.Errorf("invalid target container: %w", err)
+		}
+
+		status := targetContainer.Status.Get()
+		targetPid = status.Pid
+	}
+
+	uids, gids, err := parseUsernsIDs(nsOpts.GetUsernsOptions())
+	if err != nil {
+		return nil, fmt.Errorf("user namespace configuration: %w", err)
+	}
+
+	// Check sandbox userns config is consistent with container config.
+	sandboxUsernsOpts := sandboxConfig.GetLinux().GetSecurityContext().GetNamespaceOptions().GetUsernsOptions()
+	if !sameUsernsConfig(sandboxUsernsOpts, nsOpts.GetUsernsOptions()) {
+		return nil, fmt.Errorf("user namespace config for sandbox is different from container. Sandbox userns config: %v - Container userns config: %v", sandboxUsernsOpts, nsOpts.GetUsernsOptions())
+	}
+
+	specOpts = append(specOpts,
+		customopts.WithOOMScoreAdj(config, c.config.RestrictOOMScoreAdj),
+		customopts.WithPodNamespaces(securityContext, sandboxPid, targetPid, uids, gids),
+		customopts.WithSupplementalGroups(supplementalGroups),
+		customopts.WithAnnotation(annotations.ContainerType, annotations.ContainerTypeContainer),
+		customopts.WithAnnotation(annotations.SandboxID, sandboxID),
+		customopts.WithAnnotation(annotations.SandboxNamespace, sandboxConfig.GetMetadata().GetNamespace()),
+		customopts.WithAnnotation(annotations.SandboxUID, sandboxConfig.GetMetadata().GetUid()),
+		customopts.WithAnnotation(annotations.SandboxName, sandboxConfig.GetMetadata().GetName()),
+		customopts.WithAnnotation(annotations.ContainerName, containerName),
+		customopts.WithAnnotation(annotations.ImageName, imageName),
+	)
+
+	// cgroupns is used for hiding /sys/fs/cgroup from containers.
+	// For compatibility, cgroupns is not used when running in cgroup v1 mode or in privileged.
+	// https://github.com/containers/libpod/issues/4363
+	// https://github.com/kubernetes/enhancements/blob/0e409b47497e398b369c281074485c8de129694f/keps/sig-node/20191118-cgroups-v2.md#cgroup-namespace
+	if isUnifiedCgroupsMode() && !securityContext.GetPrivileged() {
+		specOpts = append(specOpts, oci.WithLinuxNamespace(runtimespec.LinuxNamespace{Type: runtimespec.CgroupNamespace}))
+	}
+
+	return c.runtimeSpec(id, ociRuntime.BaseRuntimeSpec, specOpts...)
+}
+
+func (c *criService) buildWindowsSpec(
+	id string,
+	sandboxID string,
+	sandboxPid uint32,
+	netNSPath string,
+	containerName string,
+	imageName string,
+	config *runtime.ContainerConfig,
+	sandboxConfig *runtime.PodSandboxConfig,
+	imageConfig *imagespec.ImageConfig,
+	extraMounts []*runtime.Mount,
+	ociRuntime config.Runtime,
+) (_ *runtimespec.Spec, retErr error) {
+	specOpts := []oci.SpecOpts{
+		customopts.WithProcessArgs(config, imageConfig),
+	}
+
+	// All containers in a pod need to have HostProcess set if it was set on the pod,
+	// and vice versa no containers in the pod can be HostProcess if the pod's spec
+	// didn't have the field set. The only case that is valid is if these are the same value.
+	cntrHpc := config.GetWindows().GetSecurityContext().GetHostProcess()
+	sandboxHpc := sandboxConfig.GetWindows().GetSecurityContext().GetHostProcess()
+	if cntrHpc != sandboxHpc {
+		return nil, errors.New("pod spec and all containers inside must have the HostProcess field set to be valid")
+	}

 	if config.GetWorkingDir() != "" {
 		specOpts = append(specOpts, oci.WithProcessCwd(config.GetWorkingDir()))
@ -476,116 +751,98 @@ func (c *criService) buildContainerSpec(
 	}
 	specOpts = append(specOpts, oci.WithEnv(env))

-	if isWindows {
-		specOpts = append(specOpts,
-			// Clear the root location since hcsshim expects it.
-			// NOTE: readonly rootfs doesn't work on windows.
-			customopts.WithoutRoot,
-			oci.WithWindowsNetworkNamespace(netNSPath),
-			oci.WithHostname(sandboxConfig.GetHostname()),
-		)
-
-		// All containers in a pod need to have HostProcess set if it was set on the pod,
-		// and vice versa no containers in the pod can be HostProcess if the pods spec
-		// didn't have the field set. The only case that is valid is if these are the same value.
-		cntrHpc := config.GetWindows().GetSecurityContext().GetHostProcess()
-		sandboxHpc := sandboxConfig.GetWindows().GetSecurityContext().GetHostProcess()
-		if cntrHpc != sandboxHpc {
-			return nil, errors.New("pod spec and all containers inside must have the HostProcess field set to be valid")
-		}
-
-		specOpts = append(specOpts, customopts.WithAnnotation(annotations.WindowsHostProcess, strconv.FormatBool(sandboxHpc)))
-	}
-
-	// Get spec opts that depend on features offered by the platform containerd daemon is running on.
-	platformSpecOpts, err := c.platformSpec(
-		id,
-		sandboxID,
-		config,
-		sandboxConfig,
-		imageConfig,
-		extraMounts,
+	specOpts = append(specOpts,
+		// Clear the root location since hcsshim expects it.
+		// NOTE: readonly rootfs doesn't work on windows.
+		customopts.WithoutRoot,
+		oci.WithWindowsNetworkNamespace(netNSPath),
+		oci.WithHostname(sandboxConfig.GetHostname()),
 	)
-	if err != nil {
-		return nil, err
+
+	specOpts = append(specOpts, customopts.WithWindowsMounts(c.os, config, extraMounts), customopts.WithWindowsDevices(config))
+
+	// Start with the image config user and override below if RunAsUsername is not "".
+	username := imageConfig.User
+
+	windowsConfig := config.GetWindows()
+	if windowsConfig != nil {
+		specOpts = append(specOpts, customopts.WithWindowsResources(windowsConfig.GetResources()))
+		securityCtx := windowsConfig.GetSecurityContext()
+		if securityCtx != nil {
+			runAsUser := securityCtx.GetRunAsUsername()
+			if runAsUser != "" {
+				username = runAsUser
+			}
+			cs := securityCtx.GetCredentialSpec()
+			if cs != "" {
+				specOpts = append(specOpts, customopts.WithWindowsCredentialSpec(cs))
+			}
+		}
 	}

-	specOpts = append(specOpts, platformSpecOpts...)
+	// There really isn't a good Windows way to verify that the username is available in the
+	// image as early as here like there is for Linux. Later on in the stack hcsshim
+	// will handle the behavior of erroring out if the user isn't available in the image
+	// when trying to run the init process.
+	specOpts = append(specOpts, oci.WithUser(username))

-	if isLinux {
-		securityContext := config.GetLinux().GetSecurityContext()
-
-		if !c.config.DisableProcMount {
-			// Change the default masked/readonly paths to empty slices
-			// See https://github.com/containerd/containerd/issues/5029
-			// TODO: Provide an option to set default paths to the ones in oci.populateDefaultUnixSpec()
-			specOpts = append(specOpts, oci.WithMaskedPaths([]string{}), oci.WithReadonlyPaths([]string{}))
-
-			// Apply masked paths if specified.
-			// If the container is privileged, this will be cleared later on.
-			if maskedPaths := securityContext.GetMaskedPaths(); maskedPaths != nil {
-				specOpts = append(specOpts, oci.WithMaskedPaths(maskedPaths))
-			}
-
-			// Apply readonly paths if specified.
-			// If the container is privileged, this will be cleared later on.
-			if readonlyPaths := securityContext.GetReadonlyPaths(); readonlyPaths != nil {
-				specOpts = append(specOpts, oci.WithReadonlyPaths(readonlyPaths))
-			}
-		}
-
-		if securityContext.GetPrivileged() {
-			if !sandboxConfig.GetLinux().GetSecurityContext().GetPrivileged() {
-				return nil, errors.New("no privileged container allowed in sandbox")
-			}
-			specOpts = append(specOpts, oci.WithPrivileged)
-			if !ociRuntime.PrivilegedWithoutHostDevices {
-				specOpts = append(specOpts, oci.WithHostDevices, oci.WithAllDevicesAllowed)
-			} else if ociRuntime.PrivilegedWithoutHostDevicesAllDevicesAllowed {
-				// allow rwm on all devices for the container
-				specOpts = append(specOpts, oci.WithAllDevicesAllowed)
-			}
-		}
-
-		// Clear all ambient capabilities. The implication of non-root + caps
-		// is not clearly defined in Kubernetes.
-		// See https://github.com/kubernetes/kubernetes/issues/56374
-		// Keep docker's behavior for now.
-		specOpts = append(specOpts, customopts.WithoutAmbientCaps)
-
-		// TODO: Figure out whether we should set no new privilege for sandbox container by default
-		if securityContext.GetNoNewPrivs() {
-			specOpts = append(specOpts, oci.WithNoNewPrivileges)
-		}
-		// TODO(random-liu): [P1] Set selinux options (privileged or not).
-		if securityContext.GetReadonlyRootfs() {
-			specOpts = append(specOpts, oci.WithRootFSReadonly())
-		}
-
-		supplementalGroups := securityContext.GetSupplementalGroups()
-		specOpts = append(specOpts, customopts.WithSupplementalGroups(supplementalGroups))
-
-		// Default target PID namespace is the sandbox PID.
-		targetPid := sandboxPid
-		// If the container targets another container's PID namespace,
-		// set targetPid to the PID of that container.
-		nsOpts := securityContext.GetNamespaceOptions()
-		if nsOpts.GetPid() == runtime.NamespaceMode_TARGET {
-			targetContainer, err := c.validateTargetContainer(sandboxID, nsOpts.TargetId)
-			if err != nil {
-				return nil, fmt.Errorf("invalid target container: %w", err)
-			}
-
-			status := targetContainer.Status.Get()
-			targetPid = status.Pid
-		}
-
-		specOpts = append(specOpts,
-			// TODO: This is a hack to make this compile. We should move userns support to sbserver.
-			customopts.WithPodNamespaces(securityContext, sandboxPid, targetPid, nil, nil),
-		)
+	for pKey, pValue := range getPassthroughAnnotations(sandboxConfig.Annotations,
+		ociRuntime.PodAnnotations) {
+		specOpts = append(specOpts, customopts.WithAnnotation(pKey, pValue))
 	}

+	for pKey, pValue := range getPassthroughAnnotations(config.Annotations,
+		ociRuntime.ContainerAnnotations) {
+		specOpts = append(specOpts, customopts.WithAnnotation(pKey, pValue))
+	}
+
+	specOpts = append(specOpts,
+		customopts.WithAnnotation(annotations.ContainerType, annotations.ContainerTypeContainer),
+		customopts.WithAnnotation(annotations.SandboxID, sandboxID),
+		customopts.WithAnnotation(annotations.SandboxNamespace, sandboxConfig.GetMetadata().GetNamespace()),
+		customopts.WithAnnotation(annotations.SandboxUID, sandboxConfig.GetMetadata().GetUid()),
+		customopts.WithAnnotation(annotations.SandboxName, sandboxConfig.GetMetadata().GetName()),
+		customopts.WithAnnotation(annotations.ContainerName, containerName),
+		customopts.WithAnnotation(annotations.ImageName, imageName),
+		customopts.WithAnnotation(annotations.WindowsHostProcess, strconv.FormatBool(sandboxHpc)),
+	)
+
+	return c.runtimeSpec(id, ociRuntime.BaseRuntimeSpec, specOpts...)
+}
+
+func (c *criService) buildDarwinSpec(
+	id string,
+	sandboxID string,
+	containerName string,
+	imageName string,
+	config *runtime.ContainerConfig,
+	sandboxConfig *runtime.PodSandboxConfig,
+	imageConfig *imagespec.ImageConfig,
+	extraMounts []*runtime.Mount,
+	ociRuntime config.Runtime,
+) (_ *runtimespec.Spec, retErr error) {
+	specOpts := []oci.SpecOpts{
+		customopts.WithProcessArgs(config, imageConfig),
+	}
+
+	if config.GetWorkingDir() != "" {
+		specOpts = append(specOpts, oci.WithProcessCwd(config.GetWorkingDir()))
+	} else if imageConfig.WorkingDir != "" {
+		specOpts = append(specOpts, oci.WithProcessCwd(imageConfig.WorkingDir))
+	}
+
+	if config.GetTty() {
+		specOpts = append(specOpts, oci.WithTTY)
+	}
+
+	// Apply envs from image config first, so that envs from container config
+	// can override them.
+	env := append([]string{}, imageConfig.Env...)
+	for _, e := range config.GetEnvs() {
+		env = append(env, e.GetKey()+"="+e.GetValue())
+	}
+	specOpts = append(specOpts, oci.WithEnv(env))
+
 	for pKey, pValue := range getPassthroughAnnotations(sandboxConfig.Annotations,
 		ociRuntime.PodAnnotations) {
 		specOpts = append(specOpts, customopts.WithAnnotation(pKey, pValue))
--- a/pkg/cri/sbserver/container_create_linux.go
+++ b/pkg/cri/sbserver/container_create_linux.go
@ -25,16 +25,13 @@ import (
 	"strconv"
 	"strings"

-	"github.com/containerd/cgroups/v3"
+	imagespec "github.com/opencontainers/image-spec/specs-go/v1"
+	runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
+
 	"github.com/containerd/containerd/contrib/apparmor"
 	"github.com/containerd/containerd/contrib/seccomp"
 	"github.com/containerd/containerd/oci"
 	"github.com/containerd/containerd/snapshots"
-	imagespec "github.com/opencontainers/image-spec/specs-go/v1"
-	runtimespec "github.com/opencontainers/runtime-spec/specs-go"
-	"github.com/opencontainers/selinux/go-selinux"
-	"github.com/opencontainers/selinux/go-selinux/label"
-	runtime "k8s.io/cri-api/pkg/apis/runtime/v1"

 	customopts "github.com/containerd/containerd/pkg/cri/opts"
 )
@ -111,93 +108,6 @@ func (c *criService) containerMounts(sandboxID string, config *runtime.Container
 	return mounts
 }

-func (c *criService) platformSpec(
-	id string,
-	sandboxID string,
-	config *runtime.ContainerConfig,
-	sandboxConfig *runtime.PodSandboxConfig,
-	imageConfig *imagespec.ImageConfig,
-	extraMounts []*runtime.Mount,
-) (_ []oci.SpecOpts, retErr error) {
-	specOpts := []oci.SpecOpts{}
-
-	securityContext := config.GetLinux().GetSecurityContext()
-	labelOptions, err := toLabel(securityContext.GetSelinuxOptions())
-	if err != nil {
-		return nil, err
-	}
-	if len(labelOptions) == 0 {
-		// Use pod level SELinux config
-		if sandbox, err := c.sandboxStore.Get(sandboxID); err == nil {
-			labelOptions, err = selinux.DupSecOpt(sandbox.ProcessLabel)
-			if err != nil {
-				return nil, err
-			}
-		}
-	}
-
-	processLabel, mountLabel, err := label.InitLabels(labelOptions)
-	if err != nil {
-		return nil, fmt.Errorf("failed to init selinux options %+v: %w", securityContext.GetSelinuxOptions(), err)
-	}
-	defer func() {
-		if retErr != nil {
-			selinux.ReleaseLabel(processLabel)
-		}
-	}()
-
-	specOpts = append(specOpts,
-		customopts.WithSelinuxLabels(processLabel, mountLabel),
-		customopts.WithMounts(c.os, config, extraMounts, mountLabel),
-		customopts.WithDevices(c.os, config, c.config.DeviceOwnershipFromSecurityContext),
-		customopts.WithCapabilities(securityContext, c.allCaps),
-	)
-
-	if c.config.DisableCgroup {
-		specOpts = append(specOpts, customopts.WithDisabledCgroups)
-	} else {
-		specOpts = append(specOpts, customopts.WithResources(config.GetLinux().GetResources(), c.config.TolerateMissingHugetlbController, c.config.DisableHugetlbController))
-		if sandboxConfig.GetLinux().GetCgroupParent() != "" {
-			cgroupsPath := getCgroupsPath(sandboxConfig.GetLinux().GetCgroupParent(), id)
-			specOpts = append(specOpts, oci.WithCgroup(cgroupsPath))
-		}
-	}
-
-	// Get blockio class
-	blockIOClass, err := c.blockIOClassFromAnnotations(config.GetMetadata().GetName(), config.Annotations, sandboxConfig.Annotations)
-	if err != nil {
-		return nil, fmt.Errorf("failed to set blockio class: %w", err)
-	}
-	if blockIOClass != "" {
-		if linuxBlockIO, err := blockIOToLinuxOci(blockIOClass); err == nil {
-			specOpts = append(specOpts, oci.WithBlockIO(linuxBlockIO))
-		} else {
-			return nil, err
-		}
-	}
-
-	// Get RDT class
-	rdtClass, err := c.rdtClassFromAnnotations(config.GetMetadata().GetName(), config.Annotations, sandboxConfig.Annotations)
-	if err != nil {
-		return nil, fmt.Errorf("failed to set RDT class: %w", err)
-	}
-	if rdtClass != "" {
-		specOpts = append(specOpts, oci.WithRdt(rdtClass, "", ""))
-	}
-
-	specOpts = append(specOpts, customopts.WithOOMScoreAdj(config, c.config.RestrictOOMScoreAdj))
-
-	// cgroupns is used for hiding /sys/fs/cgroup from containers.
-	// For compatibility, cgroupns is not used when running in cgroup v1 mode or in privileged.
-	// https://github.com/containers/libpod/issues/4363
-	// https://github.com/kubernetes/enhancements/blob/0e409b47497e398b369c281074485c8de129694f/keps/sig-node/20191118-cgroups-v2.md#cgroup-namespace
-	if cgroups.Mode() == cgroups.Unified && !securityContext.GetPrivileged() {
-		specOpts = append(specOpts, oci.WithLinuxNamespace(runtimespec.LinuxNamespace{Type: runtimespec.CgroupNamespace}))
-	}
-
-	return specOpts, nil
-}
-
 func (c *criService) containerSpecOpts(config *runtime.ContainerConfig, imageConfig *imagespec.ImageConfig) ([]oci.SpecOpts, error) {
 	var specOpts []oci.SpecOpts
 	securityContext := config.GetLinux().GetSecurityContext()
--- a/pkg/cri/sbserver/container_create_other.go
+++ b/pkg/cri/sbserver/container_create_other.go
@ -19,10 +19,11 @@
 package sbserver

 import (
-	"github.com/containerd/containerd/oci"
-	"github.com/containerd/containerd/snapshots"
 	imagespec "github.com/opencontainers/image-spec/specs-go/v1"
 	runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
+
+	"github.com/containerd/containerd/oci"
+	"github.com/containerd/containerd/snapshots"
 )

 // containerMounts sets up necessary container system file mounts
@ -31,17 +32,6 @@ func (c *criService) containerMounts(sandboxID string, config *runtime.Container
 	return []*runtime.Mount{}
 }

-func (c *criService) platformSpec(
-	id string,
-	sandboxID string,
-	config *runtime.ContainerConfig,
-	sandboxConfig *runtime.PodSandboxConfig,
-	imageConfig *imagespec.ImageConfig,
-	extraMounts []*runtime.Mount,
-) ([]oci.SpecOpts, error) {
-	return []oci.SpecOpts{}, nil
-}
-
 func (c *criService) containerSpecOpts(config *runtime.ContainerConfig, imageConfig *imagespec.ImageConfig) ([]oci.SpecOpts, error) {
 	return []oci.SpecOpts{}, nil
 }
--- a/pkg/cri/sbserver/container_create_windows.go
+++ b/pkg/cri/sbserver/container_create_windows.go
@ -23,7 +23,6 @@ import (
 	runtime "k8s.io/cri-api/pkg/apis/runtime/v1"

 	"github.com/containerd/containerd/oci"
-	customopts "github.com/containerd/containerd/pkg/cri/opts"
 	"github.com/containerd/containerd/snapshots"
 )

@ -32,49 +31,6 @@ func (c *criService) containerMounts(sandboxID string, config *runtime.Container
 	return nil
 }

-func (c *criService) platformSpec(
-	id string,
-	sandboxID string,
-	config *runtime.ContainerConfig,
-	sandboxConfig *runtime.PodSandboxConfig,
-	imageConfig *imagespec.ImageConfig,
-	extraMounts []*runtime.Mount,
-) ([]oci.SpecOpts, error) {
-	specOpts := []oci.SpecOpts{}
-
-	specOpts = append(specOpts,
-		customopts.WithWindowsMounts(c.os, config, extraMounts),
-		customopts.WithDevices(config),
-	)
-
-	// Start with the image config user and override below if RunAsUsername is not "".
-	username := imageConfig.User
-
-	windowsConfig := config.GetWindows()
-	if windowsConfig != nil {
-		specOpts = append(specOpts, customopts.WithWindowsResources(windowsConfig.GetResources()))
-		securityCtx := windowsConfig.GetSecurityContext()
-		if securityCtx != nil {
-			runAsUser := securityCtx.GetRunAsUsername()
-			if runAsUser != "" {
-				username = runAsUser
-			}
-			cs := securityCtx.GetCredentialSpec()
-			if cs != "" {
-				specOpts = append(specOpts, customopts.WithWindowsCredentialSpec(cs))
-			}
-		}
-	}
-
-	// There really isn't a good Windows way to verify that the username is available in the
-	// image as early as here like there is for Linux. Later on in the stack hcsshim
-	// will handle the behavior of erroring out if the user isn't available in the image
-	// when trying to run the init process.
-	specOpts = append(specOpts, oci.WithUser(username))
-
-	return specOpts, nil
-}
-
 // No extra spec options needed for windows.
 func (c *criService) containerSpecOpts(config *runtime.ContainerConfig, imageConfig *imagespec.ImageConfig) ([]oci.SpecOpts, error) {
 	return nil, nil
--- a/pkg/cri/sbserver/helpers.go
+++ b/pkg/cri/sbserver/helpers.go
@ -21,6 +21,7 @@ import (
 	"fmt"
 	"path"
 	"path/filepath"
+	"regexp"
 	goruntime "runtime"
 	"strconv"
 	"strings"
@ -603,3 +604,180 @@ func hostNetwork(config *runtime.PodSandboxConfig) bool {
 	}
 	return hostNet
 }
+
+// getCgroupsPath builds the cgroups path of container id under cgroupsParent.
+// A parent whose last path element ends in ".slice" yields the colon-separated
+// "slice:prefix:name" form with the fixed prefix "cri-containerd"; any other
+// parent is joined with id as a filesystem path.
+func getCgroupsPath(cgroupsParent, id string) string {
+	slice := path.Base(cgroupsParent)
+	if !strings.HasSuffix(slice, ".slice") {
+		return filepath.Join(cgroupsParent, id)
+	}
+	// Only the last element is kept: for a.slice/b.slice/c.slice it is c.slice.
+	// runc systemd cgroup path format is "slice:prefix:name".
+	return slice + ":" + "cri-containerd" + ":" + id
+}
+
+// toLabel turns CRI SELinux options into a list of "key:value" label strings
+// (keys: user, role, type, level). Empty fields are skipped. A nil option set
+// yields a nil slice and no error; a level rejected by checkSelinuxLevel
+// yields that error.
+func toLabel(selinuxOptions *runtime.SELinuxOption) ([]string, error) {
+	if selinuxOptions == nil {
+		return nil, nil
+	}
+	if err := checkSelinuxLevel(selinuxOptions.Level); err != nil {
+		return nil, err
+	}
+
+	// Output order is fixed: user, role, type, level.
+	fields := [...]struct{ prefix, value string }{
+		{"user:", selinuxOptions.User},
+		{"role:", selinuxOptions.Role},
+		{"type:", selinuxOptions.Type},
+		{"level:", selinuxOptions.Level},
+	}
+
+	var labels []string
+	for _, f := range fields {
+		if f.value != "" {
+			labels = append(labels, f.prefix+f.value)
+		}
+	}
+	return labels, nil
+}
+
+// selinuxLevelPattern is the accepted shape of an SELinux level: a sensitivity
+// "s<digit>", an optional "-s<digit>" upper bound, and an optional ":"-prefixed
+// comma-separated list of categories "c<1-4 digits>", each optionally extended
+// to a range with ".c<1-4 digits>". It is compiled once at package init rather
+// than on every call.
+var selinuxLevelPattern = regexp.MustCompile(`^s\d(-s\d)??(:c\d{1,4}(\.c\d{1,4})?(,c\d{1,4}(\.c\d{1,4})?)*)?$`)
+
+// checkSelinuxLevel validates the format of an SELinux level string.
+// An empty level is accepted. Any other value must match selinuxLevelPattern,
+// otherwise an error quoting the offending level is returned.
+func checkSelinuxLevel(level string) error {
+	if len(level) == 0 {
+		return nil
+	}
+	if !selinuxLevelPattern.MatchString(level) {
+		return fmt.Errorf("the format of 'level' %q is not correct", level)
+	}
+	return nil
+}
+
+// parseUsernsIDMap converts a CRI ID mapping list into runtime-spec mappings.
+// An empty list, or a single nil entry, yields no mappings and no error.
+// More than one entry, or an entry whose Length is below 1, is an error.
+func parseUsernsIDMap(runtimeIDMap []*runtime.IDMapping) ([]runtimespec.LinuxIDMapping, error) {
+	switch n := len(runtimeIDMap); {
+	case n == 0:
+		return nil, nil
+	case n > 1:
+		// We only accept 1 line, because containerd.WithRemappedSnapshot() only supports that.
+		return nil, fmt.Errorf("only one mapping line supported, got %v mapping lines", n)
+	}
+
+	// Exactly one entry from here on.
+	entry := runtimeIDMap[0]
+	if entry == nil {
+		return nil, nil
+	}
+	if entry.Length < 1 {
+		return nil, fmt.Errorf("invalid mapping length: %v", entry.Length)
+	}
+
+	return []runtimespec.LinuxIDMapping{
+		{
+			ContainerID: entry.ContainerId,
+			HostID:      entry.HostId,
+			Size:        entry.Length,
+		},
+	}, nil
+}
+
+// parseUsernsIDs validates a CRI user namespace config and returns its UID and
+// GID mappings. A nil config yields no mappings and no error. Mode NODE must
+// come without mappings, mode POD must come with both UID and GID mappings,
+// and any other mode is rejected.
+func parseUsernsIDs(userns *runtime.UserNamespace) (uids, gids []runtimespec.LinuxIDMapping, retErr error) {
+	if userns == nil {
+		// If userns is not set, the kubelet doesn't support this option
+		// and we should just fallback to no userns. This is completely
+		// valid.
+		return nil, nil, nil
+	}
+
+	var err error
+	if uids, err = parseUsernsIDMap(userns.GetUids()); err != nil {
+		return nil, nil, fmt.Errorf("UID mapping: %w", err)
+	}
+	if gids, err = parseUsernsIDMap(userns.GetGids()); err != nil {
+		return nil, nil, fmt.Errorf("GID mapping: %w", err)
+	}
+
+	mode := userns.GetMode()
+	if mode == runtime.NamespaceMode_NODE {
+		if len(uids) != 0 || len(gids) != 0 {
+			return nil, nil, fmt.Errorf("can't use user namespace mode %q with mappings. Got %v UID mappings and %v GID mappings", mode, len(uids), len(gids))
+		}
+		return uids, gids, nil
+	}
+	if mode == runtime.NamespaceMode_POD {
+		// This is valid, we will handle it in WithPodNamespaces().
+		if len(uids) == 0 || len(gids) == 0 {
+			return nil, nil, fmt.Errorf("can't use user namespace mode %q without UID and GID mappings", mode)
+		}
+		return uids, gids, nil
+	}
+
+	return nil, nil, fmt.Errorf("unsupported user namespace mode: %q", mode)
+}
+
+// sameUsernsConfig reports whether two userns configs are equal: same mode and
+// the same UID and GID mappings in the same order (identical mappings in a
+// different order compare as different). Two nil configs are equal; a config
+// that parseUsernsIDs rejects is never equal to anything.
+// XXX: If the runtime.UserNamespace struct changes, we should update this
+// function accordingly.
+func sameUsernsConfig(a, b *runtime.UserNamespace) bool {
+	if a == nil || b == nil {
+		// Equal only when both are nil.
+		return a == b
+	}
+
+	if a.GetMode() != b.GetMode() {
+		return false
+	}
+
+	aUids, aGids, errA := parseUsernsIDs(a)
+	if errA != nil {
+		return false
+	}
+	bUids, bGids, errB := parseUsernsIDs(b)
+	if errB != nil {
+		return false
+	}
+
+	return sameMapping(aUids, bUids) && sameMapping(aGids, bGids)
+}
+
+// sameMapping reports whether a and b hold the same ID mappings, compared
+// position by position on ContainerID, HostID and Size. Equal mappings listed
+// in a different order are reported as different.
+func sameMapping(a, b []runtimespec.LinuxIDMapping) bool {
+	if len(a) != len(b) {
+		return false
+	}
+
+	for i, left := range a {
+		right := b[i]
+		if left.ContainerID != right.ContainerID ||
+			left.HostID != right.HostID ||
+			left.Size != right.Size {
+			return false
+		}
+	}
+	return true
+}
--- a/pkg/cri/sbserver/helpers_linux.go
+++ b/pkg/cri/sbserver/helpers_linux.go
@ -20,23 +20,22 @@ import (
 	"context"
 	"fmt"
 	"os"
-	"path"
 	"path/filepath"
-	"regexp"
 	"sort"
 	"strings"
 	"syscall"
 	"time"

+	"github.com/containerd/cgroups/v3"
+	"github.com/moby/sys/mountinfo"
+	"github.com/opencontainers/runtime-spec/specs-go"
+	"golang.org/x/sys/unix"
+
 	"github.com/containerd/containerd/log"
 	"github.com/containerd/containerd/mount"
 	"github.com/containerd/containerd/pkg/apparmor"
 	"github.com/containerd/containerd/pkg/seccomp"
 	"github.com/containerd/containerd/pkg/seutil"
-	"github.com/moby/sys/mountinfo"
-	"github.com/opencontainers/runtime-spec/specs-go"
-	"golang.org/x/sys/unix"
-	runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
 )

 const (
@ -50,17 +49,6 @@ const (
 	resolvConfPath = "/etc/resolv.conf"
 )

-// getCgroupsPath generates container cgroups path.
-func getCgroupsPath(cgroupsParent, id string) string {
-	base := path.Base(cgroupsParent)
-	if strings.HasSuffix(base, ".slice") {
-		// For a.slice/b.slice/c.slice, base is c.slice.
-		// runc systemd cgroup path format is "slice:prefix:name".
-		return strings.Join([]string{base, "cri-containerd", id}, ":")
-	}
-	return filepath.Join(cgroupsParent, id)
-}
-
 // getSandboxRootDir returns the root directory for managing sandbox files,
 // e.g. hosts files.
 func (c *criService) getSandboxRootDir(id string) string {
@ -93,46 +81,6 @@ func (c *criService) getSandboxDevShm(id string) string {
 	return filepath.Join(c.getVolatileSandboxRootDir(id), "shm")
 }

-func toLabel(selinuxOptions *runtime.SELinuxOption) ([]string, error) {
-	var labels []string
-
-	if selinuxOptions == nil {
-		return nil, nil
-	}
-	if err := checkSelinuxLevel(selinuxOptions.Level); err != nil {
-		return nil, err
-	}
-	if selinuxOptions.User != "" {
-		labels = append(labels, "user:"+selinuxOptions.User)
-	}
-	if selinuxOptions.Role != "" {
-		labels = append(labels, "role:"+selinuxOptions.Role)
-	}
-	if selinuxOptions.Type != "" {
-		labels = append(labels, "type:"+selinuxOptions.Type)
-	}
-	if selinuxOptions.Level != "" {
-		labels = append(labels, "level:"+selinuxOptions.Level)
-	}
-
-	return labels, nil
-}
-
-func checkSelinuxLevel(level string) error {
-	if len(level) == 0 {
-		return nil
-	}
-
-	matched, err := regexp.MatchString(`^s\d(-s\d)??(:c\d{1,4}(\.c\d{1,4})?(,c\d{1,4}(\.c\d{1,4})?)*)?$`, level)
-	if err != nil {
-		return fmt.Errorf("the format of 'level' %q is not correct: %w", level, err)
-	}
-	if !matched {
-		return fmt.Errorf("the format of 'level' %q is not correct", level)
-	}
-	return nil
-}
-
 // apparmorEnabled returns true if apparmor is enabled, supported by the host,
 // if apparmor_parser is installed, and if we are not running docker-in-docker.
 func (c *criService) apparmorEnabled() bool {
@ -270,3 +218,9 @@ func modifyProcessLabel(runtimeType string, spec *specs.Spec) error {
 	spec.Process.SelinuxLabel = l
 	return nil
 }
+
+// isUnifiedCgroupsMode reports whether cgroups.Mode() is cgroups.Unified.
+// TODO: add build constraints to cgroups package and remove this helper
+func isUnifiedCgroupsMode() bool {
+	return cgroups.Mode() == cgroups.Unified
+}
--- a/pkg/cri/sbserver/helpers_other.go
+++ b/pkg/cri/sbserver/helpers_other.go
@ -41,3 +41,7 @@ func ensureRemoveAll(ctx context.Context, dir string) error {
 func modifyProcessLabel(runtimeType string, spec *specs.Spec) error {
 	return nil
 }
+
+// isUnifiedCgroupsMode always returns false in helpers_other.go; only
+// helpers_linux.go consults the cgroups package.
+func isUnifiedCgroupsMode() bool {
+	return false
+}
--- a/pkg/cri/sbserver/helpers_windows.go
+++ b/pkg/cri/sbserver/helpers_windows.go
@ -166,3 +166,7 @@ func ensureRemoveAll(_ context.Context, dir string) error {
 func modifyProcessLabel(runtimeType string, spec *specs.Spec) error {
 	return nil
 }
+
+// isUnifiedCgroupsMode always returns false in helpers_windows.go; only
+// helpers_linux.go consults the cgroups package.
+func isUnifiedCgroupsMode() bool {
+	return false
+}
--- a/pkg/cri/sbserver/rdt_linux.go
+++ b/pkg/cri/sbserver/rdt_linux.go
--- a/pkg/cri/sbserver/rdt_stub_linux.go
+++ b/pkg/cri/sbserver/rdt_stub_linux.go
--- a/pkg/cri/server/container_create_windows.go
+++ b/pkg/cri/server/container_create_windows.go
@ -21,12 +21,13 @@ import (
 	"fmt"
 	"strconv"

-	"github.com/containerd/containerd/oci"
-	"github.com/containerd/containerd/snapshots"
 	imagespec "github.com/opencontainers/image-spec/specs-go/v1"
 	runtimespec "github.com/opencontainers/runtime-spec/specs-go"
 	runtime "k8s.io/cri-api/pkg/apis/runtime/v1"

+	"github.com/containerd/containerd/oci"
+	"github.com/containerd/containerd/snapshots"
+
 	"github.com/containerd/containerd/pkg/cri/annotations"
 	"github.com/containerd/containerd/pkg/cri/config"
 	customopts "github.com/containerd/containerd/pkg/cri/opts"
@ -89,7 +90,7 @@ func (c *criService) containerSpec(
 		oci.WithHostname(sandboxConfig.GetHostname()),
 	)

-	specOpts = append(specOpts, customopts.WithWindowsMounts(c.os, config, extraMounts), customopts.WithDevices(config))
+	specOpts = append(specOpts, customopts.WithWindowsMounts(c.os, config, extraMounts), customopts.WithWindowsDevices(config))

 	// Start with the image config user and override below if RunAsUsername is not "".
 	username := imageConfig.User