diff --git a/oci/spec_opts_windows.go b/oci/spec_opts_windows.go index dd925a8b1..1a6cd942e 100644 --- a/oci/spec_opts_windows.go +++ b/oci/spec_opts_windows.go @@ -45,3 +45,10 @@ func WithHostDevices(_ context.Context, _ Client, _ *containers.Container, s *Sp func DeviceFromPath(path string) (*specs.LinuxDevice, error) { return nil, errors.New("device from path not supported on Windows") } + +// WithDevices does nothing on Windows. +func WithDevices(devicePath, containerPath, permissions string) SpecOpts { + return func(ctx context.Context, client Client, container *containers.Container, spec *Spec) error { + return nil + } +} diff --git a/pkg/cri/opts/container.go b/pkg/cri/opts/container.go index 5ea1b8739..04a3c1777 100644 --- a/pkg/cri/opts/container.go +++ b/pkg/cri/opts/container.go @@ -25,13 +25,14 @@ import ( goruntime "runtime" "strings" + "github.com/containerd/continuity/fs" + "github.com/containerd/containerd" "github.com/containerd/containerd/containers" "github.com/containerd/containerd/errdefs" "github.com/containerd/containerd/log" "github.com/containerd/containerd/mount" "github.com/containerd/containerd/snapshots" - "github.com/containerd/continuity/fs" ) // WithNewSnapshot wraps `containerd.WithNewSnapshot` so that if creating the diff --git a/pkg/cri/opts/spec_linux.go b/pkg/cri/opts/spec_linux.go index a7fde4a8f..f38d73a3d 100644 --- a/pkg/cri/opts/spec_linux.go +++ b/pkg/cri/opts/spec_linux.go @@ -22,8 +22,6 @@ import ( "fmt" "os" "path/filepath" - "sort" - "strconv" "strings" "sync" "syscall" @@ -31,255 +29,15 @@ import ( "github.com/container-orchestrated-devices/container-device-interface/pkg/cdi" "github.com/containerd/cgroups/v3" "github.com/containerd/cgroups/v3/cgroup1" - runtimespec "github.com/opencontainers/runtime-spec/specs-go" - "github.com/opencontainers/selinux/go-selinux/label" "github.com/sirupsen/logrus" "golang.org/x/sys/unix" - runtime "k8s.io/cri-api/pkg/apis/runtime/v1" "github.com/containerd/containerd/containers" 
"github.com/containerd/containerd/log" - "github.com/containerd/containerd/mount" "github.com/containerd/containerd/oci" - osinterface "github.com/containerd/containerd/pkg/os" ) -// WithMounts sorts and adds runtime and CRI mounts to the spec -func WithMounts(osi osinterface.OS, config *runtime.ContainerConfig, extra []*runtime.Mount, mountLabel string) oci.SpecOpts { - return func(ctx context.Context, client oci.Client, _ *containers.Container, s *runtimespec.Spec) (err error) { - // mergeMounts merge CRI mounts with extra mounts. If a mount destination - // is mounted by both a CRI mount and an extra mount, the CRI mount will - // be kept. - var ( - criMounts = config.GetMounts() - mounts = append([]*runtime.Mount{}, criMounts...) - ) - // Copy all mounts from extra mounts, except for mounts overridden by CRI. - for _, e := range extra { - found := false - for _, c := range criMounts { - if filepath.Clean(e.ContainerPath) == filepath.Clean(c.ContainerPath) { - found = true - break - } - } - if !found { - mounts = append(mounts, e) - } - } - - // Sort mounts in number of parts. This ensures that high level mounts don't - // shadow other mounts. - sort.Sort(orderedMounts(mounts)) - - // Mount cgroup into the container as readonly, which inherits docker's behavior. - s.Mounts = append(s.Mounts, runtimespec.Mount{ - Source: "cgroup", - Destination: "/sys/fs/cgroup", - Type: "cgroup", - Options: []string{"nosuid", "noexec", "nodev", "relatime", "ro"}, - }) - - // Copy all mounts from default mounts, except for - // - mounts overridden by supplied mount; - // - all mounts under /dev if a supplied /dev is present. 
- mountSet := make(map[string]struct{}) - for _, m := range mounts { - mountSet[filepath.Clean(m.ContainerPath)] = struct{}{} - } - - defaultMounts := s.Mounts - s.Mounts = nil - - for _, m := range defaultMounts { - dst := filepath.Clean(m.Destination) - if _, ok := mountSet[dst]; ok { - // filter out mount overridden by a supplied mount - continue - } - if _, mountDev := mountSet["/dev"]; mountDev && strings.HasPrefix(dst, "/dev/") { - // filter out everything under /dev if /dev is a supplied mount - continue - } - s.Mounts = append(s.Mounts, m) - } - - for _, mount := range mounts { - var ( - dst = mount.GetContainerPath() - src = mount.GetHostPath() - ) - // Create the host path if it doesn't exist. - // TODO(random-liu): Add CRI validation test for this case. - if _, err := osi.Stat(src); err != nil { - if !os.IsNotExist(err) { - return fmt.Errorf("failed to stat %q: %w", src, err) - } - if err := osi.MkdirAll(src, 0755); err != nil { - return fmt.Errorf("failed to mkdir %q: %w", src, err) - } - } - // TODO(random-liu): Add cri-containerd integration test or cri validation test - // for this. 
- src, err := osi.ResolveSymbolicLink(src) - if err != nil { - return fmt.Errorf("failed to resolve symlink %q: %w", src, err) - } - if s.Linux == nil { - s.Linux = &runtimespec.Linux{} - } - options := []string{"rbind"} - switch mount.GetPropagation() { - case runtime.MountPropagation_PROPAGATION_PRIVATE: - options = append(options, "rprivate") - // Since default root propagation in runc is rprivate ignore - // setting the root propagation - case runtime.MountPropagation_PROPAGATION_BIDIRECTIONAL: - if err := ensureShared(src, osi.LookupMount); err != nil { - return err - } - options = append(options, "rshared") - s.Linux.RootfsPropagation = "rshared" - case runtime.MountPropagation_PROPAGATION_HOST_TO_CONTAINER: - if err := ensureSharedOrSlave(src, osi.LookupMount); err != nil { - return err - } - options = append(options, "rslave") - if s.Linux.RootfsPropagation != "rshared" && - s.Linux.RootfsPropagation != "rslave" { - s.Linux.RootfsPropagation = "rslave" - } - default: - log.G(ctx).Warnf("Unknown propagation mode for hostPath %q", mount.HostPath) - options = append(options, "rprivate") - } - - // NOTE(random-liu): we don't change all mounts to `ro` when root filesystem - // is readonly. This is different from docker's behavior, but make more sense. - if mount.GetReadonly() { - options = append(options, "ro") - } else { - options = append(options, "rw") - } - - if mount.GetSelinuxRelabel() { - if err := label.Relabel(src, mountLabel, false); err != nil && err != unix.ENOTSUP { - return fmt.Errorf("relabel %q with %q failed: %w", src, mountLabel, err) - } - } - s.Mounts = append(s.Mounts, runtimespec.Mount{ - Source: src, - Destination: dst, - Type: "bind", - Options: options, - }) - } - return nil - } -} - -// Ensure mount point on which path is mounted, is shared. 
-func ensureShared(path string, lookupMount func(string) (mount.Info, error)) error { - mountInfo, err := lookupMount(path) - if err != nil { - return err - } - - // Make sure source mount point is shared. - optsSplit := strings.Split(mountInfo.Optional, " ") - for _, opt := range optsSplit { - if strings.HasPrefix(opt, "shared:") { - return nil - } - } - - return fmt.Errorf("path %q is mounted on %q but it is not a shared mount", path, mountInfo.Mountpoint) -} - -// ensure mount point on which path is mounted, is either shared or slave. -func ensureSharedOrSlave(path string, lookupMount func(string) (mount.Info, error)) error { - mountInfo, err := lookupMount(path) - if err != nil { - return err - } - // Make sure source mount point is shared. - optsSplit := strings.Split(mountInfo.Optional, " ") - for _, opt := range optsSplit { - if strings.HasPrefix(opt, "shared:") { - return nil - } else if strings.HasPrefix(opt, "master:") { - return nil - } - } - return fmt.Errorf("path %q is mounted on %q but it is not a shared or slave mount", path, mountInfo.Mountpoint) -} - -// getDeviceUserGroupID() is used to find the right uid/gid -// value for the device node created in the container namespace. -// The runtime executes mknod() and chmod()s the created -// device with the values returned here. -// -// On Linux, uid and gid are sufficient and the user/groupname do not -// need to be resolved. -// -// TODO(mythi): In case of user namespaces, the runtime simply bind -// mounts the devices from the host. Additional logic is needed -// to check that the runtimes effective UID/GID on the host has the -// permissions to access the device node and/or the right user namespace -// mappings are created. 
-// -// Ref: https://github.com/kubernetes/kubernetes/issues/92211 -func getDeviceUserGroupID(runAsVal *runtime.Int64Value) uint32 { - if runAsVal != nil { - return uint32(runAsVal.GetValue()) - } - return 0 -} - -// WithDevices sets the provided devices onto the container spec -func WithDevices(osi osinterface.OS, config *runtime.ContainerConfig, enableDeviceOwnershipFromSecurityContext bool) oci.SpecOpts { - return func(ctx context.Context, client oci.Client, c *containers.Container, s *runtimespec.Spec) (err error) { - if s.Linux == nil { - s.Linux = &runtimespec.Linux{} - } - if s.Linux.Resources == nil { - s.Linux.Resources = &runtimespec.LinuxResources{} - } - - oldDevices := len(s.Linux.Devices) - - for _, device := range config.GetDevices() { - path, err := osi.ResolveSymbolicLink(device.HostPath) - if err != nil { - return err - } - - o := oci.WithDevices(path, device.ContainerPath, device.Permissions) - if err := o(ctx, client, c, s); err != nil { - return err - } - } - - if enableDeviceOwnershipFromSecurityContext { - UID := getDeviceUserGroupID(config.GetLinux().GetSecurityContext().GetRunAsUser()) - GID := getDeviceUserGroupID(config.GetLinux().GetSecurityContext().GetRunAsGroup()) - // Loop all new devices added by oci.WithDevices() to update their - // dev.UID/dev.GID. - // - // non-zero UID/GID from SecurityContext is used to override host's - // device UID/GID for the container. - for idx := oldDevices; idx < len(s.Linux.Devices); idx++ { - if UID != 0 { - *s.Linux.Devices[idx].UID = UID - } - if GID != 0 { - *s.Linux.Devices[idx].GID = GID - } - } - } - return nil - } -} +// Linux dependent OCI spec opts. 
var ( swapControllerAvailability bool @@ -312,88 +70,6 @@ func SwapControllerAvailable() bool { return swapControllerAvailability } -// WithResources sets the provided resource restrictions -func WithResources(resources *runtime.LinuxContainerResources, tolerateMissingHugetlbController, disableHugetlbController bool) oci.SpecOpts { - return func(ctx context.Context, client oci.Client, c *containers.Container, s *runtimespec.Spec) (err error) { - if resources == nil { - return nil - } - if s.Linux == nil { - s.Linux = &runtimespec.Linux{} - } - if s.Linux.Resources == nil { - s.Linux.Resources = &runtimespec.LinuxResources{} - } - if s.Linux.Resources.CPU == nil { - s.Linux.Resources.CPU = &runtimespec.LinuxCPU{} - } - if s.Linux.Resources.Memory == nil { - s.Linux.Resources.Memory = &runtimespec.LinuxMemory{} - } - var ( - p = uint64(resources.GetCpuPeriod()) - q = resources.GetCpuQuota() - shares = uint64(resources.GetCpuShares()) - limit = resources.GetMemoryLimitInBytes() - swapLimit = resources.GetMemorySwapLimitInBytes() - hugepages = resources.GetHugepageLimits() - ) - - if p != 0 { - s.Linux.Resources.CPU.Period = &p - } - if q != 0 { - s.Linux.Resources.CPU.Quota = &q - } - if shares != 0 { - s.Linux.Resources.CPU.Shares = &shares - } - if cpus := resources.GetCpusetCpus(); cpus != "" { - s.Linux.Resources.CPU.Cpus = cpus - } - if mems := resources.GetCpusetMems(); mems != "" { - s.Linux.Resources.CPU.Mems = resources.GetCpusetMems() - } - if limit != 0 { - s.Linux.Resources.Memory.Limit = &limit - // swap/memory limit should be equal to prevent container from swapping by default - if swapLimit == 0 && SwapControllerAvailable() { - s.Linux.Resources.Memory.Swap = &limit - } - } - if swapLimit != 0 { - s.Linux.Resources.Memory.Swap = &swapLimit - } - - if !disableHugetlbController { - if isHugetlbControllerPresent() { - for _, limit := range hugepages { - s.Linux.Resources.HugepageLimits = append(s.Linux.Resources.HugepageLimits, 
runtimespec.LinuxHugepageLimit{ - Pagesize: limit.PageSize, - Limit: limit.Limit, - }) - } - } else { - if !tolerateMissingHugetlbController { - return errors.New("huge pages limits are specified but hugetlb cgroup controller is missing. " + - "Please set tolerate_missing_hugetlb_controller to `true` to ignore this error") - } - logrus.Warn("hugetlb cgroup controller is absent. skipping huge pages limits") - } - } - - if unified := resources.GetUnified(); unified != nil { - if s.Linux.Resources.Unified == nil { - s.Linux.Resources.Unified = make(map[string]string) - } - for k, v := range unified { - s.Linux.Resources.Unified[k] = v - } - } - return nil - } -} - var ( supportsHugetlbOnce sync.Once supportsHugetlb bool @@ -463,72 +139,6 @@ func IsCgroup2UnifiedMode() bool { return isUnified } -// WithOOMScoreAdj sets the oom score -func WithOOMScoreAdj(config *runtime.ContainerConfig, restrict bool) oci.SpecOpts { - return func(ctx context.Context, client oci.Client, c *containers.Container, s *runtimespec.Spec) error { - if s.Process == nil { - s.Process = &runtimespec.Process{} - } - - resources := config.GetLinux().GetResources() - if resources == nil { - return nil - } - adj := int(resources.GetOomScoreAdj()) - if restrict { - var err error - adj, err = restrictOOMScoreAdj(adj) - if err != nil { - return err - } - } - s.Process.OOMScoreAdj = &adj - return nil - } -} - -// WithPodOOMScoreAdj sets the oom score for the pod sandbox -func WithPodOOMScoreAdj(adj int, restrict bool) oci.SpecOpts { - return func(ctx context.Context, client oci.Client, c *containers.Container, s *runtimespec.Spec) error { - if s.Process == nil { - s.Process = &runtimespec.Process{} - } - if restrict { - var err error - adj, err = restrictOOMScoreAdj(adj) - if err != nil { - return err - } - } - s.Process.OOMScoreAdj = &adj - return nil - } -} - -func getCurrentOOMScoreAdj() (int, error) { - b, err := os.ReadFile("/proc/self/oom_score_adj") - if err != nil { - return 0, fmt.Errorf("could 
not get the daemon oom_score_adj: %w", err) - } - s := strings.TrimSpace(string(b)) - i, err := strconv.Atoi(s) - if err != nil { - return 0, fmt.Errorf("could not get the daemon oom_score_adj: %w", err) - } - return i, nil -} - -func restrictOOMScoreAdj(preferredOOMScoreAdj int) (int, error) { - currentOOMScoreAdj, err := getCurrentOOMScoreAdj() - if err != nil { - return preferredOOMScoreAdj, err - } - if preferredOOMScoreAdj < currentOOMScoreAdj { - return currentOOMScoreAdj, nil - } - return preferredOOMScoreAdj, nil -} - // WithCDI updates OCI spec with CDI content func WithCDI(annotations map[string]string) oci.SpecOpts { return func(ctx context.Context, _ oci.Client, c *containers.Container, s *oci.Spec) error { diff --git a/pkg/cri/opts/spec_linux_opts.go b/pkg/cri/opts/spec_linux_opts.go new file mode 100644 index 000000000..942f66f4a --- /dev/null +++ b/pkg/cri/opts/spec_linux_opts.go @@ -0,0 +1,426 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +package opts + +import ( + "context" + "errors" + "fmt" + "os" + "path/filepath" + "sort" + "strconv" + "strings" + "syscall" + + runtimespec "github.com/opencontainers/runtime-spec/specs-go" + "github.com/opencontainers/selinux/go-selinux/label" + "github.com/sirupsen/logrus" + runtime "k8s.io/cri-api/pkg/apis/runtime/v1" + + "github.com/containerd/containerd/containers" + "github.com/containerd/containerd/log" + "github.com/containerd/containerd/mount" + "github.com/containerd/containerd/oci" + osinterface "github.com/containerd/containerd/pkg/os" +) + +// WithMounts sorts and adds runtime and CRI mounts to the spec +func WithMounts(osi osinterface.OS, config *runtime.ContainerConfig, extra []*runtime.Mount, mountLabel string) oci.SpecOpts { + return func(ctx context.Context, client oci.Client, _ *containers.Container, s *runtimespec.Spec) (err error) { + // mergeMounts merge CRI mounts with extra mounts. If a mount destination + // is mounted by both a CRI mount and an extra mount, the CRI mount will + // be kept. + var ( + criMounts = config.GetMounts() + mounts = append([]*runtime.Mount{}, criMounts...) + ) + // Copy all mounts from extra mounts, except for mounts overridden by CRI. + for _, e := range extra { + found := false + for _, c := range criMounts { + if filepath.Clean(e.ContainerPath) == filepath.Clean(c.ContainerPath) { + found = true + break + } + } + if !found { + mounts = append(mounts, e) + } + } + + // Sort mounts in number of parts. This ensures that high level mounts don't + // shadow other mounts. + sort.Sort(orderedMounts(mounts)) + + // Mount cgroup into the container as readonly, which inherits docker's behavior. 
+ s.Mounts = append(s.Mounts, runtimespec.Mount{ + Source: "cgroup", + Destination: "/sys/fs/cgroup", + Type: "cgroup", + Options: []string{"nosuid", "noexec", "nodev", "relatime", "ro"}, + }) + + // Copy all mounts from default mounts, except for + // - mounts overridden by supplied mount; + // - all mounts under /dev if a supplied /dev is present. + mountSet := make(map[string]struct{}) + for _, m := range mounts { + mountSet[filepath.Clean(m.ContainerPath)] = struct{}{} + } + + defaultMounts := s.Mounts + s.Mounts = nil + + for _, m := range defaultMounts { + dst := filepath.Clean(m.Destination) + if _, ok := mountSet[dst]; ok { + // filter out mount overridden by a supplied mount + continue + } + if _, mountDev := mountSet["/dev"]; mountDev && strings.HasPrefix(dst, "/dev/") { + // filter out everything under /dev if /dev is a supplied mount + continue + } + s.Mounts = append(s.Mounts, m) + } + + for _, mount := range mounts { + var ( + dst = mount.GetContainerPath() + src = mount.GetHostPath() + ) + // Create the host path if it doesn't exist. + // TODO(random-liu): Add CRI validation test for this case. + if _, err := osi.Stat(src); err != nil { + if !os.IsNotExist(err) { + return fmt.Errorf("failed to stat %q: %w", src, err) + } + if err := osi.MkdirAll(src, 0755); err != nil { + return fmt.Errorf("failed to mkdir %q: %w", src, err) + } + } + // TODO(random-liu): Add cri-containerd integration test or cri validation test + // for this. 
+ src, err := osi.ResolveSymbolicLink(src) + if err != nil { + return fmt.Errorf("failed to resolve symlink %q: %w", src, err) + } + if s.Linux == nil { + s.Linux = &runtimespec.Linux{} + } + options := []string{"rbind"} + switch mount.GetPropagation() { + case runtime.MountPropagation_PROPAGATION_PRIVATE: + options = append(options, "rprivate") + // Since default root propagation in runc is rprivate ignore + // setting the root propagation + case runtime.MountPropagation_PROPAGATION_BIDIRECTIONAL: + if err := ensureShared(src, osi.LookupMount); err != nil { + return err + } + options = append(options, "rshared") + s.Linux.RootfsPropagation = "rshared" + case runtime.MountPropagation_PROPAGATION_HOST_TO_CONTAINER: + if err := ensureSharedOrSlave(src, osi.LookupMount); err != nil { + return err + } + options = append(options, "rslave") + if s.Linux.RootfsPropagation != "rshared" && + s.Linux.RootfsPropagation != "rslave" { + s.Linux.RootfsPropagation = "rslave" + } + default: + log.G(ctx).Warnf("Unknown propagation mode for hostPath %q", mount.HostPath) + options = append(options, "rprivate") + } + + // NOTE(random-liu): we don't change all mounts to `ro` when root filesystem + // is readonly. This is different from docker's behavior, but make more sense. + if mount.GetReadonly() { + options = append(options, "ro") + } else { + options = append(options, "rw") + } + + if mount.GetSelinuxRelabel() { + ENOTSUP := syscall.Errno(0x5f) // Linux specific error code, this branch will not execute on non Linux platforms. + if err := label.Relabel(src, mountLabel, false); err != nil && err != ENOTSUP { + return fmt.Errorf("relabel %q with %q failed: %w", src, mountLabel, err) + } + } + s.Mounts = append(s.Mounts, runtimespec.Mount{ + Source: src, + Destination: dst, + Type: "bind", + Options: options, + }) + } + return nil + } +} + +// Ensure mount point on which path is mounted, is shared. 
+func ensureShared(path string, lookupMount func(string) (mount.Info, error)) error { + mountInfo, err := lookupMount(path) + if err != nil { + return err + } + + // Make sure source mount point is shared. + optsSplit := strings.Split(mountInfo.Optional, " ") + for _, opt := range optsSplit { + if strings.HasPrefix(opt, "shared:") { + return nil + } + } + + return fmt.Errorf("path %q is mounted on %q but it is not a shared mount", path, mountInfo.Mountpoint) +} + +// ensure mount point on which path is mounted, is either shared or slave. +func ensureSharedOrSlave(path string, lookupMount func(string) (mount.Info, error)) error { + mountInfo, err := lookupMount(path) + if err != nil { + return err + } + // Make sure source mount point is shared. + optsSplit := strings.Split(mountInfo.Optional, " ") + for _, opt := range optsSplit { + if strings.HasPrefix(opt, "shared:") { + return nil + } else if strings.HasPrefix(opt, "master:") { + return nil + } + } + return fmt.Errorf("path %q is mounted on %q but it is not a shared or slave mount", path, mountInfo.Mountpoint) +} + +// getDeviceUserGroupID() is used to find the right uid/gid +// value for the device node created in the container namespace. +// The runtime executes mknod() and chmod()s the created +// device with the values returned here. +// +// On Linux, uid and gid are sufficient and the user/groupname do not +// need to be resolved. +// +// TODO(mythi): In case of user namespaces, the runtime simply bind +// mounts the devices from the host. Additional logic is needed +// to check that the runtimes effective UID/GID on the host has the +// permissions to access the device node and/or the right user namespace +// mappings are created. 
+// +// Ref: https://github.com/kubernetes/kubernetes/issues/92211 +func getDeviceUserGroupID(runAsVal *runtime.Int64Value) uint32 { + if runAsVal != nil { + return uint32(runAsVal.GetValue()) + } + return 0 +} + +// WithDevices sets the provided devices onto the container spec +func WithDevices(osi osinterface.OS, config *runtime.ContainerConfig, enableDeviceOwnershipFromSecurityContext bool) oci.SpecOpts { + return func(ctx context.Context, client oci.Client, c *containers.Container, s *runtimespec.Spec) (err error) { + if s.Linux == nil { + s.Linux = &runtimespec.Linux{} + } + if s.Linux.Resources == nil { + s.Linux.Resources = &runtimespec.LinuxResources{} + } + + oldDevices := len(s.Linux.Devices) + + for _, device := range config.GetDevices() { + path, err := osi.ResolveSymbolicLink(device.HostPath) + if err != nil { + return err + } + + o := oci.WithDevices(path, device.ContainerPath, device.Permissions) + if err := o(ctx, client, c, s); err != nil { + return err + } + } + + if enableDeviceOwnershipFromSecurityContext { + UID := getDeviceUserGroupID(config.GetLinux().GetSecurityContext().GetRunAsUser()) + GID := getDeviceUserGroupID(config.GetLinux().GetSecurityContext().GetRunAsGroup()) + // Loop all new devices added by oci.WithDevices() to update their + // dev.UID/dev.GID. + // + // non-zero UID/GID from SecurityContext is used to override host's + // device UID/GID for the container. 
+ for idx := oldDevices; idx < len(s.Linux.Devices); idx++ { + if UID != 0 { + *s.Linux.Devices[idx].UID = UID + } + if GID != 0 { + *s.Linux.Devices[idx].GID = GID + } + } + } + return nil + } +} + +// WithResources sets the provided resource restrictions +func WithResources(resources *runtime.LinuxContainerResources, tolerateMissingHugetlbController, disableHugetlbController bool) oci.SpecOpts { + return func(ctx context.Context, client oci.Client, c *containers.Container, s *runtimespec.Spec) (err error) { + if resources == nil { + return nil + } + if s.Linux == nil { + s.Linux = &runtimespec.Linux{} + } + if s.Linux.Resources == nil { + s.Linux.Resources = &runtimespec.LinuxResources{} + } + if s.Linux.Resources.CPU == nil { + s.Linux.Resources.CPU = &runtimespec.LinuxCPU{} + } + if s.Linux.Resources.Memory == nil { + s.Linux.Resources.Memory = &runtimespec.LinuxMemory{} + } + var ( + p = uint64(resources.GetCpuPeriod()) + q = resources.GetCpuQuota() + shares = uint64(resources.GetCpuShares()) + limit = resources.GetMemoryLimitInBytes() + swapLimit = resources.GetMemorySwapLimitInBytes() + hugepages = resources.GetHugepageLimits() + ) + + if p != 0 { + s.Linux.Resources.CPU.Period = &p + } + if q != 0 { + s.Linux.Resources.CPU.Quota = &q + } + if shares != 0 { + s.Linux.Resources.CPU.Shares = &shares + } + if cpus := resources.GetCpusetCpus(); cpus != "" { + s.Linux.Resources.CPU.Cpus = cpus + } + if mems := resources.GetCpusetMems(); mems != "" { + s.Linux.Resources.CPU.Mems = resources.GetCpusetMems() + } + if limit != 0 { + s.Linux.Resources.Memory.Limit = &limit + // swap/memory limit should be equal to prevent container from swapping by default + if swapLimit == 0 && SwapControllerAvailable() { + s.Linux.Resources.Memory.Swap = &limit + } + } + if swapLimit != 0 { + s.Linux.Resources.Memory.Swap = &swapLimit + } + + if !disableHugetlbController { + if isHugetlbControllerPresent() { + for _, limit := range hugepages { + s.Linux.Resources.HugepageLimits = 
append(s.Linux.Resources.HugepageLimits, runtimespec.LinuxHugepageLimit{ + Pagesize: limit.PageSize, + Limit: limit.Limit, + }) + } + } else { + if !tolerateMissingHugetlbController { + return errors.New("huge pages limits are specified but hugetlb cgroup controller is missing. " + + "Please set tolerate_missing_hugetlb_controller to `true` to ignore this error") + } + logrus.Warn("hugetlb cgroup controller is absent. skipping huge pages limits") + } + } + + if unified := resources.GetUnified(); unified != nil { + if s.Linux.Resources.Unified == nil { + s.Linux.Resources.Unified = make(map[string]string) + } + for k, v := range unified { + s.Linux.Resources.Unified[k] = v + } + } + return nil + } +} + +// WithOOMScoreAdj sets the oom score +func WithOOMScoreAdj(config *runtime.ContainerConfig, restrict bool) oci.SpecOpts { + return func(ctx context.Context, client oci.Client, c *containers.Container, s *runtimespec.Spec) error { + if s.Process == nil { + s.Process = &runtimespec.Process{} + } + + resources := config.GetLinux().GetResources() + if resources == nil { + return nil + } + adj := int(resources.GetOomScoreAdj()) + if restrict { + var err error + adj, err = restrictOOMScoreAdj(adj) + if err != nil { + return err + } + } + s.Process.OOMScoreAdj = &adj + return nil + } +} + +// WithPodOOMScoreAdj sets the oom score for the pod sandbox +func WithPodOOMScoreAdj(adj int, restrict bool) oci.SpecOpts { + return func(ctx context.Context, client oci.Client, c *containers.Container, s *runtimespec.Spec) error { + if s.Process == nil { + s.Process = &runtimespec.Process{} + } + if restrict { + var err error + adj, err = restrictOOMScoreAdj(adj) + if err != nil { + return err + } + } + s.Process.OOMScoreAdj = &adj + return nil + } +} + +func getCurrentOOMScoreAdj() (int, error) { + b, err := os.ReadFile("/proc/self/oom_score_adj") + if err != nil { + return 0, fmt.Errorf("could not get the daemon oom_score_adj: %w", err) + } + s := strings.TrimSpace(string(b)) + i, 
err := strconv.Atoi(s) + if err != nil { + return 0, fmt.Errorf("could not get the daemon oom_score_adj: %w", err) + } + return i, nil +} + +func restrictOOMScoreAdj(preferredOOMScoreAdj int) (int, error) { + currentOOMScoreAdj, err := getCurrentOOMScoreAdj() + if err != nil { + return preferredOOMScoreAdj, err + } + if preferredOOMScoreAdj < currentOOMScoreAdj { + return currentOOMScoreAdj, nil + } + return preferredOOMScoreAdj, nil +} diff --git a/pkg/cri/opts/spec_nonlinux.go b/pkg/cri/opts/spec_nonlinux.go new file mode 100644 index 000000000..b30f24da2 --- /dev/null +++ b/pkg/cri/opts/spec_nonlinux.go @@ -0,0 +1,41 @@ +//go:build !linux + +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package opts + +import ( + "context" + + "github.com/containerd/containerd/containers" + "github.com/containerd/containerd/oci" +) + +func isHugetlbControllerPresent() bool { + return false +} + +func SwapControllerAvailable() bool { + return false +} + +// WithCDI does nothing on non Linux platforms. 
+func WithCDI(_ map[string]string) oci.SpecOpts { + return func(ctx context.Context, client oci.Client, container *containers.Container, spec *oci.Spec) error { + return nil + } +} diff --git a/pkg/cri/opts/spec.go b/pkg/cri/opts/spec_opts.go similarity index 100% rename from pkg/cri/opts/spec.go rename to pkg/cri/opts/spec_opts.go diff --git a/pkg/cri/opts/spec_test.go b/pkg/cri/opts/spec_opts_test.go similarity index 100% rename from pkg/cri/opts/spec_test.go rename to pkg/cri/opts/spec_opts_test.go diff --git a/pkg/cri/opts/spec_windows.go b/pkg/cri/opts/spec_windows_opts.go similarity index 98% rename from pkg/cri/opts/spec_windows.go rename to pkg/cri/opts/spec_windows_opts.go index 9b964e748..b42a6d582 100644 --- a/pkg/cri/opts/spec_windows.go +++ b/pkg/cri/opts/spec_windows_opts.go @@ -24,11 +24,11 @@ import ( "sort" "strings" - "github.com/containerd/containerd/containers" - "github.com/containerd/containerd/oci" runtimespec "github.com/opencontainers/runtime-spec/specs-go" runtime "k8s.io/cri-api/pkg/apis/runtime/v1" + "github.com/containerd/containerd/containers" + "github.com/containerd/containerd/oci" osinterface "github.com/containerd/containerd/pkg/os" ) @@ -229,8 +229,8 @@ func WithWindowsCredentialSpec(credentialSpec string) oci.SpecOpts { } } -// WithDevices sets the provided devices onto the container spec -func WithDevices(config *runtime.ContainerConfig) oci.SpecOpts { +// WithWindowsDevices sets the provided devices onto the container spec +func WithWindowsDevices(config *runtime.ContainerConfig) oci.SpecOpts { return func(ctx context.Context, client oci.Client, c *containers.Container, s *runtimespec.Spec) (err error) { for _, device := range config.GetDevices() { if device.ContainerPath != "" { diff --git a/pkg/cri/opts/spec_windows_test.go b/pkg/cri/opts/spec_windows_test.go index 7afd609c9..9b5d92e8c 100644 --- a/pkg/cri/opts/spec_windows_test.go +++ b/pkg/cri/opts/spec_windows_test.go @@ -22,14 +22,15 @@ import ( "strings" "testing" - 
"github.com/containerd/containerd/containers" - "github.com/containerd/containerd/namespaces" - "github.com/containerd/containerd/oci" - osinterface "github.com/containerd/containerd/pkg/os" "github.com/opencontainers/runtime-spec/specs-go" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" runtime "k8s.io/cri-api/pkg/apis/runtime/v1" + + "github.com/containerd/containerd/containers" + "github.com/containerd/containerd/namespaces" + "github.com/containerd/containerd/oci" + osinterface "github.com/containerd/containerd/pkg/os" ) func TestWithDevices(t *testing.T) { @@ -183,7 +184,7 @@ func TestWithDevices(t *testing.T) { config := runtime.ContainerConfig{} config.Devices = tc.devices - specOpts := []oci.SpecOpts{WithDevices(&config)} + specOpts := []oci.SpecOpts{WithWindowsDevices(&config)} platform := "windows" if tc.isLCOW { diff --git a/pkg/cri/sbserver/blockio_stub_linux.go b/pkg/cri/sbserver/blockio_stub.go similarity index 100% rename from pkg/cri/sbserver/blockio_stub_linux.go rename to pkg/cri/sbserver/blockio_stub.go diff --git a/pkg/cri/sbserver/container_create.go b/pkg/cri/sbserver/container_create.go index 42616344b..b5045315b 100644 --- a/pkg/cri/sbserver/container_create.go +++ b/pkg/cri/sbserver/container_create.go @@ -24,6 +24,14 @@ import ( "strconv" "time" + "github.com/containerd/typeurl" + "github.com/davecgh/go-spew/spew" + imagespec "github.com/opencontainers/image-spec/specs-go/v1" + runtimespec "github.com/opencontainers/runtime-spec/specs-go" + "github.com/opencontainers/selinux/go-selinux" + "github.com/opencontainers/selinux/go-selinux/label" + runtime "k8s.io/cri-api/pkg/apis/runtime/v1" + "github.com/containerd/containerd" "github.com/containerd/containerd/api/types" "github.com/containerd/containerd/containers" @@ -37,12 +45,6 @@ import ( containerstore "github.com/containerd/containerd/pkg/cri/store/container" "github.com/containerd/containerd/pkg/cri/util" ctrdutil 
"github.com/containerd/containerd/pkg/cri/util" - "github.com/containerd/typeurl" - "github.com/davecgh/go-spew/spew" - imagespec "github.com/opencontainers/image-spec/specs-go/v1" - runtimespec "github.com/opencontainers/runtime-spec/specs-go" - "github.com/opencontainers/selinux/go-selinux" - runtime "k8s.io/cri-api/pkg/apis/runtime/v1" ) func init() { @@ -419,44 +421,317 @@ func (c *criService) buildContainerSpec( ociRuntime config.Runtime, ) (_ *runtimespec.Spec, retErr error) { var ( - specOpts []oci.SpecOpts - - // Platform helpers isLinux = platform.OS == "linux" isWindows = platform.OS == "windows" + isDarwin = platform.OS == "darwin" ) - if isLinux { - specOpts = append(specOpts, oci.WithoutRunMount) + switch { + case isLinux: + return c.buildLinuxSpec( + id, + sandboxID, + sandboxPid, + netNSPath, + containerName, + imageName, + config, + sandboxConfig, + imageConfig, + extraMounts, + ociRuntime, + ) + case isWindows: + return c.buildWindowsSpec( + id, + sandboxID, + sandboxPid, + netNSPath, + containerName, + imageName, + config, + sandboxConfig, + imageConfig, + extraMounts, + ociRuntime, + ) + case isDarwin: + return c.buildDarwinSpec( + id, + sandboxID, + containerName, + imageName, + config, + sandboxConfig, + imageConfig, + extraMounts, + ociRuntime, + ) + default: + return nil, fmt.Errorf("unsupported spec platform: %s", platform.OS) + } +} - // Only clear the default security settings if the runtime does not have a custom - // base runtime spec. Admins can use this functionality to define - // default ulimits, seccomp, or other default settings. 
- if ociRuntime.BaseRuntimeSpec == "" { - specOpts = append(specOpts, customopts.WithoutDefaultSecuritySettings) +func (c *criService) buildLinuxSpec( + id string, + sandboxID string, + sandboxPid uint32, + netNSPath string, + containerName string, + imageName string, + config *runtime.ContainerConfig, + sandboxConfig *runtime.PodSandboxConfig, + imageConfig *imagespec.ImageConfig, + extraMounts []*runtime.Mount, + ociRuntime config.Runtime, +) (_ *runtimespec.Spec, retErr error) { + specOpts := []oci.SpecOpts{ + oci.WithoutRunMount, + } + // only clear the default security settings if the runtime does not have a custom + // base runtime spec. Admins can use this functionality to define + // default ulimits, seccomp, or other default settings. + if ociRuntime.BaseRuntimeSpec == "" { + specOpts = append(specOpts, customopts.WithoutDefaultSecuritySettings) + } + + specOpts = append(specOpts, + customopts.WithRelativeRoot(relativeRootfsPath), + customopts.WithProcessArgs(config, imageConfig), + oci.WithDefaultPathEnv, + // this will be set based on the security context below + oci.WithNewPrivileges, + ) + + if config.GetWorkingDir() != "" { + specOpts = append(specOpts, oci.WithProcessCwd(config.GetWorkingDir())) + } else if imageConfig.WorkingDir != "" { + specOpts = append(specOpts, oci.WithProcessCwd(imageConfig.WorkingDir)) + } + + if config.GetTty() { + specOpts = append(specOpts, oci.WithTTY) + } + + // Add HOSTNAME env. + var ( + err error + hostname = sandboxConfig.GetHostname() + ) + if hostname == "" { + if hostname, err = c.os.Hostname(); err != nil { + return nil, err } + } + specOpts = append(specOpts, oci.WithEnv([]string{hostnameEnv + "=" + hostname})) - specOpts = append(specOpts, - customopts.WithRelativeRoot(relativeRootfsPath), - oci.WithDefaultPathEnv, - // this will be set based on the security context below - oci.WithNewPrivileges, - ) + // Apply envs from image config first, so that envs from container config + // can override them. 
+ env := append([]string{}, imageConfig.Env...) + for _, e := range config.GetEnvs() { + env = append(env, e.GetKey()+"="+e.GetValue()) + } + specOpts = append(specOpts, oci.WithEnv(env)) - // Add HOSTNAME env. - var ( - err error - hostname = sandboxConfig.GetHostname() - ) - if hostname == "" { - if hostname, err = c.os.Hostname(); err != nil { + securityContext := config.GetLinux().GetSecurityContext() + labelOptions, err := toLabel(securityContext.GetSelinuxOptions()) + if err != nil { + return nil, err + } + if len(labelOptions) == 0 { + // Use pod level SELinux config + if sandbox, err := c.sandboxStore.Get(sandboxID); err == nil { + labelOptions, err = selinux.DupSecOpt(sandbox.ProcessLabel) + if err != nil { return nil, err } } - specOpts = append(specOpts, oci.WithEnv([]string{hostnameEnv + "=" + hostname})) } - specOpts = append(specOpts, customopts.WithProcessArgs(config, imageConfig)) + processLabel, mountLabel, err := label.InitLabels(labelOptions) + if err != nil { + return nil, fmt.Errorf("failed to init selinux options %+v: %w", securityContext.GetSelinuxOptions(), err) + } + defer func() { + if retErr != nil { + selinux.ReleaseLabel(processLabel) + } + }() + + specOpts = append(specOpts, customopts.WithMounts(c.os, config, extraMounts, mountLabel)) + + if !c.config.DisableProcMount { + // Change the default masked/readonly paths to empty slices + // See https://github.com/containerd/containerd/issues/5029 + // TODO: Provide an option to set default paths to the ones in oci.populateDefaultUnixSpec() + specOpts = append(specOpts, oci.WithMaskedPaths([]string{}), oci.WithReadonlyPaths([]string{})) + + // Apply masked paths if specified. + // If the container is privileged, this will be cleared later on. + if maskedPaths := securityContext.GetMaskedPaths(); maskedPaths != nil { + specOpts = append(specOpts, oci.WithMaskedPaths(maskedPaths)) + } + + // Apply readonly paths if specified. + // If the container is privileged, this will be cleared later on. 
+ if readonlyPaths := securityContext.GetReadonlyPaths(); readonlyPaths != nil { + specOpts = append(specOpts, oci.WithReadonlyPaths(readonlyPaths)) + } + } + + specOpts = append(specOpts, customopts.WithDevices(c.os, config, c.config.DeviceOwnershipFromSecurityContext), + customopts.WithCapabilities(securityContext, c.allCaps)) + + if securityContext.GetPrivileged() { + if !sandboxConfig.GetLinux().GetSecurityContext().GetPrivileged() { + return nil, errors.New("no privileged container allowed in sandbox") + } + specOpts = append(specOpts, oci.WithPrivileged) + if !ociRuntime.PrivilegedWithoutHostDevices { + specOpts = append(specOpts, oci.WithHostDevices, oci.WithAllDevicesAllowed) + } else if ociRuntime.PrivilegedWithoutHostDevicesAllDevicesAllowed { + // allow rwm on all devices for the container + specOpts = append(specOpts, oci.WithAllDevicesAllowed) + } + } + + // Clear all ambient capabilities. The implication of non-root + caps + // is not clearly defined in Kubernetes. + // See https://github.com/kubernetes/kubernetes/issues/56374 + // Keep docker's behavior for now. + specOpts = append(specOpts, + customopts.WithoutAmbientCaps, + customopts.WithSelinuxLabels(processLabel, mountLabel), + ) + + // TODO: Figure out whether we should set no new privilege for sandbox container by default + if securityContext.GetNoNewPrivs() { + specOpts = append(specOpts, oci.WithNoNewPrivileges) + } + // TODO(random-liu): [P1] Set selinux options (privileged or not). 
+ if securityContext.GetReadonlyRootfs() { + specOpts = append(specOpts, oci.WithRootFSReadonly()) + } + + if c.config.DisableCgroup { + specOpts = append(specOpts, customopts.WithDisabledCgroups) + } else { + specOpts = append(specOpts, customopts.WithResources(config.GetLinux().GetResources(), c.config.TolerateMissingHugetlbController, c.config.DisableHugetlbController)) + if sandboxConfig.GetLinux().GetCgroupParent() != "" { + cgroupsPath := getCgroupsPath(sandboxConfig.GetLinux().GetCgroupParent(), id) + specOpts = append(specOpts, oci.WithCgroup(cgroupsPath)) + } + } + + supplementalGroups := securityContext.GetSupplementalGroups() + + // Get blockio class + blockIOClass, err := c.blockIOClassFromAnnotations(config.GetMetadata().GetName(), config.Annotations, sandboxConfig.Annotations) + if err != nil { + return nil, fmt.Errorf("failed to set blockio class: %w", err) + } + if blockIOClass != "" { + if linuxBlockIO, err := blockIOToLinuxOci(blockIOClass); err == nil { + specOpts = append(specOpts, oci.WithBlockIO(linuxBlockIO)) + } else { + return nil, err + } + } + + // Get RDT class + rdtClass, err := c.rdtClassFromAnnotations(config.GetMetadata().GetName(), config.Annotations, sandboxConfig.Annotations) + if err != nil { + return nil, fmt.Errorf("failed to set RDT class: %w", err) + } + if rdtClass != "" { + specOpts = append(specOpts, oci.WithRdt(rdtClass, "", "")) + } + + for pKey, pValue := range getPassthroughAnnotations(sandboxConfig.Annotations, + ociRuntime.PodAnnotations) { + specOpts = append(specOpts, customopts.WithAnnotation(pKey, pValue)) + } + + for pKey, pValue := range getPassthroughAnnotations(config.Annotations, + ociRuntime.ContainerAnnotations) { + specOpts = append(specOpts, customopts.WithAnnotation(pKey, pValue)) + } + + // Default target PID namespace is the sandbox PID. + targetPid := sandboxPid + // If the container targets another container's PID namespace, + // set targetPid to the PID of that container. 
+ nsOpts := securityContext.GetNamespaceOptions() + if nsOpts.GetPid() == runtime.NamespaceMode_TARGET { + targetContainer, err := c.validateTargetContainer(sandboxID, nsOpts.TargetId) + if err != nil { + return nil, fmt.Errorf("invalid target container: %w", err) + } + + status := targetContainer.Status.Get() + targetPid = status.Pid + } + + uids, gids, err := parseUsernsIDs(nsOpts.GetUsernsOptions()) + if err != nil { + return nil, fmt.Errorf("user namespace configuration: %w", err) + } + + // Check sandbox userns config is consistent with container config. + sandboxUsernsOpts := sandboxConfig.GetLinux().GetSecurityContext().GetNamespaceOptions().GetUsernsOptions() + if !sameUsernsConfig(sandboxUsernsOpts, nsOpts.GetUsernsOptions()) { + return nil, fmt.Errorf("user namespace config for sandbox is different from container. Sandbox userns config: %v - Container userns config: %v", sandboxUsernsOpts, nsOpts.GetUsernsOptions()) + } + + specOpts = append(specOpts, + customopts.WithOOMScoreAdj(config, c.config.RestrictOOMScoreAdj), + customopts.WithPodNamespaces(securityContext, sandboxPid, targetPid, uids, gids), + customopts.WithSupplementalGroups(supplementalGroups), + customopts.WithAnnotation(annotations.ContainerType, annotations.ContainerTypeContainer), + customopts.WithAnnotation(annotations.SandboxID, sandboxID), + customopts.WithAnnotation(annotations.SandboxNamespace, sandboxConfig.GetMetadata().GetNamespace()), + customopts.WithAnnotation(annotations.SandboxUID, sandboxConfig.GetMetadata().GetUid()), + customopts.WithAnnotation(annotations.SandboxName, sandboxConfig.GetMetadata().GetName()), + customopts.WithAnnotation(annotations.ContainerName, containerName), + customopts.WithAnnotation(annotations.ImageName, imageName), + ) + + // cgroupns is used for hiding /sys/fs/cgroup from containers. + // For compatibility, cgroupns is not used when running in cgroup v1 mode or in privileged. 
+ // https://github.com/containers/libpod/issues/4363 + // https://github.com/kubernetes/enhancements/blob/0e409b47497e398b369c281074485c8de129694f/keps/sig-node/20191118-cgroups-v2.md#cgroup-namespace + if isUnifiedCgroupsMode() && !securityContext.GetPrivileged() { + specOpts = append(specOpts, oci.WithLinuxNamespace(runtimespec.LinuxNamespace{Type: runtimespec.CgroupNamespace})) + } + + return c.runtimeSpec(id, ociRuntime.BaseRuntimeSpec, specOpts...) +} + +func (c *criService) buildWindowsSpec( + id string, + sandboxID string, + sandboxPid uint32, + netNSPath string, + containerName string, + imageName string, + config *runtime.ContainerConfig, + sandboxConfig *runtime.PodSandboxConfig, + imageConfig *imagespec.ImageConfig, + extraMounts []*runtime.Mount, + ociRuntime config.Runtime, +) (_ *runtimespec.Spec, retErr error) { + specOpts := []oci.SpecOpts{ + customopts.WithProcessArgs(config, imageConfig), + } + + // All containers in a pod need to have HostProcess set if it was set on the pod, + // and vice versa no containers in the pod can be HostProcess if the pod's spec + // didn't have the field set. The only case that is valid is if these are the same value. + cntrHpc := config.GetWindows().GetSecurityContext().GetHostProcess() + sandboxHpc := sandboxConfig.GetWindows().GetSecurityContext().GetHostProcess() + if cntrHpc != sandboxHpc { + return nil, errors.New("pod spec and all containers inside must have the HostProcess field set to be valid") + } if config.GetWorkingDir() != "" { specOpts = append(specOpts, oci.WithProcessCwd(config.GetWorkingDir())) @@ -476,116 +751,98 @@ func (c *criService) buildContainerSpec( } specOpts = append(specOpts, oci.WithEnv(env)) - if isWindows { - specOpts = append(specOpts, - // Clear the root location since hcsshim expects it. - // NOTE: readonly rootfs doesn't work on windows. 
- customopts.WithoutRoot, - oci.WithWindowsNetworkNamespace(netNSPath), - oci.WithHostname(sandboxConfig.GetHostname()), - ) - - // All containers in a pod need to have HostProcess set if it was set on the pod, - // and vice versa no containers in the pod can be HostProcess if the pods spec - // didn't have the field set. The only case that is valid is if these are the same value. - cntrHpc := config.GetWindows().GetSecurityContext().GetHostProcess() - sandboxHpc := sandboxConfig.GetWindows().GetSecurityContext().GetHostProcess() - if cntrHpc != sandboxHpc { - return nil, errors.New("pod spec and all containers inside must have the HostProcess field set to be valid") - } - - specOpts = append(specOpts, customopts.WithAnnotation(annotations.WindowsHostProcess, strconv.FormatBool(sandboxHpc))) - } - - // Get spec opts that depend on features offered by the platform containerd daemon is running on. - platformSpecOpts, err := c.platformSpec( - id, - sandboxID, - config, - sandboxConfig, - imageConfig, - extraMounts, + specOpts = append(specOpts, + // Clear the root location since hcsshim expects it. + // NOTE: readonly rootfs doesn't work on windows. + customopts.WithoutRoot, + oci.WithWindowsNetworkNamespace(netNSPath), + oci.WithHostname(sandboxConfig.GetHostname()), ) - if err != nil { - return nil, err + + specOpts = append(specOpts, customopts.WithWindowsMounts(c.os, config, extraMounts), customopts.WithWindowsDevices(config)) + + // Start with the image config user and override below if RunAsUsername is not "". 
+ username := imageConfig.User + + windowsConfig := config.GetWindows() + if windowsConfig != nil { + specOpts = append(specOpts, customopts.WithWindowsResources(windowsConfig.GetResources())) + securityCtx := windowsConfig.GetSecurityContext() + if securityCtx != nil { + runAsUser := securityCtx.GetRunAsUsername() + if runAsUser != "" { + username = runAsUser + } + cs := securityCtx.GetCredentialSpec() + if cs != "" { + specOpts = append(specOpts, customopts.WithWindowsCredentialSpec(cs)) + } + } } - specOpts = append(specOpts, platformSpecOpts...) + // There really isn't a good Windows way to verify that the username is available in the + // image as early as here like there is for Linux. Later on in the stack hcsshim + // will handle the behavior of erroring out if the user isn't available in the image + // when trying to run the init process. + specOpts = append(specOpts, oci.WithUser(username)) - if isLinux { - securityContext := config.GetLinux().GetSecurityContext() - - if !c.config.DisableProcMount { - // Change the default masked/readonly paths to empty slices - // See https://github.com/containerd/containerd/issues/5029 - // TODO: Provide an option to set default paths to the ones in oci.populateDefaultUnixSpec() - specOpts = append(specOpts, oci.WithMaskedPaths([]string{}), oci.WithReadonlyPaths([]string{})) - - // Apply masked paths if specified. - // If the container is privileged, this will be cleared later on. - if maskedPaths := securityContext.GetMaskedPaths(); maskedPaths != nil { - specOpts = append(specOpts, oci.WithMaskedPaths(maskedPaths)) - } - - // Apply readonly paths if specified. - // If the container is privileged, this will be cleared later on. 
- if readonlyPaths := securityContext.GetReadonlyPaths(); readonlyPaths != nil { - specOpts = append(specOpts, oci.WithReadonlyPaths(readonlyPaths)) - } - } - - if securityContext.GetPrivileged() { - if !sandboxConfig.GetLinux().GetSecurityContext().GetPrivileged() { - return nil, errors.New("no privileged container allowed in sandbox") - } - specOpts = append(specOpts, oci.WithPrivileged) - if !ociRuntime.PrivilegedWithoutHostDevices { - specOpts = append(specOpts, oci.WithHostDevices, oci.WithAllDevicesAllowed) - } else if ociRuntime.PrivilegedWithoutHostDevicesAllDevicesAllowed { - // allow rwm on all devices for the container - specOpts = append(specOpts, oci.WithAllDevicesAllowed) - } - } - - // Clear all ambient capabilities. The implication of non-root + caps - // is not clearly defined in Kubernetes. - // See https://github.com/kubernetes/kubernetes/issues/56374 - // Keep docker's behavior for now. - specOpts = append(specOpts, customopts.WithoutAmbientCaps) - - // TODO: Figure out whether we should set no new privilege for sandbox container by default - if securityContext.GetNoNewPrivs() { - specOpts = append(specOpts, oci.WithNoNewPrivileges) - } - // TODO(random-liu): [P1] Set selinux options (privileged or not). - if securityContext.GetReadonlyRootfs() { - specOpts = append(specOpts, oci.WithRootFSReadonly()) - } - - supplementalGroups := securityContext.GetSupplementalGroups() - specOpts = append(specOpts, customopts.WithSupplementalGroups(supplementalGroups)) - - // Default target PID namespace is the sandbox PID. - targetPid := sandboxPid - // If the container targets another container's PID namespace, - // set targetPid to the PID of that container. 
- nsOpts := securityContext.GetNamespaceOptions() - if nsOpts.GetPid() == runtime.NamespaceMode_TARGET { - targetContainer, err := c.validateTargetContainer(sandboxID, nsOpts.TargetId) - if err != nil { - return nil, fmt.Errorf("invalid target container: %w", err) - } - - status := targetContainer.Status.Get() - targetPid = status.Pid - } - - specOpts = append(specOpts, - // TODO: This is a hack to make this compile. We should move userns support to sbserver. - customopts.WithPodNamespaces(securityContext, sandboxPid, targetPid, nil, nil), - ) + for pKey, pValue := range getPassthroughAnnotations(sandboxConfig.Annotations, + ociRuntime.PodAnnotations) { + specOpts = append(specOpts, customopts.WithAnnotation(pKey, pValue)) } + for pKey, pValue := range getPassthroughAnnotations(config.Annotations, + ociRuntime.ContainerAnnotations) { + specOpts = append(specOpts, customopts.WithAnnotation(pKey, pValue)) + } + + specOpts = append(specOpts, + customopts.WithAnnotation(annotations.ContainerType, annotations.ContainerTypeContainer), + customopts.WithAnnotation(annotations.SandboxID, sandboxID), + customopts.WithAnnotation(annotations.SandboxNamespace, sandboxConfig.GetMetadata().GetNamespace()), + customopts.WithAnnotation(annotations.SandboxUID, sandboxConfig.GetMetadata().GetUid()), + customopts.WithAnnotation(annotations.SandboxName, sandboxConfig.GetMetadata().GetName()), + customopts.WithAnnotation(annotations.ContainerName, containerName), + customopts.WithAnnotation(annotations.ImageName, imageName), + customopts.WithAnnotation(annotations.WindowsHostProcess, strconv.FormatBool(sandboxHpc)), + ) + + return c.runtimeSpec(id, ociRuntime.BaseRuntimeSpec, specOpts...) 
+} + +func (c *criService) buildDarwinSpec( + id string, + sandboxID string, + containerName string, + imageName string, + config *runtime.ContainerConfig, + sandboxConfig *runtime.PodSandboxConfig, + imageConfig *imagespec.ImageConfig, + extraMounts []*runtime.Mount, + ociRuntime config.Runtime, +) (_ *runtimespec.Spec, retErr error) { + specOpts := []oci.SpecOpts{ + customopts.WithProcessArgs(config, imageConfig), + } + + if config.GetWorkingDir() != "" { + specOpts = append(specOpts, oci.WithProcessCwd(config.GetWorkingDir())) + } else if imageConfig.WorkingDir != "" { + specOpts = append(specOpts, oci.WithProcessCwd(imageConfig.WorkingDir)) + } + + if config.GetTty() { + specOpts = append(specOpts, oci.WithTTY) + } + + // Apply envs from image config first, so that envs from container config + // can override them. + env := append([]string{}, imageConfig.Env...) + for _, e := range config.GetEnvs() { + env = append(env, e.GetKey()+"="+e.GetValue()) + } + specOpts = append(specOpts, oci.WithEnv(env)) + for pKey, pValue := range getPassthroughAnnotations(sandboxConfig.Annotations, ociRuntime.PodAnnotations) { specOpts = append(specOpts, customopts.WithAnnotation(pKey, pValue)) diff --git a/pkg/cri/sbserver/container_create_linux.go b/pkg/cri/sbserver/container_create_linux.go index 171ce325f..ba39c1219 100644 --- a/pkg/cri/sbserver/container_create_linux.go +++ b/pkg/cri/sbserver/container_create_linux.go @@ -25,16 +25,13 @@ import ( "strconv" "strings" - "github.com/containerd/cgroups/v3" + imagespec "github.com/opencontainers/image-spec/specs-go/v1" + runtime "k8s.io/cri-api/pkg/apis/runtime/v1" + "github.com/containerd/containerd/contrib/apparmor" "github.com/containerd/containerd/contrib/seccomp" "github.com/containerd/containerd/oci" "github.com/containerd/containerd/snapshots" - imagespec "github.com/opencontainers/image-spec/specs-go/v1" - runtimespec "github.com/opencontainers/runtime-spec/specs-go" - "github.com/opencontainers/selinux/go-selinux" - 
"github.com/opencontainers/selinux/go-selinux/label" - runtime "k8s.io/cri-api/pkg/apis/runtime/v1" customopts "github.com/containerd/containerd/pkg/cri/opts" ) @@ -111,93 +108,6 @@ func (c *criService) containerMounts(sandboxID string, config *runtime.Container return mounts } -func (c *criService) platformSpec( - id string, - sandboxID string, - config *runtime.ContainerConfig, - sandboxConfig *runtime.PodSandboxConfig, - imageConfig *imagespec.ImageConfig, - extraMounts []*runtime.Mount, -) (_ []oci.SpecOpts, retErr error) { - specOpts := []oci.SpecOpts{} - - securityContext := config.GetLinux().GetSecurityContext() - labelOptions, err := toLabel(securityContext.GetSelinuxOptions()) - if err != nil { - return nil, err - } - if len(labelOptions) == 0 { - // Use pod level SELinux config - if sandbox, err := c.sandboxStore.Get(sandboxID); err == nil { - labelOptions, err = selinux.DupSecOpt(sandbox.ProcessLabel) - if err != nil { - return nil, err - } - } - } - - processLabel, mountLabel, err := label.InitLabels(labelOptions) - if err != nil { - return nil, fmt.Errorf("failed to init selinux options %+v: %w", securityContext.GetSelinuxOptions(), err) - } - defer func() { - if retErr != nil { - selinux.ReleaseLabel(processLabel) - } - }() - - specOpts = append(specOpts, - customopts.WithSelinuxLabels(processLabel, mountLabel), - customopts.WithMounts(c.os, config, extraMounts, mountLabel), - customopts.WithDevices(c.os, config, c.config.DeviceOwnershipFromSecurityContext), - customopts.WithCapabilities(securityContext, c.allCaps), - ) - - if c.config.DisableCgroup { - specOpts = append(specOpts, customopts.WithDisabledCgroups) - } else { - specOpts = append(specOpts, customopts.WithResources(config.GetLinux().GetResources(), c.config.TolerateMissingHugetlbController, c.config.DisableHugetlbController)) - if sandboxConfig.GetLinux().GetCgroupParent() != "" { - cgroupsPath := getCgroupsPath(sandboxConfig.GetLinux().GetCgroupParent(), id) - specOpts = append(specOpts, 
oci.WithCgroup(cgroupsPath)) - } - } - - // Get blockio class - blockIOClass, err := c.blockIOClassFromAnnotations(config.GetMetadata().GetName(), config.Annotations, sandboxConfig.Annotations) - if err != nil { - return nil, fmt.Errorf("failed to set blockio class: %w", err) - } - if blockIOClass != "" { - if linuxBlockIO, err := blockIOToLinuxOci(blockIOClass); err == nil { - specOpts = append(specOpts, oci.WithBlockIO(linuxBlockIO)) - } else { - return nil, err - } - } - - // Get RDT class - rdtClass, err := c.rdtClassFromAnnotations(config.GetMetadata().GetName(), config.Annotations, sandboxConfig.Annotations) - if err != nil { - return nil, fmt.Errorf("failed to set RDT class: %w", err) - } - if rdtClass != "" { - specOpts = append(specOpts, oci.WithRdt(rdtClass, "", "")) - } - - specOpts = append(specOpts, customopts.WithOOMScoreAdj(config, c.config.RestrictOOMScoreAdj)) - - // cgroupns is used for hiding /sys/fs/cgroup from containers. - // For compatibility, cgroupns is not used when running in cgroup v1 mode or in privileged. 
- // https://github.com/containers/libpod/issues/4363 - // https://github.com/kubernetes/enhancements/blob/0e409b47497e398b369c281074485c8de129694f/keps/sig-node/20191118-cgroups-v2.md#cgroup-namespace - if cgroups.Mode() == cgroups.Unified && !securityContext.GetPrivileged() { - specOpts = append(specOpts, oci.WithLinuxNamespace(runtimespec.LinuxNamespace{Type: runtimespec.CgroupNamespace})) - } - - return specOpts, nil -} - func (c *criService) containerSpecOpts(config *runtime.ContainerConfig, imageConfig *imagespec.ImageConfig) ([]oci.SpecOpts, error) { var specOpts []oci.SpecOpts securityContext := config.GetLinux().GetSecurityContext() diff --git a/pkg/cri/sbserver/container_create_other.go b/pkg/cri/sbserver/container_create_other.go index a2d8e1f27..90f94b100 100644 --- a/pkg/cri/sbserver/container_create_other.go +++ b/pkg/cri/sbserver/container_create_other.go @@ -19,10 +19,11 @@ package sbserver import ( - "github.com/containerd/containerd/oci" - "github.com/containerd/containerd/snapshots" imagespec "github.com/opencontainers/image-spec/specs-go/v1" runtime "k8s.io/cri-api/pkg/apis/runtime/v1" + + "github.com/containerd/containerd/oci" + "github.com/containerd/containerd/snapshots" ) // containerMounts sets up necessary container system file mounts @@ -31,17 +32,6 @@ func (c *criService) containerMounts(sandboxID string, config *runtime.Container return []*runtime.Mount{} } -func (c *criService) platformSpec( - id string, - sandboxID string, - config *runtime.ContainerConfig, - sandboxConfig *runtime.PodSandboxConfig, - imageConfig *imagespec.ImageConfig, - extraMounts []*runtime.Mount, -) ([]oci.SpecOpts, error) { - return []oci.SpecOpts{}, nil -} - func (c *criService) containerSpecOpts(config *runtime.ContainerConfig, imageConfig *imagespec.ImageConfig) ([]oci.SpecOpts, error) { return []oci.SpecOpts{}, nil } diff --git a/pkg/cri/sbserver/container_create_windows.go b/pkg/cri/sbserver/container_create_windows.go index c9d9b893d..ee9a2cd1e 100644 --- 
a/pkg/cri/sbserver/container_create_windows.go +++ b/pkg/cri/sbserver/container_create_windows.go @@ -23,7 +23,6 @@ import ( runtime "k8s.io/cri-api/pkg/apis/runtime/v1" "github.com/containerd/containerd/oci" - customopts "github.com/containerd/containerd/pkg/cri/opts" "github.com/containerd/containerd/snapshots" ) @@ -32,49 +31,6 @@ func (c *criService) containerMounts(sandboxID string, config *runtime.Container return nil } -func (c *criService) platformSpec( - id string, - sandboxID string, - config *runtime.ContainerConfig, - sandboxConfig *runtime.PodSandboxConfig, - imageConfig *imagespec.ImageConfig, - extraMounts []*runtime.Mount, -) ([]oci.SpecOpts, error) { - specOpts := []oci.SpecOpts{} - - specOpts = append(specOpts, - customopts.WithWindowsMounts(c.os, config, extraMounts), - customopts.WithDevices(config), - ) - - // Start with the image config user and override below if RunAsUsername is not "". - username := imageConfig.User - - windowsConfig := config.GetWindows() - if windowsConfig != nil { - specOpts = append(specOpts, customopts.WithWindowsResources(windowsConfig.GetResources())) - securityCtx := windowsConfig.GetSecurityContext() - if securityCtx != nil { - runAsUser := securityCtx.GetRunAsUsername() - if runAsUser != "" { - username = runAsUser - } - cs := securityCtx.GetCredentialSpec() - if cs != "" { - specOpts = append(specOpts, customopts.WithWindowsCredentialSpec(cs)) - } - } - } - - // There really isn't a good Windows way to verify that the username is available in the - // image as early as here like there is for Linux. Later on in the stack hcsshim - // will handle the behavior of erroring out if the user isn't available in the image - // when trying to run the init process. - specOpts = append(specOpts, oci.WithUser(username)) - - return specOpts, nil -} - // No extra spec options needed for windows. 
func (c *criService) containerSpecOpts(config *runtime.ContainerConfig, imageConfig *imagespec.ImageConfig) ([]oci.SpecOpts, error) { return nil, nil diff --git a/pkg/cri/sbserver/helpers.go b/pkg/cri/sbserver/helpers.go index c0be54e53..a85b86cae 100644 --- a/pkg/cri/sbserver/helpers.go +++ b/pkg/cri/sbserver/helpers.go @@ -21,6 +21,7 @@ import ( "fmt" "path" "path/filepath" + "regexp" goruntime "runtime" "strconv" "strings" @@ -603,3 +604,180 @@ func hostNetwork(config *runtime.PodSandboxConfig) bool { } return hostNet } + +// getCgroupsPath generates container cgroups path. +func getCgroupsPath(cgroupsParent, id string) string { + base := path.Base(cgroupsParent) + if strings.HasSuffix(base, ".slice") { + // For a.slice/b.slice/c.slice, base is c.slice. + // runc systemd cgroup path format is "slice:prefix:name". + return strings.Join([]string{base, "cri-containerd", id}, ":") + } + return filepath.Join(cgroupsParent, id) +} + +func toLabel(selinuxOptions *runtime.SELinuxOption) ([]string, error) { + var labels []string + + if selinuxOptions == nil { + return nil, nil + } + if err := checkSelinuxLevel(selinuxOptions.Level); err != nil { + return nil, err + } + if selinuxOptions.User != "" { + labels = append(labels, "user:"+selinuxOptions.User) + } + if selinuxOptions.Role != "" { + labels = append(labels, "role:"+selinuxOptions.Role) + } + if selinuxOptions.Type != "" { + labels = append(labels, "type:"+selinuxOptions.Type) + } + if selinuxOptions.Level != "" { + labels = append(labels, "level:"+selinuxOptions.Level) + } + + return labels, nil +} + +func checkSelinuxLevel(level string) error { + if len(level) == 0 { + return nil + } + + matched, err := regexp.MatchString(`^s\d(-s\d)??(:c\d{1,4}(\.c\d{1,4})?(,c\d{1,4}(\.c\d{1,4})?)*)?$`, level) + if err != nil { + return fmt.Errorf("the format of 'level' %q is not correct: %w", level, err) + } + if !matched { + return fmt.Errorf("the format of 'level' %q is not correct", level) + } + return nil +} + +func 
parseUsernsIDMap(runtimeIDMap []*runtime.IDMapping) ([]runtimespec.LinuxIDMapping, error) { + var m []runtimespec.LinuxIDMapping + + if len(runtimeIDMap) == 0 { + return m, nil + } + + if len(runtimeIDMap) > 1 { + // We only accept 1 line, because containerd.WithRemappedSnapshot() only supports that. + return m, fmt.Errorf("only one mapping line supported, got %v mapping lines", len(runtimeIDMap)) + } + + // We know len is 1 now. + if runtimeIDMap[0] == nil { + return m, nil + } + uidMap := *runtimeIDMap[0] + + if uidMap.Length < 1 { + return m, fmt.Errorf("invalid mapping length: %v", uidMap.Length) + } + + m = []runtimespec.LinuxIDMapping{ + { + ContainerID: uidMap.ContainerId, + HostID: uidMap.HostId, + Size: uidMap.Length, + }, + } + + return m, nil +} + +func parseUsernsIDs(userns *runtime.UserNamespace) (uids, gids []runtimespec.LinuxIDMapping, retErr error) { + if userns == nil { + // If userns is not set, the kubelet doesn't support this option + // and we should just fallback to no userns. This is completely + // valid. + return nil, nil, nil + } + + uids, err := parseUsernsIDMap(userns.GetUids()) + if err != nil { + return nil, nil, fmt.Errorf("UID mapping: %w", err) + } + + gids, err = parseUsernsIDMap(userns.GetGids()) + if err != nil { + return nil, nil, fmt.Errorf("GID mapping: %w", err) + } + + switch mode := userns.GetMode(); mode { + case runtime.NamespaceMode_NODE: + if len(uids) != 0 || len(gids) != 0 { + return nil, nil, fmt.Errorf("can't use user namespace mode %q with mappings. Got %v UID mappings and %v GID mappings", mode, len(uids), len(gids)) + } + case runtime.NamespaceMode_POD: + // This is valid, we will handle it in WithPodNamespaces(). 
+ if len(uids) == 0 || len(gids) == 0 { + return nil, nil, fmt.Errorf("can't use user namespace mode %q without UID and GID mappings", mode) + } + default: + return nil, nil, fmt.Errorf("unsupported user namespace mode: %q", mode) + } + + return uids, gids, nil +} + +// sameUsernsConfig checks if the userns configs are the same. If the mappings +// on each config are the same but in different order, it returns false. +// XXX: If the runtime.UserNamespace struct changes, we should update this +// function accordingly. +func sameUsernsConfig(a, b *runtime.UserNamespace) bool { + // If both are nil, they are the same. + if a == nil && b == nil { + return true + } + // If only one is nil, they are different. + if a == nil || b == nil { + return false + } + // At this point, neither a nor b is nil. + + if a.GetMode() != b.GetMode() { + return false + } + + aUids, aGids, err := parseUsernsIDs(a) + if err != nil { + return false + } + bUids, bGids, err := parseUsernsIDs(b) + if err != nil { + return false + } + + if !sameMapping(aUids, bUids) { + return false + } + if !sameMapping(aGids, bGids) { + return false + } + return true +} + +// sameMapping checks if the mappings are the same. If the mappings are the same +// but in different order, it returns false. 
+func sameMapping(a, b []runtimespec.LinuxIDMapping) bool { + if len(a) != len(b) { + return false + } + + for x := range a { + if a[x].ContainerID != b[x].ContainerID { + return false + } + if a[x].HostID != b[x].HostID { + return false + } + if a[x].Size != b[x].Size { + return false + } + } + return true +} diff --git a/pkg/cri/sbserver/helpers_linux.go b/pkg/cri/sbserver/helpers_linux.go index c465a4931..12b2888cb 100644 --- a/pkg/cri/sbserver/helpers_linux.go +++ b/pkg/cri/sbserver/helpers_linux.go @@ -20,23 +20,22 @@ import ( "context" "fmt" "os" - "path" "path/filepath" - "regexp" "sort" "strings" "syscall" "time" + "github.com/containerd/cgroups/v3" + "github.com/moby/sys/mountinfo" + "github.com/opencontainers/runtime-spec/specs-go" + "golang.org/x/sys/unix" + "github.com/containerd/containerd/log" "github.com/containerd/containerd/mount" "github.com/containerd/containerd/pkg/apparmor" "github.com/containerd/containerd/pkg/seccomp" "github.com/containerd/containerd/pkg/seutil" - "github.com/moby/sys/mountinfo" - "github.com/opencontainers/runtime-spec/specs-go" - "golang.org/x/sys/unix" - runtime "k8s.io/cri-api/pkg/apis/runtime/v1" ) const ( @@ -50,17 +49,6 @@ const ( resolvConfPath = "/etc/resolv.conf" ) -// getCgroupsPath generates container cgroups path. -func getCgroupsPath(cgroupsParent, id string) string { - base := path.Base(cgroupsParent) - if strings.HasSuffix(base, ".slice") { - // For a.slice/b.slice/c.slice, base is c.slice. - // runc systemd cgroup path format is "slice:prefix:name". - return strings.Join([]string{base, "cri-containerd", id}, ":") - } - return filepath.Join(cgroupsParent, id) -} - // getSandboxRootDir returns the root directory for managing sandbox files, // e.g. hosts files. 
func (c *criService) getSandboxRootDir(id string) string { @@ -93,46 +81,6 @@ func (c *criService) getSandboxDevShm(id string) string { return filepath.Join(c.getVolatileSandboxRootDir(id), "shm") } -func toLabel(selinuxOptions *runtime.SELinuxOption) ([]string, error) { - var labels []string - - if selinuxOptions == nil { - return nil, nil - } - if err := checkSelinuxLevel(selinuxOptions.Level); err != nil { - return nil, err - } - if selinuxOptions.User != "" { - labels = append(labels, "user:"+selinuxOptions.User) - } - if selinuxOptions.Role != "" { - labels = append(labels, "role:"+selinuxOptions.Role) - } - if selinuxOptions.Type != "" { - labels = append(labels, "type:"+selinuxOptions.Type) - } - if selinuxOptions.Level != "" { - labels = append(labels, "level:"+selinuxOptions.Level) - } - - return labels, nil -} - -func checkSelinuxLevel(level string) error { - if len(level) == 0 { - return nil - } - - matched, err := regexp.MatchString(`^s\d(-s\d)??(:c\d{1,4}(\.c\d{1,4})?(,c\d{1,4}(\.c\d{1,4})?)*)?$`, level) - if err != nil { - return fmt.Errorf("the format of 'level' %q is not correct: %w", level, err) - } - if !matched { - return fmt.Errorf("the format of 'level' %q is not correct", level) - } - return nil -} - // apparmorEnabled returns true if apparmor is enabled, supported by the host, // if apparmor_parser is installed, and if we are not running docker-in-docker. func (c *criService) apparmorEnabled() bool { @@ -270,3 +218,9 @@ func modifyProcessLabel(runtimeType string, spec *specs.Spec) error { spec.Process.SelinuxLabel = l return nil } + +// isUnifiedCgroupsMode reports whether the cgroups mode is unified. 
+// TODO: add build constraints to cgroups package and remove this helper +func isUnifiedCgroupsMode() bool { + return cgroups.Mode() == cgroups.Unified +} diff --git a/pkg/cri/sbserver/helpers_other.go b/pkg/cri/sbserver/helpers_other.go index e0a904ae7..aef880153 100644 --- a/pkg/cri/sbserver/helpers_other.go +++ b/pkg/cri/sbserver/helpers_other.go @@ -41,3 +41,7 @@ func ensureRemoveAll(ctx context.Context, dir string) error { func modifyProcessLabel(runtimeType string, spec *specs.Spec) error { return nil } + +func isUnifiedCgroupsMode() bool { + return false +} diff --git a/pkg/cri/sbserver/helpers_windows.go b/pkg/cri/sbserver/helpers_windows.go index b052f37ca..aa44299d0 100644 --- a/pkg/cri/sbserver/helpers_windows.go +++ b/pkg/cri/sbserver/helpers_windows.go @@ -166,3 +166,7 @@ func ensureRemoveAll(_ context.Context, dir string) error { func modifyProcessLabel(runtimeType string, spec *specs.Spec) error { return nil } + +func isUnifiedCgroupsMode() bool { + return false +} diff --git a/pkg/cri/sbserver/rdt_linux.go b/pkg/cri/sbserver/rdt.go similarity index 100% rename from pkg/cri/sbserver/rdt_linux.go rename to pkg/cri/sbserver/rdt.go diff --git a/pkg/cri/sbserver/rdt_stub_linux.go b/pkg/cri/sbserver/rdt_stub.go similarity index 100% rename from pkg/cri/sbserver/rdt_stub_linux.go rename to pkg/cri/sbserver/rdt_stub.go diff --git a/pkg/cri/server/container_create_windows.go b/pkg/cri/server/container_create_windows.go index e11466545..8116c5d3a 100644 --- a/pkg/cri/server/container_create_windows.go +++ b/pkg/cri/server/container_create_windows.go @@ -21,12 +21,13 @@ import ( "fmt" "strconv" - "github.com/containerd/containerd/oci" - "github.com/containerd/containerd/snapshots" imagespec "github.com/opencontainers/image-spec/specs-go/v1" runtimespec "github.com/opencontainers/runtime-spec/specs-go" runtime "k8s.io/cri-api/pkg/apis/runtime/v1" + "github.com/containerd/containerd/oci" + "github.com/containerd/containerd/snapshots" + 
"github.com/containerd/containerd/pkg/cri/annotations" "github.com/containerd/containerd/pkg/cri/config" customopts "github.com/containerd/containerd/pkg/cri/opts" @@ -89,7 +90,7 @@ func (c *criService) containerSpec( oci.WithHostname(sandboxConfig.GetHostname()), ) - specOpts = append(specOpts, customopts.WithWindowsMounts(c.os, config, extraMounts), customopts.WithDevices(config)) + specOpts = append(specOpts, customopts.WithWindowsMounts(c.os, config, extraMounts), customopts.WithWindowsDevices(config)) // Start with the image config user and override below if RunAsUsername is not "". username := imageConfig.User