From 0ae0399b167bd4a01cd81cde9b5de06e6d37fb4f Mon Sep 17 00:00:00 2001 From: Maksym Pavlenko Date: Thu, 5 Jan 2023 20:31:48 -0800 Subject: [PATCH] Make OCI spec opts available on all platforms Signed-off-by: Maksym Pavlenko --- oci/spec_opts.go | 167 ++++++++++++++++++++++++ oci/spec_opts_linux.go | 97 -------------- oci/spec_opts_nonlinux.go | 22 ---- oci/spec_opts_unix.go | 7 - oci/spec_opts_windows.go | 86 ++---------- pkg/cri/opts/spec.go | 259 ++++++++++++++++++++++++++++++++++++ pkg/cri/opts/spec_linux.go | 260 +------------------------------------ 7 files changed, 435 insertions(+), 463 deletions(-) diff --git a/oci/spec_opts.go b/oci/spec_opts.go index 5394feeba..18f653347 100644 --- a/oci/spec_opts.go +++ b/oci/spec_opts.go @@ -1433,3 +1433,170 @@ func WithWindowsDevice(idType, id string) SpecOpts { return nil } } + +// WithMemorySwap sets the container's swap in bytes +func WithMemorySwap(swap int64) SpecOpts { + return func(ctx context.Context, _ Client, c *containers.Container, s *Spec) error { + setResources(s) + if s.Linux.Resources.Memory == nil { + s.Linux.Resources.Memory = &specs.LinuxMemory{} + } + s.Linux.Resources.Memory.Swap = &swap + return nil + } +} + +// WithPidsLimit sets the container's pid limit or maximum +func WithPidsLimit(limit int64) SpecOpts { + return func(ctx context.Context, _ Client, c *containers.Container, s *Spec) error { + setResources(s) + if s.Linux.Resources.Pids == nil { + s.Linux.Resources.Pids = &specs.LinuxPids{} + } + s.Linux.Resources.Pids.Limit = limit + return nil + } +} + +// WithBlockIO sets the container's blkio parameters +func WithBlockIO(blockio *specs.LinuxBlockIO) SpecOpts { + return func(ctx context.Context, _ Client, c *containers.Container, s *Spec) error { + setResources(s) + s.Linux.Resources.BlockIO = blockio + return nil + } +} + +// WithCPUShares sets the container's cpu shares +func WithCPUShares(shares uint64) SpecOpts { + return func(ctx context.Context, _ Client, c *containers.Container, s *Spec) error { + setCPU(s) + s.Linux.Resources.CPU.Shares = &shares + return nil + } +} + +// WithCPUs sets the container's cpus/cores for use by the container +func WithCPUs(cpus string) SpecOpts { + return func(ctx context.Context, _ Client, c *containers.Container, s *Spec) error { + setCPU(s) + s.Linux.Resources.CPU.Cpus = cpus + return nil + } +} + +// WithCPUsMems sets the container's cpu mems for use by the container +func WithCPUsMems(mems string) SpecOpts { + return func(ctx context.Context, _ Client, c *containers.Container, s *Spec) error { + setCPU(s) + s.Linux.Resources.CPU.Mems = mems + return nil + } +} + +// WithCPUCFS sets the container's Completely fair scheduling (CFS) quota and period +func WithCPUCFS(quota int64, period uint64) SpecOpts { + return func(ctx context.Context, _ Client, c *containers.Container, s *Spec) error { + setCPU(s) + s.Linux.Resources.CPU.Quota = "a + s.Linux.Resources.CPU.Period = &period + return nil + } +} + +// WithCPURT sets the container's realtime scheduling (RT) runtime and period. +func WithCPURT(runtime int64, period uint64) SpecOpts { + return func(ctx context.Context, _ Client, c *containers.Container, s *Spec) error { + setCPU(s) + s.Linux.Resources.CPU.RealtimeRuntime = &runtime + s.Linux.Resources.CPU.RealtimePeriod = &period + return nil + } +} + +// WithoutRunMount removes the `/run` inside the spec +func WithoutRunMount(ctx context.Context, client Client, c *containers.Container, s *Spec) error { + return WithoutMounts("/run")(ctx, client, c, s) +} + +// WithRdt sets the container's RDT parameters +func WithRdt(closID, l3CacheSchema, memBwSchema string) SpecOpts { + return func(ctx context.Context, _ Client, c *containers.Container, s *Spec) error { + s.Linux.IntelRdt = &specs.LinuxIntelRdt{ + ClosID: closID, + L3CacheSchema: l3CacheSchema, + MemBwSchema: memBwSchema, + } + return nil + } +} + +// WithWindowsCPUCount sets the `Windows.Resources.CPU.Count` section to the +// `count` specified. +func WithWindowsCPUCount(count uint64) SpecOpts { + return func(_ context.Context, _ Client, _ *containers.Container, s *Spec) error { + setCPUWindows(s) + s.Windows.Resources.CPU.Count = &count + return nil + } +} + +// WithWindowsCPUShares sets the `Windows.Resources.CPU.Shares` section to the +// `shares` specified. +func WithWindowsCPUShares(shares uint16) SpecOpts { + return func(_ context.Context, _ Client, _ *containers.Container, s *Spec) error { + setCPUWindows(s) + s.Windows.Resources.CPU.Shares = &shares + return nil + } +} + +// WithWindowsCPUMaximum sets the `Windows.Resources.CPU.Maximum` section to the +// `max` specified. +func WithWindowsCPUMaximum(max uint16) SpecOpts { + return func(_ context.Context, _ Client, _ *containers.Container, s *Spec) error { + setCPUWindows(s) + s.Windows.Resources.CPU.Maximum = &max + return nil + } +} + +// WithWindowsIgnoreFlushesDuringBoot sets `Windows.IgnoreFlushesDuringBoot`. +func WithWindowsIgnoreFlushesDuringBoot() SpecOpts { + return func(_ context.Context, _ Client, _ *containers.Container, s *Spec) error { + if s.Windows == nil { + s.Windows = &specs.Windows{} + } + s.Windows.IgnoreFlushesDuringBoot = true + return nil + } +} + +// WithWindowNetworksAllowUnqualifiedDNSQuery sets `Windows.Network.AllowUnqualifiedDNSQuery`. +func WithWindowNetworksAllowUnqualifiedDNSQuery() SpecOpts { + return func(_ context.Context, _ Client, _ *containers.Container, s *Spec) error { + if s.Windows == nil { + s.Windows = &specs.Windows{} + } + if s.Windows.Network == nil { + s.Windows.Network = &specs.WindowsNetwork{} + } + + s.Windows.Network.AllowUnqualifiedDNSQuery = true + return nil + } +} + +// WithWindowsNetworkNamespace sets the network namespace for a Windows container. +func WithWindowsNetworkNamespace(ns string) SpecOpts { + return func(_ context.Context, _ Client, _ *containers.Container, s *Spec) error { + if s.Windows == nil { + s.Windows = &specs.Windows{} + } + if s.Windows.Network == nil { + s.Windows.Network = &specs.WindowsNetwork{} + } + s.Windows.Network.NetworkNamespace = ns + return nil + } +} diff --git a/oci/spec_opts_linux.go b/oci/spec_opts_linux.go index 36ad26196..bccea766a 100644 --- a/oci/spec_opts_linux.go +++ b/oci/spec_opts_linux.go @@ -60,86 +60,6 @@ func WithDevices(devicePath, containerPath, permissions string) SpecOpts { } } -// WithMemorySwap sets the container's swap in bytes -func WithMemorySwap(swap int64) SpecOpts { - return func(ctx context.Context, _ Client, c *containers.Container, s *Spec) error { - setResources(s) - if s.Linux.Resources.Memory == nil { - s.Linux.Resources.Memory = &specs.LinuxMemory{} - } - s.Linux.Resources.Memory.Swap = &swap - return nil - } -} - -// WithPidsLimit sets the container's pid limit or maximum -func WithPidsLimit(limit int64) SpecOpts { - return func(ctx context.Context, _ Client, c *containers.Container, s *Spec) error { - setResources(s) - if s.Linux.Resources.Pids == nil { - s.Linux.Resources.Pids = &specs.LinuxPids{} - } - s.Linux.Resources.Pids.Limit = limit - return nil - } -} - -// WithBlockIO sets the container's blkio parameters -func WithBlockIO(blockio *specs.LinuxBlockIO) SpecOpts { - return func(ctx context.Context, _ Client, c *containers.Container, s *Spec) error { - setResources(s) - s.Linux.Resources.BlockIO = blockio - return nil - } -} - -// WithCPUShares sets the container's cpu shares -func WithCPUShares(shares uint64) SpecOpts { - return func(ctx context.Context, _ Client, c *containers.Container, s *Spec) error { - setCPU(s) - s.Linux.Resources.CPU.Shares = &shares - return nil - } -} - -// WithCPUs sets the container's cpus/cores for use by the container -func WithCPUs(cpus string) SpecOpts { - return func(ctx context.Context, _ Client, c *containers.Container, s *Spec) error { - setCPU(s) - s.Linux.Resources.CPU.Cpus = cpus - return nil - } -} - -// WithCPUsMems sets the container's cpu mems for use by the container -func WithCPUsMems(mems string) SpecOpts { - return func(ctx context.Context, _ Client, c *containers.Container, s *Spec) error { - setCPU(s) - s.Linux.Resources.CPU.Mems = mems - return nil - } -} - -// WithCPUCFS sets the container's Completely fair scheduling (CFS) quota and period -func WithCPUCFS(quota int64, period uint64) SpecOpts { - return func(ctx context.Context, _ Client, c *containers.Container, s *Spec) error { - setCPU(s) - s.Linux.Resources.CPU.Quota = "a - s.Linux.Resources.CPU.Period = &period - return nil - } -} - -// WithCPURT sets the container's realtime scheduling (RT) runtime and period. -func WithCPURT(runtime int64, period uint64) SpecOpts { - return func(ctx context.Context, _ Client, c *containers.Container, s *Spec) error { - setCPU(s) - s.Linux.Resources.CPU.RealtimeRuntime = &runtime - s.Linux.Resources.CPU.RealtimePeriod = &period - return nil - } -} - // WithAllCurrentCapabilities propagates the effective capabilities of the caller process to the container process. // The capability set may differ from WithAllKnownCapabilities when running in a container. var WithAllCurrentCapabilities = func(ctx context.Context, client Client, c *containers.Container, s *Spec) error { @@ -156,23 +76,6 @@ var WithAllKnownCapabilities = func(ctx context.Context, client Client, c *conta return WithCapabilities(caps)(ctx, client, c, s) } -// WithoutRunMount removes the `/run` inside the spec -func WithoutRunMount(ctx context.Context, client Client, c *containers.Container, s *Spec) error { - return WithoutMounts("/run")(ctx, client, c, s) -} - -// WithRdt sets the container's RDT parameters -func WithRdt(closID, l3CacheSchema, memBwSchema string) SpecOpts { - return func(ctx context.Context, _ Client, c *containers.Container, s *Spec) error { - s.Linux.IntelRdt = &specs.LinuxIntelRdt{ - ClosID: closID, - L3CacheSchema: l3CacheSchema, - MemBwSchema: memBwSchema, - } - return nil - } -} - func escapeAndCombineArgs(args []string) string { panic("not supported") } diff --git a/oci/spec_opts_nonlinux.go b/oci/spec_opts_nonlinux.go index eb6de026a..b2f796f36 100644 --- a/oci/spec_opts_nonlinux.go +++ b/oci/spec_opts_nonlinux.go @@ -20,7 +20,6 @@ package oci import ( "context" - "errors" "github.com/containerd/containerd/containers" ) @@ -35,24 +34,3 @@ var WithAllCurrentCapabilities = func(ctx context.Context, client Client, c *con var WithAllKnownCapabilities = func(ctx context.Context, client Client, c *containers.Container, s *Spec) error { return WithCapabilities(nil)(ctx, client, c, s) } - -// WithBlockIO sets the container's blkio parameters -func WithBlockIO(blockio interface{}) SpecOpts { - return func(ctx context.Context, _ Client, c *containers.Container, s *Spec) error { - return errors.New("blkio not supported") - } -} - -// WithCPUShares sets the container's cpu shares -func WithCPUShares(shares uint64) SpecOpts { - return func(ctx context.Context, _ Client, c *containers.Container, s *Spec) error { - return nil - } -} - -// WithRdt sets the container's RDT parameters -func WithRdt(closID, l3CacheSchema, memBwSchema string) SpecOpts { - return func(_ context.Context, _ Client, _ *containers.Container, _ *Spec) error { - return errors.New("RDT not supported") - } -} diff --git a/oci/spec_opts_unix.go b/oci/spec_opts_unix.go index e1c66bfaa..f4135285c 100644 --- a/oci/spec_opts_unix.go +++ b/oci/spec_opts_unix.go @@ -50,13 +50,6 @@ func WithDevices(devicePath, containerPath, permissions string) SpecOpts { } } -// WithCPUCFS sets the container's Completely fair scheduling (CFS) quota and period -func WithCPUCFS(quota int64, period uint64) SpecOpts { - return func(ctx context.Context, _ Client, c *containers.Container, s *Spec) error { - return nil - } -} - func escapeAndCombineArgs(args []string) string { panic("not supported") } diff --git a/oci/spec_opts_windows.go b/oci/spec_opts_windows.go index 94cb39312..dd925a8b1 100644 --- a/oci/spec_opts_windows.go +++ b/oci/spec_opts_windows.go @@ -21,66 +21,18 @@ import ( "errors" "strings" - "github.com/containerd/containerd/containers" - - specs "github.com/opencontainers/runtime-spec/specs-go" + "github.com/opencontainers/runtime-spec/specs-go" "golang.org/x/sys/windows" + + "github.com/containerd/containerd/containers" ) -// WithWindowsCPUCount sets the `Windows.Resources.CPU.Count` section to the -// `count` specified. -func WithWindowsCPUCount(count uint64) SpecOpts { - return func(_ context.Context, _ Client, _ *containers.Container, s *Spec) error { - setCPUWindows(s) - s.Windows.Resources.CPU.Count = &count - return nil - } -} - -// WithWindowsCPUShares sets the `Windows.Resources.CPU.Shares` section to the -// `shares` specified. -func WithWindowsCPUShares(shares uint16) SpecOpts { - return func(_ context.Context, _ Client, _ *containers.Container, s *Spec) error { - setCPUWindows(s) - s.Windows.Resources.CPU.Shares = &shares - return nil - } -} - -// WithWindowsCPUMaximum sets the `Windows.Resources.CPU.Maximum` section to the -// `max` specified. -func WithWindowsCPUMaximum(max uint16) SpecOpts { - return func(_ context.Context, _ Client, _ *containers.Container, s *Spec) error { - setCPUWindows(s) - s.Windows.Resources.CPU.Maximum = &max - return nil - } -} - -// WithWindowsIgnoreFlushesDuringBoot sets `Windows.IgnoreFlushesDuringBoot`. -func WithWindowsIgnoreFlushesDuringBoot() SpecOpts { - return func(_ context.Context, _ Client, _ *containers.Container, s *Spec) error { - if s.Windows == nil { - s.Windows = &specs.Windows{} - } - s.Windows.IgnoreFlushesDuringBoot = true - return nil - } -} - -// WithWindowNetworksAllowUnqualifiedDNSQuery sets `Windows.Network.AllowUnqualifiedDNSQuery`. -func WithWindowNetworksAllowUnqualifiedDNSQuery() SpecOpts { - return func(_ context.Context, _ Client, _ *containers.Container, s *Spec) error { - if s.Windows == nil { - s.Windows = &specs.Windows{} - } - if s.Windows.Network == nil { - s.Windows.Network = &specs.WindowsNetwork{} - } - - s.Windows.Network.AllowUnqualifiedDNSQuery = true - return nil +func escapeAndCombineArgs(args []string) string { + escaped := make([]string, len(args)) + for i, a := range args { + escaped[i] = windows.EscapeArg(a) } + return strings.Join(escaped, " ") } // WithHostDevices adds all the hosts device nodes to the container's spec @@ -93,25 +45,3 @@ func WithHostDevices(_ context.Context, _ Client, _ *containers.Container, s *Sp func DeviceFromPath(path string) (*specs.LinuxDevice, error) { return nil, errors.New("device from path not supported on Windows") } - -// WithWindowsNetworkNamespace sets the network namespace for a Windows container. -func WithWindowsNetworkNamespace(ns string) SpecOpts { - return func(_ context.Context, _ Client, _ *containers.Container, s *Spec) error { - if s.Windows == nil { - s.Windows = &specs.Windows{} - } - if s.Windows.Network == nil { - s.Windows.Network = &specs.WindowsNetwork{} - } - s.Windows.Network.NetworkNamespace = ns - return nil - } -} - -func escapeAndCombineArgs(args []string) string { - escaped := make([]string, len(args)) - for i, a := range args { - escaped[i] = windows.EscapeArg(a) - } - return strings.Join(escaped, " ") -} diff --git a/pkg/cri/opts/spec.go b/pkg/cri/opts/spec.go index 4ad1f1c14..c8d40b743 100644 --- a/pkg/cri/opts/spec.go +++ b/pkg/cri/opts/spec.go @@ -19,12 +19,15 @@ package opts import ( "context" "errors" + "fmt" "os" "path/filepath" + "sort" "strings" "github.com/containerd/containerd/containers" "github.com/containerd/containerd/oci" + "github.com/containerd/containerd/pkg/cri/util" imagespec "github.com/opencontainers/image-spec/specs-go/v1" runtimespec "github.com/opencontainers/runtime-spec/specs-go" runtime "k8s.io/cri-api/pkg/apis/runtime/v1" @@ -113,3 +116,259 @@ func WithAnnotation(k, v string) oci.SpecOpts { return nil } } + +// WithAdditionalGIDs adds any additional groups listed for a particular user in the +// /etc/groups file of the image's root filesystem to the OCI spec's additionalGids array. +func WithAdditionalGIDs(userstr string) oci.SpecOpts { + return func(ctx context.Context, client oci.Client, c *containers.Container, s *runtimespec.Spec) (err error) { + if s.Process == nil { + s.Process = &runtimespec.Process{} + } + gids := s.Process.User.AdditionalGids + if err := oci.WithAdditionalGIDs(userstr)(ctx, client, c, s); err != nil { + return err + } + // Merge existing gids and new gids. + s.Process.User.AdditionalGids = mergeGids(s.Process.User.AdditionalGids, gids) + return nil + } +} + +func mergeGids(gids1, gids2 []uint32) []uint32 { + gidsMap := make(map[uint32]struct{}) + for _, gid1 := range gids1 { + gidsMap[gid1] = struct{}{} + } + for _, gid2 := range gids2 { + gidsMap[gid2] = struct{}{} + } + var gids []uint32 + for gid := range gidsMap { + gids = append(gids, gid) + } + sort.Slice(gids, func(i, j int) bool { return gids[i] < gids[j] }) + return gids +} + +// WithoutDefaultSecuritySettings removes the default security settings generated on a spec +func WithoutDefaultSecuritySettings(_ context.Context, _ oci.Client, c *containers.Container, s *runtimespec.Spec) error { + if s.Process == nil { + s.Process = &runtimespec.Process{} + } + // Make sure no default seccomp/apparmor is specified + s.Process.ApparmorProfile = "" + if s.Linux != nil { + s.Linux.Seccomp = nil + } + // Remove default rlimits (See issue #515) + s.Process.Rlimits = nil + return nil +} + +// WithCapabilities sets the provided capabilities from the security context +func WithCapabilities(sc *runtime.LinuxContainerSecurityContext, allCaps []string) oci.SpecOpts { + capabilities := sc.GetCapabilities() + if capabilities == nil { + return nullOpt + } + + var opts []oci.SpecOpts + // Add/drop all capabilities if "all" is specified, so that + // following individual add/drop could still work. E.g. + // AddCapabilities: []string{"ALL"}, DropCapabilities: []string{"CHOWN"} + // will be all capabilities without `CAP_CHOWN`. + if util.InStringSlice(capabilities.GetAddCapabilities(), "ALL") { + opts = append(opts, oci.WithCapabilities(allCaps)) + } + if util.InStringSlice(capabilities.GetDropCapabilities(), "ALL") { + opts = append(opts, oci.WithCapabilities(nil)) + } + + var caps []string + for _, c := range capabilities.GetAddCapabilities() { + if strings.ToUpper(c) == "ALL" { + continue + } + // Capabilities in CRI doesn't have `CAP_` prefix, so add it. + caps = append(caps, "CAP_"+strings.ToUpper(c)) + } + opts = append(opts, oci.WithAddedCapabilities(caps)) + + caps = []string{} + for _, c := range capabilities.GetDropCapabilities() { + if strings.ToUpper(c) == "ALL" { + continue + } + caps = append(caps, "CAP_"+strings.ToUpper(c)) + } + opts = append(opts, oci.WithDroppedCapabilities(caps)) + return oci.Compose(opts...) +} + +func nullOpt(_ context.Context, _ oci.Client, _ *containers.Container, _ *runtimespec.Spec) error { + return nil +} + +// WithoutAmbientCaps removes the ambient caps from the spec +func WithoutAmbientCaps(_ context.Context, _ oci.Client, c *containers.Container, s *runtimespec.Spec) error { + if s.Process == nil { + s.Process = &runtimespec.Process{} + } + if s.Process.Capabilities == nil { + s.Process.Capabilities = &runtimespec.LinuxCapabilities{} + } + s.Process.Capabilities.Ambient = nil + return nil +} + +// WithDisabledCgroups clears the Cgroups Path from the spec +func WithDisabledCgroups(_ context.Context, _ oci.Client, c *containers.Container, s *runtimespec.Spec) error { + if s.Linux == nil { + s.Linux = &runtimespec.Linux{} + } + s.Linux.CgroupsPath = "" + return nil +} + +// WithSelinuxLabels sets the mount and process labels +func WithSelinuxLabels(process, mount string) oci.SpecOpts { + return func(ctx context.Context, client oci.Client, c *containers.Container, s *runtimespec.Spec) (err error) { + if s.Linux == nil { + s.Linux = &runtimespec.Linux{} + } + if s.Process == nil { + s.Process = &runtimespec.Process{} + } + s.Linux.MountLabel = mount + s.Process.SelinuxLabel = process + return nil + } +} + +// WithSysctls sets the provided sysctls onto the spec +func WithSysctls(sysctls map[string]string) oci.SpecOpts { + return func(ctx context.Context, client oci.Client, c *containers.Container, s *runtimespec.Spec) error { + if s.Linux == nil { + s.Linux = &runtimespec.Linux{} + } + if s.Linux.Sysctl == nil { + s.Linux.Sysctl = make(map[string]string) + } + for k, v := range sysctls { + s.Linux.Sysctl[k] = v + } + return nil + } +} + +// WithSupplementalGroups sets the supplemental groups for the process +func WithSupplementalGroups(groups []int64) oci.SpecOpts { + return func(ctx context.Context, client oci.Client, c *containers.Container, s *runtimespec.Spec) error { + if s.Process == nil { + s.Process = &runtimespec.Process{} + } + var guids []uint32 + for _, g := range groups { + guids = append(guids, uint32(g)) + } + s.Process.User.AdditionalGids = mergeGids(s.Process.User.AdditionalGids, guids) + return nil + } +} + +// WithDefaultSandboxShares sets the default sandbox CPU shares +func WithDefaultSandboxShares(ctx context.Context, client oci.Client, c *containers.Container, s *runtimespec.Spec) error { + if s.Linux == nil { + s.Linux = &runtimespec.Linux{} + } + if s.Linux.Resources == nil { + s.Linux.Resources = &runtimespec.LinuxResources{} + } + if s.Linux.Resources.CPU == nil { + s.Linux.Resources.CPU = &runtimespec.LinuxCPU{} + } + i := uint64(DefaultSandboxCPUshares) + s.Linux.Resources.CPU.Shares = &i + return nil +} + +// WithoutNamespace removes the provided namespace +func WithoutNamespace(t runtimespec.LinuxNamespaceType) oci.SpecOpts { + return func(ctx context.Context, client oci.Client, c *containers.Container, s *runtimespec.Spec) error { + if s.Linux == nil { + return nil + } + var namespaces []runtimespec.LinuxNamespace + for i, ns := range s.Linux.Namespaces { + if ns.Type != t { + namespaces = append(namespaces, s.Linux.Namespaces[i]) + } + } + s.Linux.Namespaces = namespaces + return nil + } +} + +// WithPodNamespaces sets the pod namespaces for the container +func WithPodNamespaces(config *runtime.LinuxContainerSecurityContext, sandboxPid uint32, targetPid uint32, uids, gids []runtimespec.LinuxIDMapping) oci.SpecOpts { + namespaces := config.GetNamespaceOptions() + + opts := []oci.SpecOpts{ + oci.WithLinuxNamespace(runtimespec.LinuxNamespace{Type: runtimespec.NetworkNamespace, Path: GetNetworkNamespace(sandboxPid)}), + oci.WithLinuxNamespace(runtimespec.LinuxNamespace{Type: runtimespec.IPCNamespace, Path: GetIPCNamespace(sandboxPid)}), + oci.WithLinuxNamespace(runtimespec.LinuxNamespace{Type: runtimespec.UTSNamespace, Path: GetUTSNamespace(sandboxPid)}), + } + if namespaces.GetPid() != runtime.NamespaceMode_CONTAINER { + opts = append(opts, oci.WithLinuxNamespace(runtimespec.LinuxNamespace{Type: runtimespec.PIDNamespace, Path: GetPIDNamespace(targetPid)})) + } + + if namespaces.GetUsernsOptions() != nil { + switch namespaces.GetUsernsOptions().GetMode() { + case runtime.NamespaceMode_NODE: + // Nothing to do. Not adding userns field uses the node userns. + case runtime.NamespaceMode_POD: + opts = append(opts, oci.WithLinuxNamespace(runtimespec.LinuxNamespace{Type: runtimespec.UserNamespace, Path: GetUserNamespace(sandboxPid)})) + opts = append(opts, oci.WithUserNamespace(uids, gids)) + } + } + + return oci.Compose(opts...) +} + +const ( + // netNSFormat is the format of network namespace of a process. + netNSFormat = "/proc/%v/ns/net" + // ipcNSFormat is the format of ipc namespace of a process. + ipcNSFormat = "/proc/%v/ns/ipc" + // utsNSFormat is the format of uts namespace of a process. + utsNSFormat = "/proc/%v/ns/uts" + // pidNSFormat is the format of pid namespace of a process. + pidNSFormat = "/proc/%v/ns/pid" + // userNSFormat is the format of user namespace of a process. + userNSFormat = "/proc/%v/ns/user" +) + +// GetNetworkNamespace returns the network namespace of a process. +func GetNetworkNamespace(pid uint32) string { + return fmt.Sprintf(netNSFormat, pid) +} + +// GetIPCNamespace returns the ipc namespace of a process. +func GetIPCNamespace(pid uint32) string { + return fmt.Sprintf(ipcNSFormat, pid) +} + +// GetUTSNamespace returns the uts namespace of a process. +func GetUTSNamespace(pid uint32) string { + return fmt.Sprintf(utsNSFormat, pid) +} + +// GetPIDNamespace returns the pid namespace of a process. +func GetPIDNamespace(pid uint32) string { + return fmt.Sprintf(pidNSFormat, pid) +} + +// GetUserNamespace returns the user namespace of a process. +func GetUserNamespace(pid uint32) string { + return fmt.Sprintf(userNSFormat, pid) +} diff --git a/pkg/cri/opts/spec_linux.go b/pkg/cri/opts/spec_linux.go index d627da34c..8b94cc5b7 100644 --- a/pkg/cri/opts/spec_linux.go +++ b/pkg/cri/opts/spec_linux.go @@ -35,64 +35,14 @@ import ( "github.com/containerd/containerd/log" "github.com/containerd/containerd/mount" "github.com/containerd/containerd/oci" + osinterface "github.com/containerd/containerd/pkg/os" runtimespec "github.com/opencontainers/runtime-spec/specs-go" "github.com/opencontainers/selinux/go-selinux/label" "github.com/sirupsen/logrus" "golang.org/x/sys/unix" runtime "k8s.io/cri-api/pkg/apis/runtime/v1" - - "github.com/containerd/containerd/pkg/cri/util" - osinterface "github.com/containerd/containerd/pkg/os" ) -// WithAdditionalGIDs adds any additional groups listed for a particular user in the -// /etc/groups file of the image's root filesystem to the OCI spec's additionalGids array. -func WithAdditionalGIDs(userstr string) oci.SpecOpts { - return func(ctx context.Context, client oci.Client, c *containers.Container, s *runtimespec.Spec) (err error) { - if s.Process == nil { - s.Process = &runtimespec.Process{} - } - gids := s.Process.User.AdditionalGids - if err := oci.WithAdditionalGIDs(userstr)(ctx, client, c, s); err != nil { - return err - } - // Merge existing gids and new gids. - s.Process.User.AdditionalGids = mergeGids(s.Process.User.AdditionalGids, gids) - return nil - } -} - -func mergeGids(gids1, gids2 []uint32) []uint32 { - gidsMap := make(map[uint32]struct{}) - for _, gid1 := range gids1 { - gidsMap[gid1] = struct{}{} - } - for _, gid2 := range gids2 { - gidsMap[gid2] = struct{}{} - } - var gids []uint32 - for gid := range gidsMap { - gids = append(gids, gid) - } - sort.Slice(gids, func(i, j int) bool { return gids[i] < gids[j] }) - return gids -} - -// WithoutDefaultSecuritySettings removes the default security settings generated on a spec -func WithoutDefaultSecuritySettings(_ context.Context, _ oci.Client, c *containers.Container, s *runtimespec.Spec) error { - if s.Process == nil { - s.Process = &runtimespec.Process{} - } - // Make sure no default seccomp/apparmor is specified - s.Process.ApparmorProfile = "" - if s.Linux != nil { - s.Linux.Seccomp = nil - } - // Remove default rlimits (See issue #515) - s.Process.Rlimits = nil - return nil -} - // WithMounts sorts and adds runtime and CRI mounts to the spec func WithMounts(osi osinterface.OS, config *runtime.ContainerConfig, extra []*runtime.Mount, mountLabel string) oci.SpecOpts { return func(ctx context.Context, client oci.Client, _ *containers.Container, s *runtimespec.Spec) (err error) { @@ -330,82 +280,6 @@ func WithDevices(osi osinterface.OS, config *runtime.ContainerConfig, enableDevi } } -// WithCapabilities sets the provided capabilities from the security context -func WithCapabilities(sc *runtime.LinuxContainerSecurityContext, allCaps []string) oci.SpecOpts { - capabilities := sc.GetCapabilities() - if capabilities == nil { - return nullOpt - } - - var opts []oci.SpecOpts - // Add/drop all capabilities if "all" is specified, so that - // following individual add/drop could still work. E.g. - // AddCapabilities: []string{"ALL"}, DropCapabilities: []string{"CHOWN"} - // will be all capabilities without `CAP_CHOWN`. - if util.InStringSlice(capabilities.GetAddCapabilities(), "ALL") { - opts = append(opts, oci.WithCapabilities(allCaps)) - } - if util.InStringSlice(capabilities.GetDropCapabilities(), "ALL") { - opts = append(opts, oci.WithCapabilities(nil)) - } - - var caps []string - for _, c := range capabilities.GetAddCapabilities() { - if strings.ToUpper(c) == "ALL" { - continue - } - // Capabilities in CRI doesn't have `CAP_` prefix, so add it. - caps = append(caps, "CAP_"+strings.ToUpper(c)) - } - opts = append(opts, oci.WithAddedCapabilities(caps)) - - caps = []string{} - for _, c := range capabilities.GetDropCapabilities() { - if strings.ToUpper(c) == "ALL" { - continue - } - caps = append(caps, "CAP_"+strings.ToUpper(c)) - } - opts = append(opts, oci.WithDroppedCapabilities(caps)) - return oci.Compose(opts...) -} - -// WithoutAmbientCaps removes the ambient caps from the spec -func WithoutAmbientCaps(_ context.Context, _ oci.Client, c *containers.Container, s *runtimespec.Spec) error { - if s.Process == nil { - s.Process = &runtimespec.Process{} - } - if s.Process.Capabilities == nil { - s.Process.Capabilities = &runtimespec.LinuxCapabilities{} - } - s.Process.Capabilities.Ambient = nil - return nil -} - -// WithDisabledCgroups clears the Cgroups Path from the spec -func WithDisabledCgroups(_ context.Context, _ oci.Client, c *containers.Container, s *runtimespec.Spec) error { - if s.Linux == nil { - s.Linux = &runtimespec.Linux{} - } - s.Linux.CgroupsPath = "" - return nil -} - -// WithSelinuxLabels sets the mount and process labels -func WithSelinuxLabels(process, mount string) oci.SpecOpts { - return func(ctx context.Context, client oci.Client, c *containers.Container, s *runtimespec.Spec) (err error) { - if s.Linux == nil { - s.Linux = &runtimespec.Linux{} - } - if s.Process == nil { - s.Process = &runtimespec.Process{} - } - s.Linux.MountLabel = mount - s.Process.SelinuxLabel = process - return nil - } -} - var ( swapControllerAvailability bool swapControllerAvailabilityOnce sync.Once @@ -612,22 +486,6 @@ func WithOOMScoreAdj(config *runtime.ContainerConfig, restrict bool) oci.SpecOpt } } -// WithSysctls sets the provided sysctls onto the spec -func WithSysctls(sysctls map[string]string) oci.SpecOpts { - return func(ctx context.Context, client oci.Client, c *containers.Container, s *runtimespec.Spec) error { - if s.Linux == nil { - s.Linux = &runtimespec.Linux{} - } - if s.Linux.Sysctl == nil { - s.Linux.Sysctl = make(map[string]string) - } - for k, v := range sysctls { - s.Linux.Sysctl[k] = v - } - return nil - } -} - // WithPodOOMScoreAdj sets the oom score for the pod sandbox func WithPodOOMScoreAdj(adj int, restrict bool) oci.SpecOpts { return func(ctx context.Context, client oci.Client, c *containers.Container, s *runtimespec.Spec) error { @@ -646,84 +504,6 @@ func WithPodOOMScoreAdj(adj int, restrict bool) oci.SpecOpts { } } -// WithSupplementalGroups sets the supplemental groups for the process -func WithSupplementalGroups(groups []int64) oci.SpecOpts { - return func(ctx context.Context, client oci.Client, c *containers.Container, s *runtimespec.Spec) error { - if s.Process == nil { - s.Process = &runtimespec.Process{} - } - var guids []uint32 - for _, g := range groups { - guids = append(guids, uint32(g)) - } - s.Process.User.AdditionalGids = mergeGids(s.Process.User.AdditionalGids, guids) - return nil - } -} - -// WithPodNamespaces sets the pod namespaces for the container -func WithPodNamespaces(config *runtime.LinuxContainerSecurityContext, sandboxPid uint32, targetPid uint32, uids, gids []runtimespec.LinuxIDMapping) oci.SpecOpts { - namespaces := config.GetNamespaceOptions() - - opts := []oci.SpecOpts{ - oci.WithLinuxNamespace(runtimespec.LinuxNamespace{Type: runtimespec.NetworkNamespace, Path: GetNetworkNamespace(sandboxPid)}), - oci.WithLinuxNamespace(runtimespec.LinuxNamespace{Type: runtimespec.IPCNamespace, Path: GetIPCNamespace(sandboxPid)}), - oci.WithLinuxNamespace(runtimespec.LinuxNamespace{Type: runtimespec.UTSNamespace, Path: GetUTSNamespace(sandboxPid)}), - } - if namespaces.GetPid() != runtime.NamespaceMode_CONTAINER { - opts = append(opts, oci.WithLinuxNamespace(runtimespec.LinuxNamespace{Type: runtimespec.PIDNamespace, Path: GetPIDNamespace(targetPid)})) - } - - if namespaces.GetUsernsOptions() != nil { - switch namespaces.GetUsernsOptions().GetMode() { - case runtime.NamespaceMode_NODE: - // Nothing to do. Not adding userns field uses the node userns. - case runtime.NamespaceMode_POD: - opts = append(opts, oci.WithLinuxNamespace(runtimespec.LinuxNamespace{Type: runtimespec.UserNamespace, Path: GetUserNamespace(sandboxPid)})) - opts = append(opts, oci.WithUserNamespace(uids, gids)) - } - } - - return oci.Compose(opts...) -} - -// WithDefaultSandboxShares sets the default sandbox CPU shares -func WithDefaultSandboxShares(ctx context.Context, client oci.Client, c *containers.Container, s *runtimespec.Spec) error { - if s.Linux == nil { - s.Linux = &runtimespec.Linux{} - } - if s.Linux.Resources == nil { - s.Linux.Resources = &runtimespec.LinuxResources{} - } - if s.Linux.Resources.CPU == nil { - s.Linux.Resources.CPU = &runtimespec.LinuxCPU{} - } - i := uint64(DefaultSandboxCPUshares) - s.Linux.Resources.CPU.Shares = &i - return nil -} - -// WithoutNamespace removes the provided namespace -func WithoutNamespace(t runtimespec.LinuxNamespaceType) oci.SpecOpts { - return func(ctx context.Context, client oci.Client, c *containers.Container, s *runtimespec.Spec) error { - if s.Linux == nil { - return nil - } - var namespaces []runtimespec.LinuxNamespace - for i, ns := range s.Linux.Namespaces { - if ns.Type != t { - namespaces = append(namespaces, s.Linux.Namespaces[i]) - } - } - s.Linux.Namespaces = namespaces - return nil - } -} - -func nullOpt(_ context.Context, _ oci.Client, _ *containers.Container, _ *runtimespec.Spec) error { - return nil -} - func getCurrentOOMScoreAdj() (int, error) { b, err := os.ReadFile("/proc/self/oom_score_adj") if err != nil { @@ -748,44 +528,6 @@ func restrictOOMScoreAdj(preferredOOMScoreAdj int) (int, error) { return preferredOOMScoreAdj, nil } -const ( - // netNSFormat is the format of network namespace of a process. - netNSFormat = "/proc/%v/ns/net" - // ipcNSFormat is the format of ipc namespace of a process. - ipcNSFormat = "/proc/%v/ns/ipc" - // utsNSFormat is the format of uts namespace of a process. - utsNSFormat = "/proc/%v/ns/uts" - // pidNSFormat is the format of pid namespace of a process. - pidNSFormat = "/proc/%v/ns/pid" - // userNSFormat is the format of user namespace of a process. - userNSFormat = "/proc/%v/ns/user" -) - -// GetNetworkNamespace returns the network namespace of a process. -func GetNetworkNamespace(pid uint32) string { - return fmt.Sprintf(netNSFormat, pid) -} - -// GetIPCNamespace returns the ipc namespace of a process. -func GetIPCNamespace(pid uint32) string { - return fmt.Sprintf(ipcNSFormat, pid) -} - -// GetUTSNamespace returns the uts namespace of a process. -func GetUTSNamespace(pid uint32) string { - return fmt.Sprintf(utsNSFormat, pid) -} - -// GetPIDNamespace returns the pid namespace of a process. -func GetPIDNamespace(pid uint32) string { - return fmt.Sprintf(pidNSFormat, pid) -} - -// GetUserNamespace returns the user namespace of a process. -func GetUserNamespace(pid uint32) string { - return fmt.Sprintf(userNSFormat, pid) -} - // WithCDI updates OCI spec with CDI content func WithCDI(annotations map[string]string) oci.SpecOpts { return func(ctx context.Context, _ oci.Client, c *containers.Container, s *oci.Spec) error {