From 36f520dc04259debc7b8f19f5574db2a6054abf6 Mon Sep 17 00:00:00 2001 From: Rodrigo Campos Date: Thu, 15 Dec 2022 14:31:36 -0300 Subject: [PATCH 1/4] Let OCI runtime create netns when userns is used As explained in the comments, this patch lets the OCI runtime create the netns when userns are in use. This is needed because the netns needs to be owned by the userns (otherwise can't modify the IP, etc.). Before this patch, we are creating the netns and then starting the pod sandbox asking to join this netns. This can never work with userns, as the userns needs to be created first for the netns ownership to be correct. One option would be to also create the userns in containerd, then create the netns. But this is painful (needs tricks with the go runtime, special care to write the mapping, etc.). So, we just let the OCI runtime create the userns and netns, that creates them with the proper ownership. As requested by Mike Brown, the current code when userns is not used is left unchanged. We can unify the cases (with and without userns) in a future release.
Signed-off-by: Rodrigo Campos --- pkg/cri/server/sandbox_run.go | 104 +++++++++++++++++++++++++++++++++- pkg/netns/netns_linux.go | 25 +++++++- pkg/netns/netns_other.go | 5 ++ pkg/netns/netns_windows.go | 15 ++++- 4 files changed, 144 insertions(+), 5 deletions(-) diff --git a/pkg/cri/server/sandbox_run.go b/pkg/cri/server/sandbox_run.go index 4c1365f47..b7ed6c150 100644 --- a/pkg/cri/server/sandbox_run.go +++ b/pkg/cri/server/sandbox_run.go @@ -23,6 +23,7 @@ import ( "fmt" "math" "path/filepath" + goruntime "runtime" "strings" "time" @@ -244,8 +245,27 @@ func (c *criService) RunPodSandbox(ctx context.Context, r *runtime.RunPodSandbox return nil, fmt.Errorf("failed to get sandbox container info: %w", err) } + userNsEnabled := false + if goruntime.GOOS != "windows" { + usernsOpts := config.GetLinux().GetSecurityContext().GetNamespaceOptions().GetUsernsOptions() + if usernsOpts != nil && usernsOpts.GetMode() == runtime.NamespaceMode_POD { + userNsEnabled = true + } + } + // Setup the network namespace if host networking wasn't requested. - if !hostNetwork(config) { + if !hostNetwork(config) && !userNsEnabled { + // XXX: We do c&p of this code later for the podNetwork && userNsEnabled case too. + // We can't move this to a function, as the defer calls need to be executed if other + // errors are returned in this function. So, we would need more refactors to move + // this code to a function and the idea was to not change the current code for + // !userNsEnabled case, therefore doing it would defeat the purpose. + // + // The difference between the cases is the use of netns.NewNetNS() vs + // netns.NewNetNSFromPID() and we verify the task is still running in the other case. + // + // To simplify this, in the future, we should just remove this case (podNetwork && + // !userNsEnabled) and just keep the other case (podNetwork && userNsEnabled). 
netStart := time.Now() // If it is not in host network namespace then create a namespace and set the sandbox @@ -353,6 +373,88 @@ func (c *criService) RunPodSandbox(ctx context.Context, r *runtime.RunPodSandbox return nil, fmt.Errorf("failed to wait for sandbox container task: %w", err) } + if !hostNetwork(config) && userNsEnabled { + // If userns is enabled, then the netns was created by the OCI runtime + // when creating "task". The OCI runtime needs to create the netns + // because, if userns is in use, the netns needs to be owned by the + // userns. So, let the OCI runtime just handle this for us. + // If the netns is not owned by the userns several problems will happen. + // For instance, the container will lack permission (even if + // capabilities are present) to modify the netns or, even worse, the OCI + // runtime will fail to mount sysfs: + // https://github.com/torvalds/linux/commit/7dc5dbc879bd0779924b5132a48b731a0bc04a1e#diff-4839664cd0c8eab716e064323c7cd71fR1164 + netStart := time.Now() + + // If it is not in host network namespace then create a namespace and set the sandbox + // handle. NetNSPath in sandbox metadata and NetNS is non empty only for non host network + // namespaces. If the pod is in host network namespace then both are empty and should not + // be used. + var netnsMountDir = "/var/run/netns" + if c.config.NetNSMountsUnderStateDir { + netnsMountDir = filepath.Join(c.config.StateDir, "netns") + } + sandbox.NetNS, err = netns.NewNetNSFromPID(netnsMountDir, task.Pid()) + if err != nil { + return nil, fmt.Errorf("failed to create network namespace for sandbox %q: %w", id, err) + } + + // Verify task is still in created state. 
+ if st, err := task.Status(ctx); err != nil || st.Status != containerd.Created { + return nil, fmt.Errorf("failed to create pod sandbox %q: err is %v - status is %q and is expected %q", id, err, st.Status, containerd.Created) + } + sandbox.NetNSPath = sandbox.NetNS.GetPath() + + defer func() { + // Remove the network namespace only if all the resource cleanup is done. + if retErr != nil && cleanupErr == nil { + if cleanupErr = sandbox.NetNS.Remove(); cleanupErr != nil { + log.G(ctx).WithError(cleanupErr).Errorf("Failed to remove network namespace %s for sandbox %q", sandbox.NetNSPath, id) + return + } + sandbox.NetNSPath = "" + } + }() + + // Update network namespace in the container's spec + c.updateNetNamespacePath(spec, sandbox.NetNSPath) + + if err := container.Update(ctx, + // Update spec of the container + containerd.UpdateContainerOpts(containerd.WithSpec(spec)), + // Update sandbox metadata to include NetNS info + containerd.UpdateContainerOpts(containerd.WithContainerExtension(sandboxMetadataExtension, &sandbox.Metadata))); err != nil { + return nil, fmt.Errorf("failed to update the network namespace for the sandbox container %q: %w", id, err) + } + + // Define this defer to teardownPodNetwork prior to the setupPodNetwork function call. + // This is because in setupPodNetwork the resource is allocated even if it returns error, unlike other resource creation functions. + defer func() { + // Teardown the network only if all the resource cleanup is done. + if retErr != nil && cleanupErr == nil { + deferCtx, deferCancel := ctrdutil.DeferContext() + defer deferCancel() + // Teardown network if an error is returned. + if cleanupErr = c.teardownPodNetwork(deferCtx, sandbox); cleanupErr != nil { + log.G(ctx).WithError(cleanupErr).Errorf("Failed to destroy network for sandbox %q", id) + } + } + }() + + // Setup network for sandbox. 
+ // Certain VM based solutions like clear containers (Issue containerd/cri-containerd#524) + // rely on the assumption that CRI shim will not be querying the network namespace to check the + // network states such as IP. + // In future runtime implementation should avoid relying on CRI shim implementation details. + // In this case however caching the IP will add a subtle performance enhancement by avoiding + // calls to network namespace of the pod to query the IP of the veth interface on every + // SandboxStatus request. + if err := c.setupPodNetwork(ctx, &sandbox); err != nil { + return nil, fmt.Errorf("failed to setup network for sandbox %q: %w", id, err) + } + + sandboxCreateNetworkTimer.UpdateSince(netStart) + } + if c.nri.isEnabled() { err = c.nri.runPodSandbox(ctx, &sandbox) if err != nil { diff --git a/pkg/netns/netns_linux.go b/pkg/netns/netns_linux.go index 03f68a568..6e1a61f72 100644 --- a/pkg/netns/netns_linux.go +++ b/pkg/netns/netns_linux.go @@ -50,7 +50,9 @@ import ( // newNS creates a new persistent (bind-mounted) network namespace and returns the // path to the network namespace. -func newNS(baseDir string) (nsPath string, err error) { +// If pid is not 0, returns the netns from that pid persistently mounted. Otherwise, +// a new netns is created. +func newNS(baseDir string, pid uint32) (nsPath string, err error) { b := make([]byte, 16) _, err = rand.Read(b) @@ -81,6 +83,16 @@ func newNS(baseDir string) (nsPath string, err error) { } }() + if pid != 0 { + procNsPath := getNetNSPathFromPID(pid) + // bind mount the netns onto the mount point. This causes the namespace + // to persist, even when there are no threads in the ns. 
+ if err = unix.Mount(procNsPath, nsPath, "none", unix.MS_BIND, ""); err != nil { + return "", fmt.Errorf("failed to bind mount ns src: %v at %s: %w", procNsPath, nsPath, err) + } + return nsPath, nil + } + var wg sync.WaitGroup wg.Add(1) @@ -155,6 +167,10 @@ func getCurrentThreadNetNSPath() string { return fmt.Sprintf("/proc/%d/task/%d/ns/net", os.Getpid(), unix.Gettid()) } +func getNetNSPathFromPID(pid uint32) string { + return fmt.Sprintf("/proc/%d/ns/net", pid) +} + // NetNS holds network namespace. type NetNS struct { path string @@ -162,7 +178,12 @@ type NetNS struct { // NewNetNS creates a network namespace. func NewNetNS(baseDir string) (*NetNS, error) { - path, err := newNS(baseDir) + return NewNetNSFromPID(baseDir, 0) +} + +// NewNetNSFromPID returns the netns from pid or a new netns if pid is 0. +func NewNetNSFromPID(baseDir string, pid uint32) (*NetNS, error) { + path, err := newNS(baseDir, pid) if err != nil { return nil, fmt.Errorf("failed to setup netns: %w", err) } diff --git a/pkg/netns/netns_other.go b/pkg/netns/netns_other.go index ec8124ceb..3cd60ef3f 100644 --- a/pkg/netns/netns_other.go +++ b/pkg/netns/netns_other.go @@ -35,6 +35,11 @@ func NewNetNS(baseDir string) (*NetNS, error) { return nil, errNotImplementedOnUnix } +// NewNetNSFromPID returns the netns from pid or a new netns if pid is 0. +func NewNetNSFromPID(baseDir string, pid uint32) (*NetNS, error) { + return nil, errNotImplementedOnUnix +} + // LoadNetNS loads existing network namespace.
func LoadNetNS(path string) *NetNS { return &NetNS{path: path} diff --git a/pkg/netns/netns_windows.go b/pkg/netns/netns_windows.go index de02094b8..2d26d6f71 100644 --- a/pkg/netns/netns_windows.go +++ b/pkg/netns/netns_windows.go @@ -16,14 +16,20 @@ package netns -import "github.com/Microsoft/hcsshim/hcn" +import ( + "errors" + + "github.com/Microsoft/hcsshim/hcn" +) + +var errNotImplementedOnWindows = errors.New("not implemented on windows") // NetNS holds network namespace for sandbox type NetNS struct { path string } -// NewNetNS creates a network namespace for the sandbox +// NewNetNS creates a network namespace for the sandbox. func NewNetNS(baseDir string) (*NetNS, error) { temp := hcn.HostComputeNamespace{} hcnNamespace, err := temp.Create() @@ -34,6 +40,11 @@ func NewNetNS(baseDir string) (*NetNS, error) { return &NetNS{path: hcnNamespace.Id}, nil } +// NewNetNSFromPID returns the netns from pid or a new netns if pid is 0. +func NewNetNSFromPID(baseDir string, pid uint32) (*NetNS, error) { + return nil, errNotImplementedOnWindows +} + // LoadNetNS loads existing network namespace. func LoadNetNS(path string) *NetNS { return &NetNS{path: path} From 31a6449734f167b465a000e87e6cad2dd2a0abc5 Mon Sep 17 00:00:00 2001 From: David Leadbeater Date: Fri, 19 Aug 2022 05:22:44 +0000 Subject: [PATCH 2/4] Add capability for snapshotters to declare support for UID remapping This allows user namespace support to progress, either by allowing snapshotters to deal with ownership, or falling back to containerd doing a recursive chown. In the future, when snapshotters implement idmap mounts, they should report the "remap-ids" capability.
Co-authored-by: Rodrigo Campos Signed-off-by: Rodrigo Campos Signed-off-by: David Leadbeater --- client.go | 18 +++++ container_opts.go | 10 +++ pkg/cri/server/container_create.go | 5 +- pkg/cri/server/container_create_linux.go | 5 +- pkg/cri/server/container_create_other.go | 4 +- pkg/cri/server/container_create_windows.go | 4 +- pkg/cri/server/helpers_linux.go | 91 ++++++++++++++++++++++ pkg/cri/server/sandbox_run.go | 11 ++- pkg/cri/server/sandbox_run_linux.go | 8 ++ pkg/cri/server/sandbox_run_other.go | 7 ++ pkg/cri/server/sandbox_run_windows.go | 6 ++ snapshots/snapshotter.go | 5 ++ snapshotter_opts_unix.go | 81 ++++++++++++++++++- snapshotter_opts_windows.go | 27 +++++++ 14 files changed, 270 insertions(+), 12 deletions(-) create mode 100644 snapshotter_opts_windows.go diff --git a/client.go b/client.go index 3786f8b57..4133476b1 100644 --- a/client.go +++ b/client.go @@ -866,3 +866,21 @@ func toPlatforms(pt []*apitypes.Platform) []ocispec.Platform { } return platforms } + +// GetSnapshotterCapabilities returns the capabilities of a snapshotter. +func (c *Client) GetSnapshotterCapabilities(ctx context.Context, snapshotterName string) ([]string, error) { + filters := []string{fmt.Sprintf("type==%s, id==%s", plugin.SnapshotPlugin, snapshotterName)} + in := c.IntrospectionService() + + resp, err := in.Plugins(ctx, filters) + if err != nil { + return nil, err + } + + if len(resp.Plugins) <= 0 { + return nil, fmt.Errorf("inspection service could not find snapshotter %s plugin", snapshotterName) + } + + sn := resp.Plugins[0] + return sn.Capabilities, nil +} diff --git a/container_opts.go b/container_opts.go index cf41d1aab..0719ed293 100644 --- a/container_opts.go +++ b/container_opts.go @@ -224,6 +224,11 @@ func WithNewSnapshot(id string, i Image, opts ...snapshots.Opt) NewContainerOpts if err != nil { return err } + + parent, err = resolveSnapshotOptions(ctx, client, c.Snapshotter, s, parent, opts...) 
+ if err != nil { + return err + } if _, err := s.Prepare(ctx, id, parent, opts...); err != nil { return err } @@ -268,6 +273,11 @@ func WithNewSnapshotView(id string, i Image, opts ...snapshots.Opt) NewContainer if err != nil { return err } + + parent, err = resolveSnapshotOptions(ctx, client, c.Snapshotter, s, parent, opts...) + if err != nil { + return err + } if _, err := s.View(ctx, id, parent, opts...); err != nil { return err } diff --git a/pkg/cri/server/container_create.go b/pkg/cri/server/container_create.go index 72c4a6df1..ea4d5a02a 100644 --- a/pkg/cri/server/container_create.go +++ b/pkg/cri/server/container_create.go @@ -184,7 +184,10 @@ func (c *criService) CreateContainer(ctx context.Context, r *runtime.CreateConta log.G(ctx).Debugf("Container %q spec: %#+v", id, spew.NewFormatter(spec)) // Grab any platform specific snapshotter opts. - sOpts := snapshotterOpts(c.config.ContainerdConfig.Snapshotter, config) + sOpts, err := snapshotterOpts(c.config.ContainerdConfig.Snapshotter, config) + if err != nil { + return nil, err + } // Set snapshotter before any other options. 
opts := []containerd.NewContainerOpts{ diff --git a/pkg/cri/server/container_create_linux.go b/pkg/cri/server/container_create_linux.go index a74bf5d9a..93e7469ab 100644 --- a/pkg/cri/server/container_create_linux.go +++ b/pkg/cri/server/container_create_linux.go @@ -601,6 +601,7 @@ func generateUserString(username string, uid, gid *runtime.Int64Value) (string, } // snapshotterOpts returns any Linux specific snapshotter options for the rootfs snapshot -func snapshotterOpts(snapshotterName string, config *runtime.ContainerConfig) []snapshots.Opt { - return []snapshots.Opt{} +func snapshotterOpts(snapshotterName string, config *runtime.ContainerConfig) ([]snapshots.Opt, error) { + nsOpts := config.GetLinux().GetSecurityContext().GetNamespaceOptions() + return snapshotterRemapOpts(nsOpts) } diff --git a/pkg/cri/server/container_create_other.go b/pkg/cri/server/container_create_other.go index 9cfb15a04..acab67c11 100644 --- a/pkg/cri/server/container_create_other.go +++ b/pkg/cri/server/container_create_other.go @@ -55,6 +55,6 @@ func (c *criService) containerSpecOpts(config *runtime.ContainerConfig, imageCon } // snapshotterOpts returns snapshotter options for the rootfs snapshot -func snapshotterOpts(snapshotterName string, config *runtime.ContainerConfig) []snapshots.Opt { - return []snapshots.Opt{} +func snapshotterOpts(snapshotterName string, config *runtime.ContainerConfig) ([]snapshots.Opt, error) { + return []snapshots.Opt{}, nil } diff --git a/pkg/cri/server/container_create_windows.go b/pkg/cri/server/container_create_windows.go index bd7ed0fa5..e11466545 100644 --- a/pkg/cri/server/container_create_windows.go +++ b/pkg/cri/server/container_create_windows.go @@ -145,7 +145,7 @@ func (c *criService) containerSpecOpts(config *runtime.ContainerConfig, imageCon } // snapshotterOpts returns any Windows specific snapshotter options for the r/w layer -func snapshotterOpts(snapshotterName string, config *runtime.ContainerConfig) []snapshots.Opt { +func 
snapshotterOpts(snapshotterName string, config *runtime.ContainerConfig) ([]snapshots.Opt, error) { var opts []snapshots.Opt switch snapshotterName { @@ -160,5 +160,5 @@ func snapshotterOpts(snapshotterName string, config *runtime.ContainerConfig) [] } } - return opts + return opts, nil } diff --git a/pkg/cri/server/helpers_linux.go b/pkg/cri/server/helpers_linux.go index 42b2d99a1..d0cb13006 100644 --- a/pkg/cri/server/helpers_linux.go +++ b/pkg/cri/server/helpers_linux.go @@ -28,11 +28,13 @@ import ( "syscall" "time" + "github.com/containerd/containerd" "github.com/containerd/containerd/log" "github.com/containerd/containerd/mount" "github.com/containerd/containerd/pkg/apparmor" "github.com/containerd/containerd/pkg/seccomp" "github.com/containerd/containerd/pkg/seutil" + "github.com/containerd/containerd/snapshots" "github.com/moby/sys/mountinfo" "github.com/opencontainers/runtime-spec/specs-go" "github.com/opencontainers/selinux/go-selinux/label" @@ -275,3 +277,92 @@ func modifyProcessLabel(runtimeType string, spec *specs.Spec) error { spec.Process.SelinuxLabel = l return nil } + +func parseUsernsIDMap(runtimeIDMap []*runtime.IDMapping) ([]specs.LinuxIDMapping, error) { + var m []specs.LinuxIDMapping + + if len(runtimeIDMap) == 0 { + return m, nil + } + + if len(runtimeIDMap) > 1 { + // We only accept 1 line, because containerd.WithRemappedSnapshot() only supports that. + return m, fmt.Errorf("only one mapping line supported, got %v mapping lines", len(runtimeIDMap)) + } + + // We know len is 1 now. 
+ if runtimeIDMap[0] == nil { + return m, nil + } + uidMap := *runtimeIDMap[0] + + if uidMap.Length < 1 { + return m, fmt.Errorf("invalid mapping length: %v", uidMap.Length) + } + + m = []specs.LinuxIDMapping{ + { + ContainerID: uidMap.ContainerId, + HostID: uidMap.HostId, + Size: uidMap.Length, + }, + } + + return m, nil +} + +func parseUsernsIDs(userns *runtime.UserNamespace) (uids, gids []specs.LinuxIDMapping, retErr error) { + if userns == nil { + // If userns is not set, the kubelet doesn't support this option + // and we should just fallback to no userns. This is completely + // valid. + return nil, nil, nil + } + + uidRuntimeMap := userns.GetUids() + gidRuntimeMap := userns.GetGids() + + uids, err := parseUsernsIDMap(uidRuntimeMap) + if err != nil { + return nil, nil, fmt.Errorf("UID mapping: %w", err) + } + + gids, err = parseUsernsIDMap(gidRuntimeMap) + if err != nil { + return nil, nil, fmt.Errorf("GID mapping: %w", err) + } + + switch mode := userns.GetMode(); mode { + case runtime.NamespaceMode_NODE: + if len(uids) != 0 || len(gids) != 0 { + return nil, nil, fmt.Errorf("can't use user namespace mode %q with mappings. Got %v UID mappings and %v GID mappings", mode, len(uids), len(gids)) + } + case runtime.NamespaceMode_POD: + // This is valid, we will handle it in WithPodNamespaces(). 
+ if len(uids) == 0 || len(gids) == 0 { + return nil, nil, fmt.Errorf("can't use user namespace mode %q without UID and GID mappings", mode) + } + default: + return nil, nil, fmt.Errorf("unsupported user namespace mode: %q", mode) + } + + return uids, gids, nil +} + +func snapshotterRemapOpts(nsOpts *runtime.NamespaceOption) ([]snapshots.Opt, error) { + snapshotOpt := []snapshots.Opt{} + usernsOpts := nsOpts.GetUsernsOptions() + if usernsOpts == nil { + return snapshotOpt, nil + } + + uids, gids, err := parseUsernsIDs(usernsOpts) + if err != nil { + return nil, fmt.Errorf("user namespace configuration: %w", err) + } + + if usernsOpts.GetMode() == runtime.NamespaceMode_POD { + snapshotOpt = append(snapshotOpt, containerd.WithRemapperLabels(0, uids[0].HostID, 0, gids[0].HostID, uids[0].Size)) + } + return snapshotOpt, nil +} diff --git a/pkg/cri/server/sandbox_run.go b/pkg/cri/server/sandbox_run.go index b7ed6c150..9419a476c 100644 --- a/pkg/cri/server/sandbox_run.go +++ b/pkg/cri/server/sandbox_run.go @@ -158,10 +158,17 @@ func (c *criService) RunPodSandbox(ctx context.Context, r *runtime.RunPodSandbox if err != nil { return nil, fmt.Errorf("failed to generate runtime options: %w", err) } - snapshotterOpt := snapshots.WithLabels(snapshots.FilterInheritedLabels(config.Annotations)) + + sOpts := []snapshots.Opt{snapshots.WithLabels(snapshots.FilterInheritedLabels(config.Annotations))} + extraSOpts, err := sandboxSnapshotterOpts(config) + if err != nil { + return nil, err + } + sOpts = append(sOpts, extraSOpts...) 
+ opts := []containerd.NewContainerOpts{ containerd.WithSnapshotter(c.runtimeSnapshotter(ctx, ociRuntime)), - customopts.WithNewSnapshot(id, containerdImage, snapshotterOpt), + customopts.WithNewSnapshot(id, containerdImage, sOpts...), containerd.WithSpec(spec, specOpts...), containerd.WithContainerLabels(sandboxLabels), containerd.WithContainerExtension(sandboxMetadataExtension, &sandbox.Metadata), diff --git a/pkg/cri/server/sandbox_run_linux.go b/pkg/cri/server/sandbox_run_linux.go index 21bbb1baf..5aacd76ff 100644 --- a/pkg/cri/server/sandbox_run_linux.go +++ b/pkg/cri/server/sandbox_run_linux.go @@ -25,6 +25,7 @@ import ( "github.com/containerd/containerd" "github.com/containerd/containerd/oci" "github.com/containerd/containerd/plugin" + "github.com/containerd/containerd/snapshots" imagespec "github.com/opencontainers/image-spec/specs-go/v1" runtimespec "github.com/opencontainers/runtime-spec/specs-go" selinux "github.com/opencontainers/selinux/go-selinux" @@ -358,3 +359,10 @@ func (c *criService) updateNetNamespacePath(spec *runtimespec.Spec, nsPath strin } } } + +// sandboxSnapshotterOpts generates any platform specific snapshotter options +// for a sandbox container. 
+func sandboxSnapshotterOpts(config *runtime.PodSandboxConfig) ([]snapshots.Opt, error) { + nsOpts := config.GetLinux().GetSecurityContext().GetNamespaceOptions() + return snapshotterRemapOpts(nsOpts) +} diff --git a/pkg/cri/server/sandbox_run_other.go b/pkg/cri/server/sandbox_run_other.go index 150cc917d..1676b4760 100644 --- a/pkg/cri/server/sandbox_run_other.go +++ b/pkg/cri/server/sandbox_run_other.go @@ -21,6 +21,7 @@ package server import ( "github.com/containerd/containerd" "github.com/containerd/containerd/oci" + "github.com/containerd/containerd/snapshots" imagespec "github.com/opencontainers/image-spec/specs-go/v1" runtimespec "github.com/opencontainers/runtime-spec/specs-go" runtime "k8s.io/cri-api/pkg/apis/runtime/v1" @@ -56,3 +57,9 @@ func (c *criService) taskOpts(runtimeType string) []containerd.NewTaskOpts { func (c *criService) updateNetNamespacePath(spec *runtimespec.Spec, nsPath string) { } + +// sandboxSnapshotterOpts generates any platform specific snapshotter options +// for a sandbox container. +func sandboxSnapshotterOpts(config *runtime.PodSandboxConfig) ([]snapshots.Opt, error) { + return []snapshots.Opt{}, nil +} diff --git a/pkg/cri/server/sandbox_run_windows.go b/pkg/cri/server/sandbox_run_windows.go index 017007f66..10b9b2faf 100644 --- a/pkg/cri/server/sandbox_run_windows.go +++ b/pkg/cri/server/sandbox_run_windows.go @@ -22,6 +22,7 @@ import ( "github.com/containerd/containerd" "github.com/containerd/containerd/oci" + "github.com/containerd/containerd/snapshots" imagespec "github.com/opencontainers/image-spec/specs-go/v1" runtimespec "github.com/opencontainers/runtime-spec/specs-go" runtime "k8s.io/cri-api/pkg/apis/runtime/v1" @@ -116,3 +117,8 @@ func (c *criService) taskOpts(runtimeType string) []containerd.NewTaskOpts { func (c *criService) updateNetNamespacePath(spec *runtimespec.Spec, nsPath string) { spec.Windows.Network.NetworkNamespace = nsPath } + +// No sandbox snapshotter options needed for windows. 
+func sandboxSnapshotterOpts(config *runtime.PodSandboxConfig) ([]snapshots.Opt, error) { + return []snapshots.Opt{}, nil +} diff --git a/snapshots/snapshotter.go b/snapshots/snapshotter.go index 9da28583b..5fa5aa530 100644 --- a/snapshots/snapshotter.go +++ b/snapshots/snapshotter.go @@ -33,6 +33,11 @@ const ( UnpackKeyFormat = UnpackKeyPrefix + "-%s %s" inheritedLabelsPrefix = "containerd.io/snapshot/" labelSnapshotRef = "containerd.io/snapshot.ref" + + // LabelSnapshotUIDMapping is the label used for UID mappings + LabelSnapshotUIDMapping = "containerd.io/snapshot/uidmapping" + // LabelSnapshotGIDMapping is the label used for GID mappings + LabelSnapshotGIDMapping = "containerd.io/snapshot/gidmapping" ) // Kind identifies the kind of snapshot. diff --git a/snapshotter_opts_unix.go b/snapshotter_opts_unix.go index e25588b09..4739e192f 100644 --- a/snapshotter_opts_unix.go +++ b/snapshotter_opts_unix.go @@ -19,17 +19,92 @@ package containerd import ( + "context" "fmt" "github.com/containerd/containerd/snapshots" ) +const ( + capabRemapIDs = "remap-ids" +) + // WithRemapperLabels creates the labels used by any supporting snapshotter // to shift the filesystem ownership (user namespace mapping) automatically; currently // supported by the fuse-overlayfs snapshotter func WithRemapperLabels(ctrUID, hostUID, ctrGID, hostGID, length uint32) snapshots.Opt { return snapshots.WithLabels(map[string]string{ - "containerd.io/snapshot/uidmapping": fmt.Sprintf("%d:%d:%d", ctrUID, hostUID, length), - "containerd.io/snapshot/gidmapping": fmt.Sprintf("%d:%d:%d", ctrGID, hostGID, length), - }) + snapshots.LabelSnapshotUIDMapping: fmt.Sprintf("%d:%d:%d", ctrUID, hostUID, length), + snapshots.LabelSnapshotGIDMapping: fmt.Sprintf("%d:%d:%d", ctrGID, hostGID, length)}) +} + +func resolveSnapshotOptions(ctx context.Context, client *Client, snapshotterName string, snapshotter snapshots.Snapshotter, parent string, opts ...snapshots.Opt) (string, error) { + capabs, err := 
client.GetSnapshotterCapabilities(ctx, snapshotterName) + if err != nil { + return "", err + } + + for _, capab := range capabs { + if capab == capabRemapIDs { + // Snapshotter supports ID remapping, we don't need to do anything. + return parent, nil + } + } + + var local snapshots.Info + for _, opt := range opts { + opt(&local) + } + + needsRemap := false + var uidMap, gidMap string + + if value, ok := local.Labels[snapshots.LabelSnapshotUIDMapping]; ok { + needsRemap = true + uidMap = value + } + if value, ok := local.Labels[snapshots.LabelSnapshotGIDMapping]; ok { + needsRemap = true + gidMap = value + } + + if !needsRemap { + return parent, nil + } + + var ctrUID, hostUID, length uint32 + _, err = fmt.Sscanf(uidMap, "%d:%d:%d", &ctrUID, &hostUID, &length) + if err != nil { + return "", fmt.Errorf("uidMap unparsable: %w", err) + } + + var ctrGID, hostGID, lengthGID uint32 + _, err = fmt.Sscanf(gidMap, "%d:%d:%d", &ctrGID, &hostGID, &lengthGID) + if err != nil { + return "", fmt.Errorf("gidMap unparsable: %w", err) + } + + if ctrUID != 0 || ctrGID != 0 { + return "", fmt.Errorf("Container UID/GID of 0 only supported currently (%d/%d)", ctrUID, ctrGID) + } + + // TODO(dgl): length isn't taken into account for the intermediate snapshot id. + usernsID := fmt.Sprintf("%s-%d-%d", parent, hostUID, hostGID) + if _, err := snapshotter.Stat(ctx, usernsID); err == nil { + return usernsID, nil + } + mounts, err := snapshotter.Prepare(ctx, usernsID+"-remap", parent) + if err != nil { + return "", err + } + // TODO(dgl): length isn't taken into account here yet either. 
+ if err := remapRootFS(ctx, mounts, hostUID, hostGID); err != nil { + snapshotter.Remove(ctx, usernsID+"-remap") + return "", err + } + if err := snapshotter.Commit(ctx, usernsID, usernsID+"-remap"); err != nil { + return "", err + } + + return usernsID, nil } diff --git a/snapshotter_opts_windows.go b/snapshotter_opts_windows.go new file mode 100644 index 000000000..540bcb313 --- /dev/null +++ b/snapshotter_opts_windows.go @@ -0,0 +1,27 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package containerd + +import ( + "context" + + "github.com/containerd/containerd/snapshots" +) + +func resolveSnapshotOptions(ctx context.Context, client *Client, snapshotterName string, snapshotter snapshots.Snapshotter, parent string, opts ...snapshots.Opt) (string, error) { + return parent, nil +} From a7adeb69769395193a0278c4bda6068011d06cde Mon Sep 17 00:00:00 2001 From: Rodrigo Campos Date: Wed, 16 Nov 2022 12:41:46 +0100 Subject: [PATCH 3/4] cri: Support pods with user namespaces This patch requests the OCI runtime to create a userns when the CRI message includes such request. 
Signed-off-by: Rodrigo Campos --- pkg/cri/opts/spec_linux.go | 20 ++- pkg/cri/sbserver/container_create_linux.go | 3 +- pkg/cri/server/container_create_linux.go | 7 +- pkg/cri/server/container_create_linux_test.go | 107 ++++++++++++++++ pkg/cri/server/sandbox_run_linux.go | 17 +++ pkg/cri/server/sandbox_run_linux_test.go | 117 ++++++++++++++++++ 6 files changed, 268 insertions(+), 3 deletions(-) diff --git a/pkg/cri/opts/spec_linux.go b/pkg/cri/opts/spec_linux.go index 53cf464d3..767c9c2fc 100644 --- a/pkg/cri/opts/spec_linux.go +++ b/pkg/cri/opts/spec_linux.go @@ -661,7 +661,7 @@ func WithSupplementalGroups(groups []int64) oci.SpecOpts { } // WithPodNamespaces sets the pod namespaces for the container -func WithPodNamespaces(config *runtime.LinuxContainerSecurityContext, sandboxPid uint32, targetPid uint32) oci.SpecOpts { +func WithPodNamespaces(config *runtime.LinuxContainerSecurityContext, sandboxPid uint32, targetPid uint32, uids, gids []runtimespec.LinuxIDMapping) oci.SpecOpts { namespaces := config.GetNamespaceOptions() opts := []oci.SpecOpts{ @@ -672,6 +672,17 @@ func WithPodNamespaces(config *runtime.LinuxContainerSecurityContext, sandboxPid if namespaces.GetPid() != runtime.NamespaceMode_CONTAINER { opts = append(opts, oci.WithLinuxNamespace(runtimespec.LinuxNamespace{Type: runtimespec.PIDNamespace, Path: GetPIDNamespace(targetPid)})) } + + if namespaces.GetUsernsOptions() != nil { + switch namespaces.GetUsernsOptions().GetMode() { + case runtime.NamespaceMode_NODE: + // Nothing to do. Not adding userns field uses the node userns. + case runtime.NamespaceMode_POD: + opts = append(opts, oci.WithLinuxNamespace(runtimespec.LinuxNamespace{Type: runtimespec.UserNamespace, Path: GetUserNamespace(sandboxPid)})) + opts = append(opts, oci.WithUserNamespace(uids, gids)) + } + } + return oci.Compose(opts...) } @@ -745,6 +756,8 @@ const ( utsNSFormat = "/proc/%v/ns/uts" // pidNSFormat is the format of pid namespace of a process. 
pidNSFormat = "/proc/%v/ns/pid" + // userNSFormat is the format of user namespace of a process. + userNSFormat = "/proc/%v/ns/user" ) // GetNetworkNamespace returns the network namespace of a process. @@ -767,6 +780,11 @@ func GetPIDNamespace(pid uint32) string { return fmt.Sprintf(pidNSFormat, pid) } +// GetUserNamespace returns the user namespace of a process. +func GetUserNamespace(pid uint32) string { + return fmt.Sprintf(userNSFormat, pid) +} + // WithCDI updates OCI spec with CDI content func WithCDI(annotations map[string]string) oci.SpecOpts { return func(ctx context.Context, _ oci.Client, c *containers.Container, s *oci.Spec) error { diff --git a/pkg/cri/sbserver/container_create_linux.go b/pkg/cri/sbserver/container_create_linux.go index 558eaf962..71e6af0a6 100644 --- a/pkg/cri/sbserver/container_create_linux.go +++ b/pkg/cri/sbserver/container_create_linux.go @@ -313,7 +313,8 @@ func (c *criService) containerSpec( specOpts = append(specOpts, customopts.WithOOMScoreAdj(config, c.config.RestrictOOMScoreAdj), - customopts.WithPodNamespaces(securityContext, sandboxPid, targetPid), + // TODO: This is a hack to make this compile. We should move userns support to sbserver. 
+ customopts.WithPodNamespaces(securityContext, sandboxPid, targetPid, nil, nil), customopts.WithSupplementalGroups(supplementalGroups), customopts.WithAnnotation(annotations.ContainerType, annotations.ContainerTypeContainer), customopts.WithAnnotation(annotations.SandboxID, sandboxID), diff --git a/pkg/cri/server/container_create_linux.go b/pkg/cri/server/container_create_linux.go index 93e7469ab..5181c5f5e 100644 --- a/pkg/cri/server/container_create_linux.go +++ b/pkg/cri/server/container_create_linux.go @@ -311,9 +311,14 @@ func (c *criService) containerSpec( targetPid = status.Pid } + uids, gids, err := parseUsernsIDs(nsOpts.GetUsernsOptions()) + if err != nil { + return nil, fmt.Errorf("user namespace configuration: %w", err) + } + specOpts = append(specOpts, customopts.WithOOMScoreAdj(config, c.config.RestrictOOMScoreAdj), - customopts.WithPodNamespaces(securityContext, sandboxPid, targetPid), + customopts.WithPodNamespaces(securityContext, sandboxPid, targetPid, uids, gids), customopts.WithSupplementalGroups(supplementalGroups), customopts.WithAnnotation(annotations.ContainerType, annotations.ContainerTypeContainer), customopts.WithAnnotation(annotations.SandboxID, sandboxID), diff --git a/pkg/cri/server/container_create_linux_test.go b/pkg/cri/server/container_create_linux_test.go index 8ba7cabb0..a17bd5895 100644 --- a/pkg/cri/server/container_create_linux_test.go +++ b/pkg/cri/server/container_create_linux_test.go @@ -804,6 +804,113 @@ func TestPidNamespace(t *testing.T) { } } +func TestUserNamespace(t *testing.T) { + testID := "test-id" + testPid := uint32(1234) + testSandboxID := "sandbox-id" + testContainerName := "container-name" + idMap := runtime.IDMapping{ + HostId: 1000, + ContainerId: 1000, + Length: 10, + } + expIDMap := runtimespec.LinuxIDMapping{ + HostID: 1000, + ContainerID: 1000, + Size: 10, + } + containerConfig, sandboxConfig, imageConfig, _ := getCreateContainerTestData() + ociRuntime := config.Runtime{} + c := newTestCRIService() + for 
desc, test := range map[string]struct { + userNS *runtime.UserNamespace + expNS *runtimespec.LinuxNamespace + expNotNS *runtimespec.LinuxNamespace // Does NOT contain this namespace + expUIDMapping []runtimespec.LinuxIDMapping + expGIDMapping []runtimespec.LinuxIDMapping + err bool + }{ + "node namespace mode": { + userNS: &runtime.UserNamespace{Mode: runtime.NamespaceMode_NODE}, + // Expect userns to NOT be present. + expNotNS: &runtimespec.LinuxNamespace{ + Type: runtimespec.UserNamespace, + Path: opts.GetUserNamespace(testPid), + }, + }, + "node namespace mode with mappings": { + userNS: &runtime.UserNamespace{ + Mode: runtime.NamespaceMode_NODE, + Uids: []*runtime.IDMapping{&idMap}, + Gids: []*runtime.IDMapping{&idMap}, + }, + err: true, + }, + "container namespace mode": { + userNS: &runtime.UserNamespace{Mode: runtime.NamespaceMode_CONTAINER}, + err: true, + }, + "target namespace mode": { + userNS: &runtime.UserNamespace{Mode: runtime.NamespaceMode_TARGET}, + err: true, + }, + "unknown namespace mode": { + userNS: &runtime.UserNamespace{Mode: runtime.NamespaceMode(100)}, + err: true, + }, + "pod namespace mode": { + userNS: &runtime.UserNamespace{ + Mode: runtime.NamespaceMode_POD, + Uids: []*runtime.IDMapping{&idMap}, + Gids: []*runtime.IDMapping{&idMap}, + }, + expNS: &runtimespec.LinuxNamespace{ + Type: runtimespec.UserNamespace, + Path: opts.GetUserNamespace(testPid), + }, + expUIDMapping: []runtimespec.LinuxIDMapping{expIDMap}, + expGIDMapping: []runtimespec.LinuxIDMapping{expIDMap}, + }, + "pod namespace mode with several mappings": { + userNS: &runtime.UserNamespace{ + Mode: runtime.NamespaceMode_POD, + Uids: []*runtime.IDMapping{&idMap, &idMap}, + Gids: []*runtime.IDMapping{&idMap, &idMap}, + }, + err: true, + }, + "pod namespace mode with uneven mappings": { + userNS: &runtime.UserNamespace{ + Mode: runtime.NamespaceMode_POD, + Uids: []*runtime.IDMapping{&idMap, &idMap}, + Gids: []*runtime.IDMapping{&idMap}, + }, + err: true, + }, + } { + 
t.Run(desc, func(t *testing.T) { + containerConfig.Linux.SecurityContext.NamespaceOptions = &runtime.NamespaceOption{UsernsOptions: test.userNS} + spec, err := c.containerSpec(testID, testSandboxID, testPid, "", testContainerName, testImageName, containerConfig, sandboxConfig, imageConfig, nil, ociRuntime) + + if test.err { + assert.Error(t, err) + assert.Nil(t, spec) + return + } + assert.NoError(t, err) + assert.Equal(t, spec.Linux.UIDMappings, test.expUIDMapping) + assert.Equal(t, spec.Linux.GIDMappings, test.expGIDMapping) + + if test.expNS != nil { + assert.Contains(t, spec.Linux.Namespaces, *test.expNS) + } + if test.expNotNS != nil { + assert.NotContains(t, spec.Linux.Namespaces, *test.expNotNS) + } + }) + } +} + func TestNoDefaultRunMount(t *testing.T) { testID := "test-id" testPid := uint32(1234) diff --git a/pkg/cri/server/sandbox_run_linux.go b/pkg/cri/server/sandbox_run_linux.go index 5aacd76ff..78cca50c6 100644 --- a/pkg/cri/server/sandbox_run_linux.go +++ b/pkg/cri/server/sandbox_run_linux.go @@ -96,6 +96,23 @@ func (c *criService) sandboxContainerSpec(id string, config *runtime.PodSandboxC specOpts = append(specOpts, customopts.WithoutNamespace(runtimespec.IPCNamespace)) } + usernsOpts := nsOptions.GetUsernsOptions() + uids, gids, err := parseUsernsIDs(usernsOpts) + if err != nil { + return nil, fmt.Errorf("user namespace configuration: %w", err) + } + + if usernsOpts != nil { + switch mode := usernsOpts.GetMode(); mode { + case runtime.NamespaceMode_NODE: + specOpts = append(specOpts, customopts.WithoutNamespace(runtimespec.UserNamespace)) + case runtime.NamespaceMode_POD: + specOpts = append(specOpts, oci.WithUserNamespace(uids, gids)) + default: + return nil, fmt.Errorf("unsupported user namespace mode: %q", mode) + } + } + // It's fine to generate the spec before the sandbox /dev/shm // is actually created. 
sandboxDevShm := c.getSandboxDevShm(id) diff --git a/pkg/cri/server/sandbox_run_linux_test.go b/pkg/cri/server/sandbox_run_linux_test.go index 378136136..9c646e069 100644 --- a/pkg/cri/server/sandbox_run_linux_test.go +++ b/pkg/cri/server/sandbox_run_linux_test.go @@ -98,6 +98,17 @@ func getRunPodSandboxTestData() (*runtime.PodSandboxConfig, *imagespec.ImageConf func TestLinuxSandboxContainerSpec(t *testing.T) { testID := "test-id" nsPath := "test-cni" + idMap := runtime.IDMapping{ + HostId: 1000, + ContainerId: 1000, + Length: 10, + } + expIDMap := runtimespec.LinuxIDMapping{ + HostID: 1000, + ContainerID: 1000, + Size: 10, + } + for desc, test := range map[string]struct { configChange func(*runtime.PodSandboxConfig) specCheck func(*testing.T, *runtimespec.Spec) @@ -122,6 +133,9 @@ func TestLinuxSandboxContainerSpec(t *testing.T) { }) assert.Contains(t, spec.Linux.Sysctl["net.ipv4.ip_unprivileged_port_start"], "0") assert.Contains(t, spec.Linux.Sysctl["net.ipv4.ping_group_range"], "0 2147483647") + assert.NotContains(t, spec.Linux.Namespaces, runtimespec.LinuxNamespace{ + Type: runtimespec.UserNamespace, + }) }, }, "host namespace": { @@ -149,10 +163,113 @@ func TestLinuxSandboxContainerSpec(t *testing.T) { assert.NotContains(t, spec.Linux.Namespaces, runtimespec.LinuxNamespace{ Type: runtimespec.IPCNamespace, }) + assert.NotContains(t, spec.Linux.Namespaces, runtimespec.LinuxNamespace{ + Type: runtimespec.UserNamespace, + }) assert.NotContains(t, spec.Linux.Sysctl["net.ipv4.ip_unprivileged_port_start"], "0") assert.NotContains(t, spec.Linux.Sysctl["net.ipv4.ping_group_range"], "0 2147483647") }, }, + "user namespace": { + configChange: func(c *runtime.PodSandboxConfig) { + c.Linux.SecurityContext = &runtime.LinuxSandboxSecurityContext{ + NamespaceOptions: &runtime.NamespaceOption{ + UsernsOptions: &runtime.UserNamespace{ + Mode: runtime.NamespaceMode_POD, + Uids: []*runtime.IDMapping{&idMap}, + Gids: []*runtime.IDMapping{&idMap}, + }, + }, + } + }, + specCheck: 
func(t *testing.T, spec *runtimespec.Spec) { + require.NotNil(t, spec.Linux) + assert.Contains(t, spec.Linux.Namespaces, runtimespec.LinuxNamespace{ + Type: runtimespec.UserNamespace, + }) + require.Equal(t, spec.Linux.UIDMappings, []runtimespec.LinuxIDMapping{expIDMap}) + require.Equal(t, spec.Linux.GIDMappings, []runtimespec.LinuxIDMapping{expIDMap}) + + }, + }, + "user namespace mode node and mappings": { + configChange: func(c *runtime.PodSandboxConfig) { + c.Linux.SecurityContext = &runtime.LinuxSandboxSecurityContext{ + NamespaceOptions: &runtime.NamespaceOption{ + UsernsOptions: &runtime.UserNamespace{ + Mode: runtime.NamespaceMode_NODE, + Uids: []*runtime.IDMapping{&idMap}, + Gids: []*runtime.IDMapping{&idMap}, + }, + }, + } + }, + expectErr: true, + }, + "user namespace with several mappings": { + configChange: func(c *runtime.PodSandboxConfig) { + c.Linux.SecurityContext = &runtime.LinuxSandboxSecurityContext{ + NamespaceOptions: &runtime.NamespaceOption{ + UsernsOptions: &runtime.UserNamespace{ + Mode: runtime.NamespaceMode_NODE, + Uids: []*runtime.IDMapping{&idMap, &idMap}, + Gids: []*runtime.IDMapping{&idMap, &idMap}, + }, + }, + } + }, + expectErr: true, + }, + "user namespace with uneven mappings": { + configChange: func(c *runtime.PodSandboxConfig) { + c.Linux.SecurityContext = &runtime.LinuxSandboxSecurityContext{ + NamespaceOptions: &runtime.NamespaceOption{ + UsernsOptions: &runtime.UserNamespace{ + Mode: runtime.NamespaceMode_NODE, + Uids: []*runtime.IDMapping{&idMap, &idMap}, + Gids: []*runtime.IDMapping{&idMap}, + }, + }, + } + }, + expectErr: true, + }, + "user namespace mode container": { + configChange: func(c *runtime.PodSandboxConfig) { + c.Linux.SecurityContext = &runtime.LinuxSandboxSecurityContext{ + NamespaceOptions: &runtime.NamespaceOption{ + UsernsOptions: &runtime.UserNamespace{ + Mode: runtime.NamespaceMode_CONTAINER, + }, + }, + } + }, + expectErr: true, + }, + "user namespace mode target": { + configChange: func(c 
*runtime.PodSandboxConfig) { + c.Linux.SecurityContext = &runtime.LinuxSandboxSecurityContext{ + NamespaceOptions: &runtime.NamespaceOption{ + UsernsOptions: &runtime.UserNamespace{ + Mode: runtime.NamespaceMode_TARGET, + }, + }, + } + }, + expectErr: true, + }, + "user namespace unknown mode": { + configChange: func(c *runtime.PodSandboxConfig) { + c.Linux.SecurityContext = &runtime.LinuxSandboxSecurityContext{ + NamespaceOptions: &runtime.NamespaceOption{ + UsernsOptions: &runtime.UserNamespace{ + Mode: runtime.NamespaceMode(100), + }, + }, + } + }, + expectErr: true, + }, "should set supplemental groups correctly": { configChange: func(c *runtime.PodSandboxConfig) { c.Linux.SecurityContext = &runtime.LinuxSandboxSecurityContext{ From ca69ae26567ca36f4a14d6896998d9130459ce4e Mon Sep 17 00:00:00 2001 From: Rodrigo Campos Date: Wed, 21 Dec 2022 17:58:05 -0300 Subject: [PATCH 4/4] Add integration tests for CRI userns Signed-off-by: Rodrigo Campos --- integration/main_test.go | 58 +++++++++ integration/pod_userns_linux_test.go | 169 +++++++++++++++++++++++++++ 2 files changed, 227 insertions(+) create mode 100644 integration/pod_userns_linux_test.go diff --git a/integration/main_test.go b/integration/main_test.go index 3250b07d8..690cf4f3f 100644 --- a/integration/main_test.go +++ b/integration/main_test.go @@ -126,6 +126,35 @@ func WithHostNetwork(p *runtime.PodSandboxConfig) { p.Linux.SecurityContext.NamespaceOptions.Network = runtime.NamespaceMode_NODE } +// Set pod userns. 
+func WithPodUserNs(containerID, hostID, length uint32) PodSandboxOpts { + return func(p *runtime.PodSandboxConfig) { + if p.Linux == nil { + p.Linux = &runtime.LinuxPodSandboxConfig{} + } + if p.Linux.SecurityContext == nil { + p.Linux.SecurityContext = &runtime.LinuxSandboxSecurityContext{} + } + if p.Linux.SecurityContext.NamespaceOptions == nil { + p.Linux.SecurityContext.NamespaceOptions = &runtime.NamespaceOption{} + } + + idMap := runtime.IDMapping{ + HostId: hostID, + ContainerId: containerID, + Length: length, + } + if p.Linux.SecurityContext.NamespaceOptions.UsernsOptions == nil { + p.Linux.SecurityContext.NamespaceOptions.UsernsOptions = &runtime.UserNamespace{ + Mode: runtime.NamespaceMode_POD, + } + } + + p.Linux.SecurityContext.NamespaceOptions.UsernsOptions.Uids = append(p.Linux.SecurityContext.NamespaceOptions.UsernsOptions.Uids, &idMap) + p.Linux.SecurityContext.NamespaceOptions.UsernsOptions.Gids = append(p.Linux.SecurityContext.NamespaceOptions.UsernsOptions.Gids, &idMap) + } +} + // Set host pid. func WithHostPid(p *runtime.PodSandboxConfig) { if p.Linux == nil { @@ -314,6 +343,35 @@ func WithPidNamespace(mode runtime.NamespaceMode) ContainerOpts { } +// Add user namespace pod mode. 
+func WithUserNamespace(containerID, hostID, length uint32) ContainerOpts { + return func(c *runtime.ContainerConfig) { + if c.Linux == nil { + c.Linux = &runtime.LinuxContainerConfig{} + } + if c.Linux.SecurityContext == nil { + c.Linux.SecurityContext = &runtime.LinuxContainerSecurityContext{} + } + if c.Linux.SecurityContext.NamespaceOptions == nil { + c.Linux.SecurityContext.NamespaceOptions = &runtime.NamespaceOption{} + } + idMap := runtime.IDMapping{ + HostId: hostID, + ContainerId: containerID, + Length: length, + } + + if c.Linux.SecurityContext.NamespaceOptions.UsernsOptions == nil { + c.Linux.SecurityContext.NamespaceOptions.UsernsOptions = &runtime.UserNamespace{ + Mode: runtime.NamespaceMode_POD, + } + } + + c.Linux.SecurityContext.NamespaceOptions.UsernsOptions.Uids = append(c.Linux.SecurityContext.NamespaceOptions.UsernsOptions.Uids, &idMap) + c.Linux.SecurityContext.NamespaceOptions.UsernsOptions.Gids = append(c.Linux.SecurityContext.NamespaceOptions.UsernsOptions.Gids, &idMap) + } +} + // Add container log path. func WithLogPath(path string) ContainerOpts { return func(c *runtime.ContainerConfig) { diff --git a/integration/pod_userns_linux_test.go b/integration/pod_userns_linux_test.go new file mode 100644 index 000000000..b020d64ae --- /dev/null +++ b/integration/pod_userns_linux_test.go @@ -0,0 +1,169 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +package integration + +import ( + "fmt" + "os" + "path/filepath" + "syscall" + "testing" + "time" + + "github.com/containerd/containerd/integration/images" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + exec "golang.org/x/sys/execabs" + runtime "k8s.io/cri-api/pkg/apis/runtime/v1" +) + +func TestPodUserNS(t *testing.T) { + containerID := uint32(0) + hostID := uint32(65536) + size := uint32(65536) + for name, test := range map[string]struct { + sandboxOpts []PodSandboxOpts + containerOpts []ContainerOpts + checkOutput func(t *testing.T, output string) + expectErr bool + }{ + "userns uid mapping": { + sandboxOpts: []PodSandboxOpts{ + WithPodUserNs(containerID, hostID, size), + }, + containerOpts: []ContainerOpts{ + WithUserNamespace(containerID, hostID, size), + WithCommand("cat", "/proc/self/uid_map"), + }, + checkOutput: func(t *testing.T, output string) { + // The output should contain the length of the userns requested. + assert.Contains(t, output, fmt.Sprint(size)) + }, + }, + "userns gid mapping": { + sandboxOpts: []PodSandboxOpts{ + WithPodUserNs(containerID, hostID, size), + }, + containerOpts: []ContainerOpts{ + WithUserNamespace(containerID, hostID, size), + WithCommand("cat", "/proc/self/gid_map"), + }, + checkOutput: func(t *testing.T, output string) { + // The output should contain the length of the userns requested. + assert.Contains(t, output, fmt.Sprint(size)) + }, + }, + "rootfs permissions": { + sandboxOpts: []PodSandboxOpts{ + WithPodUserNs(containerID, hostID, size), + }, + containerOpts: []ContainerOpts{ + WithUserNamespace(containerID, hostID, size), + // Prints numeric UID and GID for path. + // For example, if the UID and GID are 0 it will print: =0=0= + // We add the "=" signs so we can use assert.Contains() and be sure + // the UID/GID is 0 and not things like 100 (that contain 0). + // We can't use assert.Equal() easily as it contains timestamp, etc.
+ WithCommand("stat", "-c", "'=%u=%g='", "/root/"), + }, + checkOutput: func(t *testing.T, output string) { + // The UID and GID should be 0 (root) if the chown/remap is done correctly. + assert.Contains(t, output, "=0=0=") + }, + }, + "fails with several mappings": { + sandboxOpts: []PodSandboxOpts{ + WithPodUserNs(containerID, hostID, size), + WithPodUserNs(containerID*2, hostID*2, size*2), + }, + expectErr: true, + }, + } { + t.Run(name, func(t *testing.T) { + if os.Getenv("ENABLE_CRI_SANDBOXES") == "'sandboxed'" { + t.Skip("skipping test: userns not supported/needed in sanboxed runtimes") + } + cmd := exec.Command("true") + cmd.SysProcAttr = &syscall.SysProcAttr{ + Cloneflags: syscall.CLONE_NEWUSER, + } + if err := cmd.Run(); err != nil { + t.Skip("skipping test: user namespaces are unavailable") + } + + testPodLogDir := t.TempDir() + sandboxOpts := append(test.sandboxOpts, WithPodLogDirectory(testPodLogDir)) + t.Log("Create a sandbox with userns") + sbConfig := PodSandboxConfig("sandbox", "userns", sandboxOpts...) + sb, err := runtimeService.RunPodSandbox(sbConfig, *runtimeHandler) + if err != nil { + if !test.expectErr { + t.Fatalf("Unexpected RunPodSandbox error: %v", err) + } + return + } + // Make sure the sandbox is cleaned up. 
+ defer func() { + assert.NoError(t, runtimeService.StopPodSandbox(sb)) + assert.NoError(t, runtimeService.RemovePodSandbox(sb)) + }() + if test.expectErr { + t.Fatalf("Expected RunPodSandbox to return error") + } + + var ( + testImage = images.Get(images.BusyBox) + containerName = "test-container" + ) + + EnsureImageExists(t, testImage) + + containerOpts := append(test.containerOpts, + WithLogPath(containerName), + ) + t.Log("Create a container for userns") + cnConfig := ContainerConfig( + containerName, + testImage, + containerOpts..., + ) + cn, err := runtimeService.CreateContainer(sb, cnConfig, sbConfig) + require.NoError(t, err) + + t.Log("Start the container") + require.NoError(t, runtimeService.StartContainer(cn)) + + t.Log("Wait for container to finish running") + require.NoError(t, Eventually(func() (bool, error) { + s, err := runtimeService.ContainerStatus(cn) + if err != nil { + return false, err + } + if s.GetState() == runtime.ContainerState_CONTAINER_EXITED { + return true, nil + } + return false, nil + }, time.Second, 30*time.Second)) + + content, err := os.ReadFile(filepath.Join(testPodLogDir, containerName)) + assert.NoError(t, err) + + t.Log("Running check function") + test.checkOutput(t, string(content)) + }) + } +}