diff --git a/client.go b/client.go index 3786f8b57..4133476b1 100644 --- a/client.go +++ b/client.go @@ -866,3 +866,21 @@ func toPlatforms(pt []*apitypes.Platform) []ocispec.Platform { } return platforms } + +// GetSnapshotterCapabilities returns the capabilities of a snapshotter. +func (c *Client) GetSnapshotterCapabilities(ctx context.Context, snapshotterName string) ([]string, error) { + filters := []string{fmt.Sprintf("type==%s, id==%s", plugin.SnapshotPlugin, snapshotterName)} + in := c.IntrospectionService() + + resp, err := in.Plugins(ctx, filters) + if err != nil { + return nil, err + } + + if len(resp.Plugins) == 0 { + return nil, fmt.Errorf("introspection service could not find snapshotter %s plugin", snapshotterName) + } + + sn := resp.Plugins[0] + return sn.Capabilities, nil +} diff --git a/container_opts.go b/container_opts.go index cf41d1aab..0719ed293 100644 --- a/container_opts.go +++ b/container_opts.go @@ -224,6 +224,11 @@ func WithNewSnapshot(id string, i Image, opts ...snapshots.Opt) NewContainerOpts if err != nil { return err } + + parent, err = resolveSnapshotOptions(ctx, client, c.Snapshotter, s, parent, opts...) + if err != nil { + return err + } if _, err := s.Prepare(ctx, id, parent, opts...); err != nil { return err } @@ -268,6 +273,11 @@ func WithNewSnapshotView(id string, i Image, opts ...snapshots.Opt) NewContainer if err != nil { return err } + + parent, err = resolveSnapshotOptions(ctx, client, c.Snapshotter, s, parent, opts...) + if err != nil { + return err + } if _, err := s.View(ctx, id, parent, opts...); err != nil { return err } diff --git a/integration/main_test.go b/integration/main_test.go index 3250b07d8..690cf4f3f 100644 --- a/integration/main_test.go +++ b/integration/main_test.go @@ -126,6 +126,35 @@ func WithHostNetwork(p *runtime.PodSandboxConfig) { p.Linux.SecurityContext.NamespaceOptions.Network = runtime.NamespaceMode_NODE } +// Set pod userns. 
+func WithPodUserNs(containerID, hostID, length uint32) PodSandboxOpts { + return func(p *runtime.PodSandboxConfig) { + if p.Linux == nil { + p.Linux = &runtime.LinuxPodSandboxConfig{} + } + if p.Linux.SecurityContext == nil { + p.Linux.SecurityContext = &runtime.LinuxSandboxSecurityContext{} + } + if p.Linux.SecurityContext.NamespaceOptions == nil { + p.Linux.SecurityContext.NamespaceOptions = &runtime.NamespaceOption{} + } + + idMap := runtime.IDMapping{ + HostId: hostID, + ContainerId: containerID, + Length: length, + } + if p.Linux.SecurityContext.NamespaceOptions.UsernsOptions == nil { + p.Linux.SecurityContext.NamespaceOptions.UsernsOptions = &runtime.UserNamespace{ + Mode: runtime.NamespaceMode_POD, + } + } + + p.Linux.SecurityContext.NamespaceOptions.UsernsOptions.Uids = append(p.Linux.SecurityContext.NamespaceOptions.UsernsOptions.Uids, &idMap) + p.Linux.SecurityContext.NamespaceOptions.UsernsOptions.Gids = append(p.Linux.SecurityContext.NamespaceOptions.UsernsOptions.Gids, &idMap) + } +} + // Set host pid. func WithHostPid(p *runtime.PodSandboxConfig) { if p.Linux == nil { @@ -314,6 +343,35 @@ func WithPidNamespace(mode runtime.NamespaceMode) ContainerOpts { } +// Add user namespace pod mode. 
+func WithUserNamespace(containerID, hostID, length uint32) ContainerOpts { + return func(c *runtime.ContainerConfig) { + if c.Linux == nil { + c.Linux = &runtime.LinuxContainerConfig{} + } + if c.Linux.SecurityContext == nil { + c.Linux.SecurityContext = &runtime.LinuxContainerSecurityContext{} + } + if c.Linux.SecurityContext.NamespaceOptions == nil { + c.Linux.SecurityContext.NamespaceOptions = &runtime.NamespaceOption{} + } + idMap := runtime.IDMapping{ + HostId: hostID, + ContainerId: containerID, + Length: length, + } + + if c.Linux.SecurityContext.NamespaceOptions.UsernsOptions == nil { + c.Linux.SecurityContext.NamespaceOptions.UsernsOptions = &runtime.UserNamespace{ + Mode: runtime.NamespaceMode_POD, + } + } + + c.Linux.SecurityContext.NamespaceOptions.UsernsOptions.Uids = append(c.Linux.SecurityContext.NamespaceOptions.UsernsOptions.Uids, &idMap) + c.Linux.SecurityContext.NamespaceOptions.UsernsOptions.Gids = append(c.Linux.SecurityContext.NamespaceOptions.UsernsOptions.Gids, &idMap) + } +} + // Add container log path. func WithLogPath(path string) ContainerOpts { return func(c *runtime.ContainerConfig) { diff --git a/integration/pod_userns_linux_test.go b/integration/pod_userns_linux_test.go new file mode 100644 index 000000000..b020d64ae --- /dev/null +++ b/integration/pod_userns_linux_test.go @@ -0,0 +1,169 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +package integration + +import ( + "fmt" + "os" + "path/filepath" + "syscall" + "testing" + "time" + + "github.com/containerd/containerd/integration/images" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + exec "golang.org/x/sys/execabs" + runtime "k8s.io/cri-api/pkg/apis/runtime/v1" +) + +func TestPodUserNS(t *testing.T) { + containerID := uint32(0) + hostID := uint32(65536) + size := uint32(65536) + for name, test := range map[string]struct { + sandboxOpts []PodSandboxOpts + containerOpts []ContainerOpts + checkOutput func(t *testing.T, output string) + expectErr bool + }{ + "userns uid mapping": { + sandboxOpts: []PodSandboxOpts{ + WithPodUserNs(containerID, hostID, size), + }, + containerOpts: []ContainerOpts{ + WithUserNamespace(containerID, hostID, size), + WithCommand("cat", "/proc/self/uid_map"), + }, + checkOutput: func(t *testing.T, output string) { + // The output should contain the length of the userns requested. + assert.Contains(t, output, fmt.Sprint(size)) + }, + }, + "userns gid mapping": { + sandboxOpts: []PodSandboxOpts{ + WithPodUserNs(containerID, hostID, size), + }, + containerOpts: []ContainerOpts{ + WithUserNamespace(containerID, hostID, size), + WithCommand("cat", "/proc/self/gid_map"), + }, + checkOutput: func(t *testing.T, output string) { + // The output should contain the length of the userns requested. + assert.Contains(t, output, fmt.Sprint(size)) + }, + }, + "rootfs permissions": { + sandboxOpts: []PodSandboxOpts{ + WithPodUserNs(containerID, hostID, size), + }, + containerOpts: []ContainerOpts{ + WithUserNamespace(containerID, hostID, size), + // Prints numeric UID and GID for path. + // For example, if UID and GID is 0 it will print: =0=0= + // We add the "=" signs so we can use assert.Contains() and be sure + // the UID/GID is 0 and not things like 100 (that contain 0). + // We can't use assert.Equal() easily as it contains timestamp, etc. 
+ WithCommand("stat", "-c", "'=%u=%g='", "/root/"), + }, + checkOutput: func(t *testing.T, output string) { + // The UID and GID should be 0 (root) if the chown/remap is done correctly. + assert.Contains(t, output, "=0=0=") + }, + }, + "fails with several mappings": { + sandboxOpts: []PodSandboxOpts{ + WithPodUserNs(containerID, hostID, size), + WithPodUserNs(containerID*2, hostID*2, size*2), + }, + expectErr: true, + }, + } { + t.Run(name, func(t *testing.T) { + if os.Getenv("ENABLE_CRI_SANDBOXES") == "'sandboxed'" { + t.Skip("skipping test: userns not supported/needed in sandboxed runtimes") + } + cmd := exec.Command("true") + cmd.SysProcAttr = &syscall.SysProcAttr{ + Cloneflags: syscall.CLONE_NEWUSER, + } + if err := cmd.Run(); err != nil { + t.Skip("skipping test: user namespaces are unavailable") + } + + testPodLogDir := t.TempDir() + sandboxOpts := append(test.sandboxOpts, WithPodLogDirectory(testPodLogDir)) + t.Log("Create a sandbox with userns") + sbConfig := PodSandboxConfig("sandbox", "userns", sandboxOpts...) + sb, err := runtimeService.RunPodSandbox(sbConfig, *runtimeHandler) + if err != nil { + if !test.expectErr { + t.Fatalf("Unexpected RunPodSandbox error: %v", err) + } + return + } + // Make sure the sandbox is cleaned up. 
+ defer func() { + assert.NoError(t, runtimeService.StopPodSandbox(sb)) + assert.NoError(t, runtimeService.RemovePodSandbox(sb)) + }() + if test.expectErr { + t.Fatalf("Expected RunPodSandbox to return error") + } + + var ( + testImage = images.Get(images.BusyBox) + containerName = "test-container" + ) + + EnsureImageExists(t, testImage) + + containerOpts := append(test.containerOpts, + WithLogPath(containerName), + ) + t.Log("Create a container for userns") + cnConfig := ContainerConfig( + containerName, + testImage, + containerOpts..., + ) + cn, err := runtimeService.CreateContainer(sb, cnConfig, sbConfig) + require.NoError(t, err) + + t.Log("Start the container") + require.NoError(t, runtimeService.StartContainer(cn)) + + t.Log("Wait for container to finish running") + require.NoError(t, Eventually(func() (bool, error) { + s, err := runtimeService.ContainerStatus(cn) + if err != nil { + return false, err + } + if s.GetState() == runtime.ContainerState_CONTAINER_EXITED { + return true, nil + } + return false, nil + }, time.Second, 30*time.Second)) + + content, err := os.ReadFile(filepath.Join(testPodLogDir, containerName)) + require.NoError(t, err) + + t.Log("Running check function") + test.checkOutput(t, string(content)) + }) + } +} diff --git a/pkg/cri/opts/spec_linux.go b/pkg/cri/opts/spec_linux.go index 53cf464d3..767c9c2fc 100644 --- a/pkg/cri/opts/spec_linux.go +++ b/pkg/cri/opts/spec_linux.go @@ -661,7 +661,7 @@ func WithSupplementalGroups(groups []int64) oci.SpecOpts { } // WithPodNamespaces sets the pod namespaces for the container -func WithPodNamespaces(config *runtime.LinuxContainerSecurityContext, sandboxPid uint32, targetPid uint32) oci.SpecOpts { +func WithPodNamespaces(config *runtime.LinuxContainerSecurityContext, sandboxPid uint32, targetPid uint32, uids, gids []runtimespec.LinuxIDMapping) oci.SpecOpts { namespaces := config.GetNamespaceOptions() opts := []oci.SpecOpts{ @@ -672,6 +672,17 @@ func WithPodNamespaces(config 
*runtime.LinuxContainerSecurityContext, sandboxPid if namespaces.GetPid() != runtime.NamespaceMode_CONTAINER { opts = append(opts, oci.WithLinuxNamespace(runtimespec.LinuxNamespace{Type: runtimespec.PIDNamespace, Path: GetPIDNamespace(targetPid)})) } + + if namespaces.GetUsernsOptions() != nil { + switch namespaces.GetUsernsOptions().GetMode() { + case runtime.NamespaceMode_NODE: + // Nothing to do. Not adding userns field uses the node userns. + case runtime.NamespaceMode_POD: + opts = append(opts, oci.WithLinuxNamespace(runtimespec.LinuxNamespace{Type: runtimespec.UserNamespace, Path: GetUserNamespace(sandboxPid)})) + opts = append(opts, oci.WithUserNamespace(uids, gids)) + } + } + return oci.Compose(opts...) } @@ -745,6 +756,8 @@ const ( utsNSFormat = "/proc/%v/ns/uts" // pidNSFormat is the format of pid namespace of a process. pidNSFormat = "/proc/%v/ns/pid" + // userNSFormat is the format of user namespace of a process. + userNSFormat = "/proc/%v/ns/user" ) // GetNetworkNamespace returns the network namespace of a process. @@ -767,6 +780,11 @@ func GetPIDNamespace(pid uint32) string { return fmt.Sprintf(pidNSFormat, pid) } +// GetUserNamespace returns the user namespace of a process. +func GetUserNamespace(pid uint32) string { + return fmt.Sprintf(userNSFormat, pid) +} + // WithCDI updates OCI spec with CDI content func WithCDI(annotations map[string]string) oci.SpecOpts { return func(ctx context.Context, _ oci.Client, c *containers.Container, s *oci.Spec) error { diff --git a/pkg/cri/sbserver/container_create_linux.go b/pkg/cri/sbserver/container_create_linux.go index 558eaf962..71e6af0a6 100644 --- a/pkg/cri/sbserver/container_create_linux.go +++ b/pkg/cri/sbserver/container_create_linux.go @@ -313,7 +313,8 @@ func (c *criService) containerSpec( specOpts = append(specOpts, customopts.WithOOMScoreAdj(config, c.config.RestrictOOMScoreAdj), - customopts.WithPodNamespaces(securityContext, sandboxPid, targetPid), + // TODO: This is a hack to make this compile. 
We should move userns support to sbserver. + customopts.WithPodNamespaces(securityContext, sandboxPid, targetPid, nil, nil), customopts.WithSupplementalGroups(supplementalGroups), customopts.WithAnnotation(annotations.ContainerType, annotations.ContainerTypeContainer), customopts.WithAnnotation(annotations.SandboxID, sandboxID), diff --git a/pkg/cri/server/container_create.go b/pkg/cri/server/container_create.go index 72c4a6df1..ea4d5a02a 100644 --- a/pkg/cri/server/container_create.go +++ b/pkg/cri/server/container_create.go @@ -184,7 +184,10 @@ func (c *criService) CreateContainer(ctx context.Context, r *runtime.CreateConta log.G(ctx).Debugf("Container %q spec: %#+v", id, spew.NewFormatter(spec)) // Grab any platform specific snapshotter opts. - sOpts := snapshotterOpts(c.config.ContainerdConfig.Snapshotter, config) + sOpts, err := snapshotterOpts(c.config.ContainerdConfig.Snapshotter, config) + if err != nil { + return nil, err + } // Set snapshotter before any other options. opts := []containerd.NewContainerOpts{ diff --git a/pkg/cri/server/container_create_linux.go b/pkg/cri/server/container_create_linux.go index a74bf5d9a..5181c5f5e 100644 --- a/pkg/cri/server/container_create_linux.go +++ b/pkg/cri/server/container_create_linux.go @@ -311,9 +311,14 @@ func (c *criService) containerSpec( targetPid = status.Pid } + uids, gids, err := parseUsernsIDs(nsOpts.GetUsernsOptions()) + if err != nil { + return nil, fmt.Errorf("user namespace configuration: %w", err) + } + specOpts = append(specOpts, customopts.WithOOMScoreAdj(config, c.config.RestrictOOMScoreAdj), - customopts.WithPodNamespaces(securityContext, sandboxPid, targetPid), + customopts.WithPodNamespaces(securityContext, sandboxPid, targetPid, uids, gids), customopts.WithSupplementalGroups(supplementalGroups), customopts.WithAnnotation(annotations.ContainerType, annotations.ContainerTypeContainer), customopts.WithAnnotation(annotations.SandboxID, sandboxID), @@ -601,6 +606,7 @@ func 
generateUserString(username string, uid, gid *runtime.Int64Value) (string, } // snapshotterOpts returns any Linux specific snapshotter options for the rootfs snapshot -func snapshotterOpts(snapshotterName string, config *runtime.ContainerConfig) []snapshots.Opt { - return []snapshots.Opt{} +func snapshotterOpts(snapshotterName string, config *runtime.ContainerConfig) ([]snapshots.Opt, error) { + nsOpts := config.GetLinux().GetSecurityContext().GetNamespaceOptions() + return snapshotterRemapOpts(nsOpts) } diff --git a/pkg/cri/server/container_create_linux_test.go b/pkg/cri/server/container_create_linux_test.go index 8ba7cabb0..a17bd5895 100644 --- a/pkg/cri/server/container_create_linux_test.go +++ b/pkg/cri/server/container_create_linux_test.go @@ -804,6 +804,113 @@ func TestPidNamespace(t *testing.T) { } } +func TestUserNamespace(t *testing.T) { + testID := "test-id" + testPid := uint32(1234) + testSandboxID := "sandbox-id" + testContainerName := "container-name" + idMap := runtime.IDMapping{ + HostId: 1000, + ContainerId: 1000, + Length: 10, + } + expIDMap := runtimespec.LinuxIDMapping{ + HostID: 1000, + ContainerID: 1000, + Size: 10, + } + containerConfig, sandboxConfig, imageConfig, _ := getCreateContainerTestData() + ociRuntime := config.Runtime{} + c := newTestCRIService() + for desc, test := range map[string]struct { + userNS *runtime.UserNamespace + expNS *runtimespec.LinuxNamespace + expNotNS *runtimespec.LinuxNamespace // Does NOT contain this namespace + expUIDMapping []runtimespec.LinuxIDMapping + expGIDMapping []runtimespec.LinuxIDMapping + err bool + }{ + "node namespace mode": { + userNS: &runtime.UserNamespace{Mode: runtime.NamespaceMode_NODE}, + // Expect userns to NOT be present. 
+ expNotNS: &runtimespec.LinuxNamespace{ + Type: runtimespec.UserNamespace, + Path: opts.GetUserNamespace(testPid), + }, + }, + "node namespace mode with mappings": { + userNS: &runtime.UserNamespace{ + Mode: runtime.NamespaceMode_NODE, + Uids: []*runtime.IDMapping{&idMap}, + Gids: []*runtime.IDMapping{&idMap}, + }, + err: true, + }, + "container namespace mode": { + userNS: &runtime.UserNamespace{Mode: runtime.NamespaceMode_CONTAINER}, + err: true, + }, + "target namespace mode": { + userNS: &runtime.UserNamespace{Mode: runtime.NamespaceMode_TARGET}, + err: true, + }, + "unknown namespace mode": { + userNS: &runtime.UserNamespace{Mode: runtime.NamespaceMode(100)}, + err: true, + }, + "pod namespace mode": { + userNS: &runtime.UserNamespace{ + Mode: runtime.NamespaceMode_POD, + Uids: []*runtime.IDMapping{&idMap}, + Gids: []*runtime.IDMapping{&idMap}, + }, + expNS: &runtimespec.LinuxNamespace{ + Type: runtimespec.UserNamespace, + Path: opts.GetUserNamespace(testPid), + }, + expUIDMapping: []runtimespec.LinuxIDMapping{expIDMap}, + expGIDMapping: []runtimespec.LinuxIDMapping{expIDMap}, + }, + "pod namespace mode with several mappings": { + userNS: &runtime.UserNamespace{ + Mode: runtime.NamespaceMode_POD, + Uids: []*runtime.IDMapping{&idMap, &idMap}, + Gids: []*runtime.IDMapping{&idMap, &idMap}, + }, + err: true, + }, + "pod namespace mode with uneven mappings": { + userNS: &runtime.UserNamespace{ + Mode: runtime.NamespaceMode_POD, + Uids: []*runtime.IDMapping{&idMap, &idMap}, + Gids: []*runtime.IDMapping{&idMap}, + }, + err: true, + }, + } { + t.Run(desc, func(t *testing.T) { + containerConfig.Linux.SecurityContext.NamespaceOptions = &runtime.NamespaceOption{UsernsOptions: test.userNS} + spec, err := c.containerSpec(testID, testSandboxID, testPid, "", testContainerName, testImageName, containerConfig, sandboxConfig, imageConfig, nil, ociRuntime) + + if test.err { + assert.Error(t, err) + assert.Nil(t, spec) + return + } + assert.NoError(t, err) + assert.Equal(t, 
test.expUIDMapping, spec.Linux.UIDMappings) + assert.Equal(t, test.expGIDMapping, spec.Linux.GIDMappings) + + if test.expNS != nil { + assert.Contains(t, spec.Linux.Namespaces, *test.expNS) + } + if test.expNotNS != nil { + assert.NotContains(t, spec.Linux.Namespaces, *test.expNotNS) + } + }) + } +} + func TestNoDefaultRunMount(t *testing.T) { testID := "test-id" testPid := uint32(1234) diff --git a/pkg/cri/server/container_create_other.go b/pkg/cri/server/container_create_other.go index 9cfb15a04..acab67c11 100644 --- a/pkg/cri/server/container_create_other.go +++ b/pkg/cri/server/container_create_other.go @@ -55,6 +55,6 @@ func (c *criService) containerSpecOpts(config *runtime.ContainerConfig, imageCon } // snapshotterOpts returns snapshotter options for the rootfs snapshot -func snapshotterOpts(snapshotterName string, config *runtime.ContainerConfig) []snapshots.Opt { - return []snapshots.Opt{} +func snapshotterOpts(snapshotterName string, config *runtime.ContainerConfig) ([]snapshots.Opt, error) { + return []snapshots.Opt{}, nil } diff --git a/pkg/cri/server/container_create_windows.go b/pkg/cri/server/container_create_windows.go index bd7ed0fa5..e11466545 100644 --- a/pkg/cri/server/container_create_windows.go +++ b/pkg/cri/server/container_create_windows.go @@ -145,7 +145,7 @@ func (c *criService) containerSpecOpts(config *runtime.ContainerConfig, imageCon } // snapshotterOpts returns any Windows specific snapshotter options for the r/w layer -func snapshotterOpts(snapshotterName string, config *runtime.ContainerConfig) []snapshots.Opt { +func snapshotterOpts(snapshotterName string, config *runtime.ContainerConfig) ([]snapshots.Opt, error) { var opts []snapshots.Opt switch snapshotterName { @@ -160,5 +160,5 @@ func snapshotterOpts(snapshotterName string, config *runtime.ContainerConfig) [] } } - return opts + return opts, nil } diff --git a/pkg/cri/server/helpers_linux.go b/pkg/cri/server/helpers_linux.go index 42b2d99a1..d0cb13006 100644 --- 
a/pkg/cri/server/helpers_linux.go +++ b/pkg/cri/server/helpers_linux.go @@ -28,11 +28,13 @@ import ( "syscall" "time" + "github.com/containerd/containerd" "github.com/containerd/containerd/log" "github.com/containerd/containerd/mount" "github.com/containerd/containerd/pkg/apparmor" "github.com/containerd/containerd/pkg/seccomp" "github.com/containerd/containerd/pkg/seutil" + "github.com/containerd/containerd/snapshots" "github.com/moby/sys/mountinfo" "github.com/opencontainers/runtime-spec/specs-go" "github.com/opencontainers/selinux/go-selinux/label" @@ -275,3 +277,92 @@ func modifyProcessLabel(runtimeType string, spec *specs.Spec) error { spec.Process.SelinuxLabel = l return nil } + +func parseUsernsIDMap(runtimeIDMap []*runtime.IDMapping) ([]specs.LinuxIDMapping, error) { + var m []specs.LinuxIDMapping + + if len(runtimeIDMap) == 0 { + return m, nil + } + + if len(runtimeIDMap) > 1 { + // We only accept 1 line, because containerd.WithRemappedSnapshot() only supports that. + return m, fmt.Errorf("only one mapping line supported, got %v mapping lines", len(runtimeIDMap)) + } + + // We know len is 1 now. + if runtimeIDMap[0] == nil { + return m, nil + } + uidMap := *runtimeIDMap[0] + + if uidMap.Length < 1 { + return m, fmt.Errorf("invalid mapping length: %v", uidMap.Length) + } + + m = []specs.LinuxIDMapping{ + { + ContainerID: uidMap.ContainerId, + HostID: uidMap.HostId, + Size: uidMap.Length, + }, + } + + return m, nil +} + +func parseUsernsIDs(userns *runtime.UserNamespace) (uids, gids []specs.LinuxIDMapping, retErr error) { + if userns == nil { + // If userns is not set, the kubelet doesn't support this option + // and we should just fallback to no userns. This is completely + // valid. 
+ return nil, nil, nil + } + + uidRuntimeMap := userns.GetUids() + gidRuntimeMap := userns.GetGids() + + uids, err := parseUsernsIDMap(uidRuntimeMap) + if err != nil { + return nil, nil, fmt.Errorf("UID mapping: %w", err) + } + + gids, err = parseUsernsIDMap(gidRuntimeMap) + if err != nil { + return nil, nil, fmt.Errorf("GID mapping: %w", err) + } + + switch mode := userns.GetMode(); mode { + case runtime.NamespaceMode_NODE: + if len(uids) != 0 || len(gids) != 0 { + return nil, nil, fmt.Errorf("can't use user namespace mode %q with mappings. Got %v UID mappings and %v GID mappings", mode, len(uids), len(gids)) + } + case runtime.NamespaceMode_POD: + // This is valid, we will handle it in WithPodNamespaces(). + if len(uids) == 0 || len(gids) == 0 { + return nil, nil, fmt.Errorf("can't use user namespace mode %q without UID and GID mappings", mode) + } + default: + return nil, nil, fmt.Errorf("unsupported user namespace mode: %q", mode) + } + + return uids, gids, nil +} + +func snapshotterRemapOpts(nsOpts *runtime.NamespaceOption) ([]snapshots.Opt, error) { + snapshotOpt := []snapshots.Opt{} + usernsOpts := nsOpts.GetUsernsOptions() + if usernsOpts == nil { + return snapshotOpt, nil + } + + uids, gids, err := parseUsernsIDs(usernsOpts) + if err != nil { + return nil, fmt.Errorf("user namespace configuration: %w", err) + } + + if usernsOpts.GetMode() == runtime.NamespaceMode_POD { + snapshotOpt = append(snapshotOpt, containerd.WithRemapperLabels(0, uids[0].HostID, 0, gids[0].HostID, uids[0].Size)) + } + return snapshotOpt, nil +} diff --git a/pkg/cri/server/sandbox_run.go b/pkg/cri/server/sandbox_run.go index acc4a16ef..5380a2664 100644 --- a/pkg/cri/server/sandbox_run.go +++ b/pkg/cri/server/sandbox_run.go @@ -23,6 +23,7 @@ import ( "fmt" "math" "path/filepath" + goruntime "runtime" "strings" "time" @@ -157,10 +158,17 @@ func (c *criService) RunPodSandbox(ctx context.Context, r *runtime.RunPodSandbox if err != nil { return nil, fmt.Errorf("failed to generate runtime 
options: %w", err) } - snapshotterOpt := snapshots.WithLabels(snapshots.FilterInheritedLabels(config.Annotations)) + + sOpts := []snapshots.Opt{snapshots.WithLabels(snapshots.FilterInheritedLabels(config.Annotations))} + extraSOpts, err := sandboxSnapshotterOpts(config) + if err != nil { + return nil, err + } + sOpts = append(sOpts, extraSOpts...) + opts := []containerd.NewContainerOpts{ containerd.WithSnapshotter(c.runtimeSnapshotter(ctx, ociRuntime)), - customopts.WithNewSnapshot(id, containerdImage, snapshotterOpt), + customopts.WithNewSnapshot(id, containerdImage, sOpts...), containerd.WithSpec(spec, specOpts...), containerd.WithContainerLabels(sandboxLabels), containerd.WithContainerExtension(sandboxMetadataExtension, &sandbox.Metadata), @@ -244,8 +252,27 @@ func (c *criService) RunPodSandbox(ctx context.Context, r *runtime.RunPodSandbox return nil, fmt.Errorf("failed to get sandbox container info: %w", err) } + userNsEnabled := false + if goruntime.GOOS != "windows" { + usernsOpts := config.GetLinux().GetSecurityContext().GetNamespaceOptions().GetUsernsOptions() + if usernsOpts != nil && usernsOpts.GetMode() == runtime.NamespaceMode_POD { + userNsEnabled = true + } + } + // Setup the network namespace if host networking wasn't requested. - if !hostNetwork(config) { + if !hostNetwork(config) && !userNsEnabled { + // XXX: We do c&p of this code later for the podNetwork && userNsEnabled case too. + // We can't move this to a function, as the defer calls need to be executed if other + // errors are returned in this function. So, we would need more refactors to move + // this code to a function and the idea was to not change the current code for + // !userNsEnabled case, therefore doing it would defeat the purpose. + // + // The difference between the cases is the use of netns.NewNetNS() vs + // netns.NewNetNSFromPID() and we verify the task is still running in the other case. 
+ // + // To simplify this, in the future, we should just remove this case (podNetwork && + // !userNsEnabled) and just keep the other case (podNetwork && userNsEnabled). netStart := time.Now() // If it is not in host network namespace then create a namespace and set the sandbox @@ -353,6 +380,88 @@ func (c *criService) RunPodSandbox(ctx context.Context, r *runtime.RunPodSandbox return nil, fmt.Errorf("failed to wait for sandbox container task: %w", err) } + if !hostNetwork(config) && userNsEnabled { + // If userns is enabled, then the netns was created by the OCI runtime + // when creating "task". The OCI runtime needs to create the netns + // because, if userns is in use, the netns needs to be owned by the + // userns. So, let the OCI runtime just handle this for us. + // If the netns is not owned by the userns several problems will happen. + // For instance, the container will lack permission (even if + // capabilities are present) to modify the netns or, even worse, the OCI + // runtime will fail to mount sysfs: + // https://github.com/torvalds/linux/commit/7dc5dbc879bd0779924b5132a48b731a0bc04a1e#diff-4839664cd0c8eab716e064323c7cd71fR1164 + netStart := time.Now() + + // If it is not in host network namespace then create a namespace and set the sandbox + // handle. NetNSPath in sandbox metadata and NetNS is non empty only for non host network + // namespaces. If the pod is in host network namespace then both are empty and should not + // be used. + var netnsMountDir = "/var/run/netns" + if c.config.NetNSMountsUnderStateDir { + netnsMountDir = filepath.Join(c.config.StateDir, "netns") + } + sandbox.NetNS, err = netns.NewNetNSFromPID(netnsMountDir, task.Pid()) + if err != nil { + return nil, fmt.Errorf("failed to create network namespace for sandbox %q: %w", id, err) + } + + // Verify task is still in created state. 
+ if st, err := task.Status(ctx); err != nil || st.Status != containerd.Created { + return nil, fmt.Errorf("failed to create pod sandbox %q: err is %v - status is %q and is expected %q", id, err, st.Status, containerd.Created) + } + sandbox.NetNSPath = sandbox.NetNS.GetPath() + + defer func() { + // Remove the network namespace only if all the resource cleanup is done. + if retErr != nil && cleanupErr == nil { + if cleanupErr = sandbox.NetNS.Remove(); cleanupErr != nil { + log.G(ctx).WithError(cleanupErr).Errorf("Failed to remove network namespace %s for sandbox %q", sandbox.NetNSPath, id) + return + } + sandbox.NetNSPath = "" + } + }() + + // Update network namespace in the container's spec + c.updateNetNamespacePath(spec, sandbox.NetNSPath) + + if err := container.Update(ctx, + // Update spec of the container + containerd.UpdateContainerOpts(containerd.WithSpec(spec)), + // Update sandbox metadata to include NetNS info + containerd.UpdateContainerOpts(containerd.WithContainerExtension(sandboxMetadataExtension, &sandbox.Metadata))); err != nil { + return nil, fmt.Errorf("failed to update the network namespace for the sandbox container %q: %w", id, err) + } + + // Define this defer to teardownPodNetwork prior to the setupPodNetwork function call. + // This is because in setupPodNetwork the resource is allocated even if it returns error, unlike other resource creation functions. + defer func() { + // Teardown the network only if all the resource cleanup is done. + if retErr != nil && cleanupErr == nil { + deferCtx, deferCancel := ctrdutil.DeferContext() + defer deferCancel() + // Teardown network if an error is returned. + if cleanupErr = c.teardownPodNetwork(deferCtx, sandbox); cleanupErr != nil { + log.G(ctx).WithError(cleanupErr).Errorf("Failed to destroy network for sandbox %q", id) + } + } + }() + + // Setup network for sandbox. 
+ // Certain VM based solutions like clear containers (Issue containerd/cri-containerd#524) + // rely on the assumption that CRI shim will not be querying the network namespace to check the + // network states such as IP. + // In future runtime implementation should avoid relying on CRI shim implementation details. + // In this case however caching the IP will add a subtle performance enhancement by avoiding + // calls to network namespace of the pod to query the IP of the veth interface on every + // SandboxStatus request. + if err := c.setupPodNetwork(ctx, &sandbox); err != nil { + return nil, fmt.Errorf("failed to setup network for sandbox %q: %w", id, err) + } + + sandboxCreateNetworkTimer.UpdateSince(netStart) + } + if c.nri.isEnabled() { err = c.nri.runPodSandbox(ctx, &sandbox) if err != nil { diff --git a/pkg/cri/server/sandbox_run_linux.go b/pkg/cri/server/sandbox_run_linux.go index 21bbb1baf..78cca50c6 100644 --- a/pkg/cri/server/sandbox_run_linux.go +++ b/pkg/cri/server/sandbox_run_linux.go @@ -25,6 +25,7 @@ import ( "github.com/containerd/containerd" "github.com/containerd/containerd/oci" "github.com/containerd/containerd/plugin" + "github.com/containerd/containerd/snapshots" imagespec "github.com/opencontainers/image-spec/specs-go/v1" runtimespec "github.com/opencontainers/runtime-spec/specs-go" selinux "github.com/opencontainers/selinux/go-selinux" @@ -95,6 +96,23 @@ func (c *criService) sandboxContainerSpec(id string, config *runtime.PodSandboxC specOpts = append(specOpts, customopts.WithoutNamespace(runtimespec.IPCNamespace)) } + usernsOpts := nsOptions.GetUsernsOptions() + uids, gids, err := parseUsernsIDs(usernsOpts) + if err != nil { + return nil, fmt.Errorf("user namespace configuration: %w", err) + } + + if usernsOpts != nil { + switch mode := usernsOpts.GetMode(); mode { + case runtime.NamespaceMode_NODE: + specOpts = append(specOpts, customopts.WithoutNamespace(runtimespec.UserNamespace)) + case runtime.NamespaceMode_POD: + specOpts = 
append(specOpts, oci.WithUserNamespace(uids, gids)) + default: + return nil, fmt.Errorf("unsupported user namespace mode: %q", mode) + } + } + // It's fine to generate the spec before the sandbox /dev/shm // is actually created. sandboxDevShm := c.getSandboxDevShm(id) @@ -358,3 +376,10 @@ func (c *criService) updateNetNamespacePath(spec *runtimespec.Spec, nsPath strin } } } + +// sandboxSnapshotterOpts generates any platform specific snapshotter options +// for a sandbox container. +func sandboxSnapshotterOpts(config *runtime.PodSandboxConfig) ([]snapshots.Opt, error) { + nsOpts := config.GetLinux().GetSecurityContext().GetNamespaceOptions() + return snapshotterRemapOpts(nsOpts) +} diff --git a/pkg/cri/server/sandbox_run_linux_test.go b/pkg/cri/server/sandbox_run_linux_test.go index 378136136..9c646e069 100644 --- a/pkg/cri/server/sandbox_run_linux_test.go +++ b/pkg/cri/server/sandbox_run_linux_test.go @@ -98,6 +98,17 @@ func getRunPodSandboxTestData() (*runtime.PodSandboxConfig, *imagespec.ImageConf func TestLinuxSandboxContainerSpec(t *testing.T) { testID := "test-id" nsPath := "test-cni" + idMap := runtime.IDMapping{ + HostId: 1000, + ContainerId: 1000, + Length: 10, + } + expIDMap := runtimespec.LinuxIDMapping{ + HostID: 1000, + ContainerID: 1000, + Size: 10, + } + for desc, test := range map[string]struct { configChange func(*runtime.PodSandboxConfig) specCheck func(*testing.T, *runtimespec.Spec) @@ -122,6 +133,9 @@ func TestLinuxSandboxContainerSpec(t *testing.T) { }) assert.Contains(t, spec.Linux.Sysctl["net.ipv4.ip_unprivileged_port_start"], "0") assert.Contains(t, spec.Linux.Sysctl["net.ipv4.ping_group_range"], "0 2147483647") + assert.NotContains(t, spec.Linux.Namespaces, runtimespec.LinuxNamespace{ + Type: runtimespec.UserNamespace, + }) }, }, "host namespace": { @@ -149,10 +163,113 @@ func TestLinuxSandboxContainerSpec(t *testing.T) { assert.NotContains(t, spec.Linux.Namespaces, runtimespec.LinuxNamespace{ Type: runtimespec.IPCNamespace, }) + 
assert.NotContains(t, spec.Linux.Namespaces, runtimespec.LinuxNamespace{ + Type: runtimespec.UserNamespace, + }) assert.NotContains(t, spec.Linux.Sysctl["net.ipv4.ip_unprivileged_port_start"], "0") assert.NotContains(t, spec.Linux.Sysctl["net.ipv4.ping_group_range"], "0 2147483647") }, }, + "user namespace": { + configChange: func(c *runtime.PodSandboxConfig) { + c.Linux.SecurityContext = &runtime.LinuxSandboxSecurityContext{ + NamespaceOptions: &runtime.NamespaceOption{ + UsernsOptions: &runtime.UserNamespace{ + Mode: runtime.NamespaceMode_POD, + Uids: []*runtime.IDMapping{&idMap}, + Gids: []*runtime.IDMapping{&idMap}, + }, + }, + } + }, + specCheck: func(t *testing.T, spec *runtimespec.Spec) { + require.NotNil(t, spec.Linux) + assert.Contains(t, spec.Linux.Namespaces, runtimespec.LinuxNamespace{ + Type: runtimespec.UserNamespace, + }) + require.Equal(t, spec.Linux.UIDMappings, []runtimespec.LinuxIDMapping{expIDMap}) + require.Equal(t, spec.Linux.GIDMappings, []runtimespec.LinuxIDMapping{expIDMap}) + + }, + }, + "user namespace mode node and mappings": { + configChange: func(c *runtime.PodSandboxConfig) { + c.Linux.SecurityContext = &runtime.LinuxSandboxSecurityContext{ + NamespaceOptions: &runtime.NamespaceOption{ + UsernsOptions: &runtime.UserNamespace{ + Mode: runtime.NamespaceMode_NODE, + Uids: []*runtime.IDMapping{&idMap}, + Gids: []*runtime.IDMapping{&idMap}, + }, + }, + } + }, + expectErr: true, + }, + "user namespace with several mappings": { + configChange: func(c *runtime.PodSandboxConfig) { + c.Linux.SecurityContext = &runtime.LinuxSandboxSecurityContext{ + NamespaceOptions: &runtime.NamespaceOption{ + UsernsOptions: &runtime.UserNamespace{ + Mode: runtime.NamespaceMode_NODE, + Uids: []*runtime.IDMapping{&idMap, &idMap}, + Gids: []*runtime.IDMapping{&idMap, &idMap}, + }, + }, + } + }, + expectErr: true, + }, + "user namespace with uneven mappings": { + configChange: func(c *runtime.PodSandboxConfig) { + c.Linux.SecurityContext = 
&runtime.LinuxSandboxSecurityContext{ + NamespaceOptions: &runtime.NamespaceOption{ + UsernsOptions: &runtime.UserNamespace{ + Mode: runtime.NamespaceMode_NODE, + Uids: []*runtime.IDMapping{&idMap, &idMap}, + Gids: []*runtime.IDMapping{&idMap}, + }, + }, + } + }, + expectErr: true, + }, + "user namespace mode container": { + configChange: func(c *runtime.PodSandboxConfig) { + c.Linux.SecurityContext = &runtime.LinuxSandboxSecurityContext{ + NamespaceOptions: &runtime.NamespaceOption{ + UsernsOptions: &runtime.UserNamespace{ + Mode: runtime.NamespaceMode_CONTAINER, + }, + }, + } + }, + expectErr: true, + }, + "user namespace mode target": { + configChange: func(c *runtime.PodSandboxConfig) { + c.Linux.SecurityContext = &runtime.LinuxSandboxSecurityContext{ + NamespaceOptions: &runtime.NamespaceOption{ + UsernsOptions: &runtime.UserNamespace{ + Mode: runtime.NamespaceMode_TARGET, + }, + }, + } + }, + expectErr: true, + }, + "user namespace unknown mode": { + configChange: func(c *runtime.PodSandboxConfig) { + c.Linux.SecurityContext = &runtime.LinuxSandboxSecurityContext{ + NamespaceOptions: &runtime.NamespaceOption{ + UsernsOptions: &runtime.UserNamespace{ + Mode: runtime.NamespaceMode(100), + }, + }, + } + }, + expectErr: true, + }, "should set supplemental groups correctly": { configChange: func(c *runtime.PodSandboxConfig) { c.Linux.SecurityContext = &runtime.LinuxSandboxSecurityContext{ diff --git a/pkg/cri/server/sandbox_run_other.go b/pkg/cri/server/sandbox_run_other.go index 150cc917d..1676b4760 100644 --- a/pkg/cri/server/sandbox_run_other.go +++ b/pkg/cri/server/sandbox_run_other.go @@ -21,6 +21,7 @@ package server import ( "github.com/containerd/containerd" "github.com/containerd/containerd/oci" + "github.com/containerd/containerd/snapshots" imagespec "github.com/opencontainers/image-spec/specs-go/v1" runtimespec "github.com/opencontainers/runtime-spec/specs-go" runtime "k8s.io/cri-api/pkg/apis/runtime/v1" @@ -56,3 +57,9 @@ func (c *criService) 
taskOpts(runtimeType string) []containerd.NewTaskOpts { func (c *criService) updateNetNamespacePath(spec *runtimespec.Spec, nsPath string) { } + +// sandboxSnapshotterOpts generates any platform specific snapshotter options +// for a sandbox container. +func sandboxSnapshotterOpts(config *runtime.PodSandboxConfig) ([]snapshots.Opt, error) { + return []snapshots.Opt{}, nil +} diff --git a/pkg/cri/server/sandbox_run_windows.go b/pkg/cri/server/sandbox_run_windows.go index 017007f66..10b9b2faf 100644 --- a/pkg/cri/server/sandbox_run_windows.go +++ b/pkg/cri/server/sandbox_run_windows.go @@ -22,6 +22,7 @@ import ( "github.com/containerd/containerd" "github.com/containerd/containerd/oci" + "github.com/containerd/containerd/snapshots" imagespec "github.com/opencontainers/image-spec/specs-go/v1" runtimespec "github.com/opencontainers/runtime-spec/specs-go" runtime "k8s.io/cri-api/pkg/apis/runtime/v1" @@ -116,3 +117,8 @@ func (c *criService) taskOpts(runtimeType string) []containerd.NewTaskOpts { func (c *criService) updateNetNamespacePath(spec *runtimespec.Spec, nsPath string) { spec.Windows.Network.NetworkNamespace = nsPath } + +// No sandbox snapshotter options needed for windows. +func sandboxSnapshotterOpts(config *runtime.PodSandboxConfig) ([]snapshots.Opt, error) { + return []snapshots.Opt{}, nil +} diff --git a/pkg/netns/netns_linux.go b/pkg/netns/netns_linux.go index 03f68a568..6e1a61f72 100644 --- a/pkg/netns/netns_linux.go +++ b/pkg/netns/netns_linux.go @@ -50,7 +50,9 @@ import ( // newNS creates a new persistent (bind-mounted) network namespace and returns the // path to the network namespace. -func newNS(baseDir string) (nsPath string, err error) { +// If pid is not 0, returns the netns from that pid persistently mounted. Otherwise, +// a new netns is created. 
+func newNS(baseDir string, pid uint32) (nsPath string, err error) { b := make([]byte, 16) _, err = rand.Read(b) @@ -81,6 +83,16 @@ func newNS(baseDir string) (nsPath string, err error) { } }() + if pid != 0 { + procNsPath := getNetNSPathFromPID(pid) + // bind mount the netns onto the mount point. This causes the namespace + // to persist, even when there are no threads in the ns. + if err = unix.Mount(procNsPath, nsPath, "none", unix.MS_BIND, ""); err != nil { + return "", fmt.Errorf("failed to bind mount ns src: %v at %s: %w", procNsPath, nsPath, err) + } + return nsPath, nil + } + var wg sync.WaitGroup wg.Add(1) @@ -155,6 +167,10 @@ func getCurrentThreadNetNSPath() string { return fmt.Sprintf("/proc/%d/task/%d/ns/net", os.Getpid(), unix.Gettid()) } +func getNetNSPathFromPID(pid uint32) string { + return fmt.Sprintf("/proc/%d/ns/net", pid) +} + // NetNS holds network namespace. type NetNS struct { path string @@ -162,7 +178,12 @@ type NetNS struct { // NewNetNS creates a network namespace. func NewNetNS(baseDir string) (*NetNS, error) { - path, err := newNS(baseDir) + return NewNetNSFromPID(baseDir, 0) +} + +// NewNetNSFromPID returns the netns from pid or a new netns if pid is 0. +func NewNetNSFromPID(baseDir string, pid uint32) (*NetNS, error) { + path, err := newNS(baseDir, pid) if err != nil { return nil, fmt.Errorf("failed to setup netns: %w", err) } diff --git a/pkg/netns/netns_other.go b/pkg/netns/netns_other.go index ec8124ceb..3cd60ef3f 100644 --- a/pkg/netns/netns_other.go +++ b/pkg/netns/netns_other.go @@ -35,6 +35,11 @@ func NewNetNS(baseDir string) (*NetNS, error) { return nil, errNotImplementedOnUnix } +// NewNetNSFromPID is not implemented on this platform and always returns an error. +func NewNetNSFromPID(baseDir string, pid uint32) (*NetNS, error) { + return nil, errNotImplementedOnUnix +} + // LoadNetNS loads existing network namespace.
func LoadNetNS(path string) *NetNS { return &NetNS{path: path} diff --git a/pkg/netns/netns_windows.go b/pkg/netns/netns_windows.go index de02094b8..2d26d6f71 100644 --- a/pkg/netns/netns_windows.go +++ b/pkg/netns/netns_windows.go @@ -16,14 +16,20 @@ package netns -import "github.com/Microsoft/hcsshim/hcn" +import ( + "errors" + + "github.com/Microsoft/hcsshim/hcn" +) + +var errNotImplementedOnWindows = errors.New("not implemented on windows") // NetNS holds network namespace for sandbox type NetNS struct { path string } -// NewNetNS creates a network namespace for the sandbox +// NewNetNS creates a network namespace for the sandbox. func NewNetNS(baseDir string) (*NetNS, error) { temp := hcn.HostComputeNamespace{} hcnNamespace, err := temp.Create() @@ -34,6 +40,11 @@ func NewNetNS(baseDir string) (*NetNS, error) { return &NetNS{path: hcnNamespace.Id}, nil } +// NewNetNSFromPID is not implemented on Windows and always returns an error. +func NewNetNSFromPID(baseDir string, pid uint32) (*NetNS, error) { + return nil, errNotImplementedOnWindows +} + // LoadNetNS loads existing network namespace. func LoadNetNS(path string) *NetNS { return &NetNS{path: path} diff --git a/snapshots/snapshotter.go b/snapshots/snapshotter.go index 9da28583b..5fa5aa530 100644 --- a/snapshots/snapshotter.go +++ b/snapshots/snapshotter.go @@ -33,6 +33,11 @@ const ( UnpackKeyFormat = UnpackKeyPrefix + "-%s %s" inheritedLabelsPrefix = "containerd.io/snapshot/" labelSnapshotRef = "containerd.io/snapshot.ref" + + // LabelSnapshotUIDMapping is the label used for UID mappings + LabelSnapshotUIDMapping = "containerd.io/snapshot/uidmapping" + // LabelSnapshotGIDMapping is the label used for GID mappings + LabelSnapshotGIDMapping = "containerd.io/snapshot/gidmapping" ) // Kind identifies the kind of snapshot.
diff --git a/snapshotter_opts_unix.go b/snapshotter_opts_unix.go
index e25588b09..4739e192f 100644
--- a/snapshotter_opts_unix.go
+++ b/snapshotter_opts_unix.go
@@ -19,17 +19,100 @@
 package containerd
 
 import (
+	"context"
 	"fmt"
 
 	"github.com/containerd/containerd/snapshots"
 )
 
+const (
+	// capabRemapIDs is the capability reported by snapshotters that remap IDs themselves.
+	capabRemapIDs = "remap-ids"
+)
+
 // WithRemapperLabels creates the labels used by any supporting snapshotter
 // to shift the filesystem ownership (user namespace mapping) automatically; currently
 // supported by the fuse-overlayfs snapshotter
 func WithRemapperLabels(ctrUID, hostUID, ctrGID, hostGID, length uint32) snapshots.Opt {
 	return snapshots.WithLabels(map[string]string{
-		"containerd.io/snapshot/uidmapping": fmt.Sprintf("%d:%d:%d", ctrUID, hostUID, length),
-		"containerd.io/snapshot/gidmapping": fmt.Sprintf("%d:%d:%d", ctrGID, hostGID, length),
-	})
+		snapshots.LabelSnapshotUIDMapping: fmt.Sprintf("%d:%d:%d", ctrUID, hostUID, length),
+		snapshots.LabelSnapshotGIDMapping: fmt.Sprintf("%d:%d:%d", ctrGID, hostGID, length)})
+}
+
+// resolveSnapshotOptions returns the snapshot key to use as the parent of a new
+// container snapshot, taking any ID-mapping labels set by opts into account.
+//
+// parent is returned unchanged when opts set neither mapping label, or when the
+// snapshotter reports the "remap-ids" capability. Otherwise the key
+// "<parent>-<hostUID>-<hostGID>" is returned; if Stat fails for that key the
+// snapshot is created first: a snapshot is prepared on top of parent, its root
+// filesystem is remapped with remapRootFS, and the result is committed.
+//
+// Only mappings with container UID and GID 0 are supported.
+func resolveSnapshotOptions(ctx context.Context, client *Client, snapshotterName string, snapshotter snapshots.Snapshotter, parent string, opts ...snapshots.Opt) (string, error) {
+	// Apply the options to a scratch Info first so that callers which do
+	// not request remapping skip the introspection round trip entirely.
+	var local snapshots.Info
+	for _, opt := range opts {
+		if err := opt(&local); err != nil {
+			return "", err
+		}
+	}
+
+	uidMap, hasUIDMap := local.Labels[snapshots.LabelSnapshotUIDMapping]
+	gidMap, hasGIDMap := local.Labels[snapshots.LabelSnapshotGIDMapping]
+	if !hasUIDMap && !hasGIDMap {
+		return parent, nil
+	}
+
+	capabs, err := client.GetSnapshotterCapabilities(ctx, snapshotterName)
+	if err != nil {
+		return "", err
+	}
+	for _, capab := range capabs {
+		if capab == capabRemapIDs {
+			// Snapshotter supports ID remapping, we don't need to do anything.
+			return parent, nil
+		}
+	}
+
+	var ctrUID, hostUID, length uint32
+	if _, err := fmt.Sscanf(uidMap, "%d:%d:%d", &ctrUID, &hostUID, &length); err != nil {
+		return "", fmt.Errorf("uidMap unparsable: %w", err)
+	}
+
+	var ctrGID, hostGID, lengthGID uint32
+	if _, err := fmt.Sscanf(gidMap, "%d:%d:%d", &ctrGID, &hostGID, &lengthGID); err != nil {
+		return "", fmt.Errorf("gidMap unparsable: %w", err)
+	}
+
+	if ctrUID != 0 || ctrGID != 0 {
+		return "", fmt.Errorf("Container UID/GID of 0 only supported currently (%d/%d)", ctrUID, ctrGID)
+	}
+
+	// TODO(dgl): length isn't taken into account for the intermediate snapshot id.
+	usernsID := fmt.Sprintf("%s-%d-%d", parent, hostUID, hostGID)
+	if _, err := snapshotter.Stat(ctx, usernsID); err == nil {
+		return usernsID, nil
+	}
+
+	remapKey := usernsID + "-remap"
+	mounts, err := snapshotter.Prepare(ctx, remapKey, parent)
+	if err != nil {
+		return "", err
+	}
+	// TODO(dgl): length isn't taken into account here yet either.
+	if err := remapRootFS(ctx, mounts, hostUID, hostGID); err != nil {
+		// Best-effort cleanup; the remap failure is the error to report.
+		_ = snapshotter.Remove(ctx, remapKey)
+		return "", err
+	}
+	if err := snapshotter.Commit(ctx, usernsID, remapKey); err != nil {
+		// Remove the temporary snapshot here as well, so that a retry can
+		// prepare the same key again.
+		_ = snapshotter.Remove(ctx, remapKey)
+		return "", err
+	}
+
+	return usernsID, nil
 }
diff --git a/snapshotter_opts_windows.go b/snapshotter_opts_windows.go
new file mode 100644
index 000000000..540bcb313
--- /dev/null
+++ b/snapshotter_opts_windows.go
@@ -0,0 +1,29 @@
+/*
+   Copyright The containerd Authors.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+*/
+
+package containerd
+
+import (
+	"context"
+
+	"github.com/containerd/containerd/snapshots"
+)
+
+// resolveSnapshotOptions returns parent unchanged; no snapshot options are
+// resolved on Windows.
+func resolveSnapshotOptions(ctx context.Context, client *Client, snapshotterName string, snapshotter snapshots.Snapshotter, parent string, opts ...snapshots.Opt) (string, error) {
+	return parent, nil
+}