diff --git a/integration/pod_userns_linux_test.go b/integration/pod_userns_linux_test.go index b020d64ae..e0a561433 100644 --- a/integration/pod_userns_linux_test.go +++ b/integration/pod_userns_linux_test.go @@ -94,9 +94,6 @@ func TestPodUserNS(t *testing.T) { }, } { t.Run(name, func(t *testing.T) { - if os.Getenv("ENABLE_CRI_SANDBOXES") == "'sandboxed'" { - t.Skip("skipping test: userns not supported/needed in sanboxed runtimes") - } cmd := exec.Command("true") cmd.SysProcAttr = &syscall.SysProcAttr{ Cloneflags: syscall.CLONE_NEWUSER, diff --git a/pkg/cri/sbserver/container_create.go b/pkg/cri/sbserver/container_create.go index 5d0ebc516..8ad47107c 100644 --- a/pkg/cri/sbserver/container_create.go +++ b/pkg/cri/sbserver/container_create.go @@ -206,7 +206,10 @@ func (c *criService) CreateContainer(ctx context.Context, r *runtime.CreateConta log.G(ctx).Debugf("Container %q spec: %#+v", id, spew.NewFormatter(spec)) // Grab any platform specific snapshotter opts. - sOpts := snapshotterOpts(c.config.ContainerdConfig.Snapshotter, config) + sOpts, err := snapshotterOpts(c.config.ContainerdConfig.Snapshotter, config) + if err != nil { + return nil, err + } // Set snapshotter before any other options. opts := []containerd.NewContainerOpts{ diff --git a/pkg/cri/sbserver/container_create_linux.go b/pkg/cri/sbserver/container_create_linux.go index f33438d65..71ad55476 100644 --- a/pkg/cri/sbserver/container_create_linux.go +++ b/pkg/cri/sbserver/container_create_linux.go @@ -264,6 +264,7 @@ func appArmorProfileExists(profile string) (bool, error) { } // snapshotterOpts returns any Linux specific snapshotter options for the rootfs snapshot -func snapshotterOpts(snapshotterName string, config *runtime.ContainerConfig) []snapshots.Opt { - return []snapshots.Opt{} +func snapshotterOpts(snapshotterName string, config *runtime.ContainerConfig) ([]snapshots.Opt, error) { + nsOpts := config.GetLinux().GetSecurityContext().GetNamespaceOptions() + return snapshotterRemapOpts(nsOpts) } diff --git a/pkg/cri/sbserver/container_create_other.go b/pkg/cri/sbserver/container_create_other.go index c97d2f568..a5feb385c 100644 --- a/pkg/cri/sbserver/container_create_other.go +++ b/pkg/cri/sbserver/container_create_other.go @@ -31,6 +31,6 @@ func (c *criService) containerSpecOpts(config *runtime.ContainerConfig, imageCon } // snapshotterOpts returns snapshotter options for the rootfs snapshot -func snapshotterOpts(snapshotterName string, config *runtime.ContainerConfig) []snapshots.Opt { - return []snapshots.Opt{} +func snapshotterOpts(snapshotterName string, config *runtime.ContainerConfig) ([]snapshots.Opt, error) { + return []snapshots.Opt{}, nil } diff --git a/pkg/cri/sbserver/container_create_windows.go b/pkg/cri/sbserver/container_create_windows.go index 66bcedf0f..7ab18a5aa 100644 --- a/pkg/cri/sbserver/container_create_windows.go +++ b/pkg/cri/sbserver/container_create_windows.go @@ -32,7 +32,7 @@ func (c *criService) containerSpecOpts(config *runtime.ContainerConfig, imageCon } // snapshotterOpts returns any Windows specific snapshotter options for the r/w layer -func snapshotterOpts(snapshotterName string, config *runtime.ContainerConfig) []snapshots.Opt { +func snapshotterOpts(snapshotterName string, config *runtime.ContainerConfig) ([]snapshots.Opt, error) { var opts []snapshots.Opt switch snapshotterName { @@ -47,5 +47,5 @@ func snapshotterOpts(snapshotterName string, config *runtime.ContainerConfig) [] } } - return opts + return opts, nil } diff --git a/pkg/cri/sbserver/helpers_linux.go b/pkg/cri/sbserver/helpers_linux.go index a718f3af4..bd142ecc0 100644 --- a/pkg/cri/sbserver/helpers_linux.go +++ b/pkg/cri/sbserver/helpers_linux.go @@ -30,12 +30,15 @@ import ( "github.com/moby/sys/mountinfo" "github.com/opencontainers/runtime-spec/specs-go" "golang.org/x/sys/unix" + runtime "k8s.io/cri-api/pkg/apis/runtime/v1" + "github.com/containerd/containerd" "github.com/containerd/containerd/log" "github.com/containerd/containerd/mount" "github.com/containerd/containerd/pkg/apparmor" "github.com/containerd/containerd/pkg/seccomp" "github.com/containerd/containerd/pkg/seutil" + "github.com/containerd/containerd/snapshots" ) // apparmorEnabled returns true if apparmor is enabled, supported by the host, @@ -181,3 +184,21 @@ func modifyProcessLabel(runtimeType string, spec *specs.Spec) error { func isUnifiedCgroupsMode() bool { return cgroups.Mode() == cgroups.Unified } + +func snapshotterRemapOpts(nsOpts *runtime.NamespaceOption) ([]snapshots.Opt, error) { + snapshotOpt := []snapshots.Opt{} + usernsOpts := nsOpts.GetUsernsOptions() + if usernsOpts == nil { + return snapshotOpt, nil + } + + uids, gids, err := parseUsernsIDs(usernsOpts) + if err != nil { + return nil, fmt.Errorf("user namespace configuration: %w", err) + } + + if usernsOpts.GetMode() == runtime.NamespaceMode_POD { + snapshotOpt = append(snapshotOpt, containerd.WithRemapperLabels(0, uids[0].HostID, 0, gids[0].HostID, uids[0].Size)) + } + return snapshotOpt, nil +} diff --git a/pkg/cri/sbserver/podsandbox/helpers_linux.go b/pkg/cri/sbserver/podsandbox/helpers_linux.go index a0a33b2bc..f51e9907e 100644 --- a/pkg/cri/sbserver/podsandbox/helpers_linux.go +++ b/pkg/cri/sbserver/podsandbox/helpers_linux.go @@ -28,12 +28,15 @@ import ( "syscall" "time" + "github.com/containerd/containerd" "github.com/containerd/containerd/log" "github.com/containerd/containerd/mount" "github.com/containerd/containerd/pkg/seccomp" "github.com/containerd/containerd/pkg/seutil" + "github.com/containerd/containerd/snapshots" + "github.com/moby/sys/mountinfo" - "github.com/opencontainers/runtime-spec/specs-go" + runtimespec "github.com/opencontainers/runtime-spec/specs-go" "github.com/opencontainers/selinux/go-selinux/label" "golang.org/x/sys/unix" runtime "k8s.io/cri-api/pkg/apis/runtime/v1" @@ -242,7 +245,7 @@ func isVMBasedRuntime(runtimeType string) bool { return false } -func modifyProcessLabel(runtimeType string, spec *specs.Spec) error { +func modifyProcessLabel(runtimeType string, spec *runtimespec.Spec) error { if !isVMBasedRuntime(runtimeType) { return nil } @@ -253,3 +256,89 @@ func modifyProcessLabel(runtimeType string, spec *specs.Spec) error { spec.Process.SelinuxLabel = l return nil } + +func parseUsernsIDMap(runtimeIDMap []*runtime.IDMapping) ([]runtimespec.LinuxIDMapping, error) { + var m []runtimespec.LinuxIDMapping + + if len(runtimeIDMap) == 0 { + return m, nil + } + + if len(runtimeIDMap) > 1 { + // We only accept 1 line, because containerd.WithRemappedSnapshot() only supports that. + return m, fmt.Errorf("only one mapping line supported, got %v mapping lines", len(runtimeIDMap)) + } + + // We know len is 1 now. + if runtimeIDMap[0] == nil { + return m, nil + } + uidMap := *runtimeIDMap[0] + + if uidMap.Length < 1 { + return m, fmt.Errorf("invalid mapping length: %v", uidMap.Length) + } + + m = []runtimespec.LinuxIDMapping{ + { + ContainerID: uidMap.ContainerId, + HostID: uidMap.HostId, + Size: uidMap.Length, + }, + } + + return m, nil +} + +func parseUsernsIDs(userns *runtime.UserNamespace) (uids, gids []runtimespec.LinuxIDMapping, retErr error) { + if userns == nil { + // If userns is not set, the kubelet doesn't support this option + // and we should just fallback to no userns. This is completely + // valid. + return nil, nil, nil + } + + uids, err := parseUsernsIDMap(userns.GetUids()) + if err != nil { + return nil, nil, fmt.Errorf("UID mapping: %w", err) + } + + gids, err = parseUsernsIDMap(userns.GetGids()) + if err != nil { + return nil, nil, fmt.Errorf("GID mapping: %w", err) + } + + switch mode := userns.GetMode(); mode { + case runtime.NamespaceMode_NODE: + if len(uids) != 0 || len(gids) != 0 { + return nil, nil, fmt.Errorf("can't use user namespace mode %q with mappings. Got %v UID mappings and %v GID mappings", mode, len(uids), len(gids)) + } + case runtime.NamespaceMode_POD: + // This is valid, we will handle it in WithPodNamespaces(). + if len(uids) == 0 || len(gids) == 0 { + return nil, nil, fmt.Errorf("can't use user namespace mode %q without UID and GID mappings", mode) + } + default: + return nil, nil, fmt.Errorf("unsupported user namespace mode: %q", mode) + } + + return uids, gids, nil +} + +func snapshotterRemapOpts(nsOpts *runtime.NamespaceOption) ([]snapshots.Opt, error) { + snapshotOpt := []snapshots.Opt{} + usernsOpts := nsOpts.GetUsernsOptions() + if usernsOpts == nil { + return snapshotOpt, nil + } + + uids, gids, err := parseUsernsIDs(usernsOpts) + if err != nil { + return nil, fmt.Errorf("user namespace configuration: %w", err) + } + + if usernsOpts.GetMode() == runtime.NamespaceMode_POD { + snapshotOpt = append(snapshotOpt, containerd.WithRemapperLabels(0, uids[0].HostID, 0, gids[0].HostID, uids[0].Size)) + } + return snapshotOpt, nil +} diff --git a/pkg/cri/sbserver/podsandbox/sandbox_run.go b/pkg/cri/sbserver/podsandbox/sandbox_run.go index 171c8ff50..6357be370 100644 --- a/pkg/cri/sbserver/podsandbox/sandbox_run.go +++ b/pkg/cri/sbserver/podsandbox/sandbox_run.go @@ -136,10 +136,16 @@ func (c *Controller) Start(ctx context.Context, id string) (cin sandbox.Controll sandboxLabels := buildLabels(config.Labels, image.ImageSpec.Config.Labels, containerKindSandbox) - snapshotterOpt := snapshots.WithLabels(snapshots.FilterInheritedLabels(config.Annotations)) + snapshotterOpt := []snapshots.Opt{snapshots.WithLabels(snapshots.FilterInheritedLabels(config.Annotations))} + extraSOpts, err := sandboxSnapshotterOpts(config) + if err != nil { + return cin, err + } + snapshotterOpt = append(snapshotterOpt, extraSOpts...) + opts := []containerd.NewContainerOpts{ containerd.WithSnapshotter(c.runtimeSnapshotter(ctx, ociRuntime)), - customopts.WithNewSnapshot(id, containerdImage, snapshotterOpt), + customopts.WithNewSnapshot(id, containerdImage, snapshotterOpt...), containerd.WithSpec(spec, specOpts...), containerd.WithContainerLabels(sandboxLabels), containerd.WithContainerExtension(sandboxMetadataExtension, &metadata), diff --git a/pkg/cri/sbserver/podsandbox/sandbox_run_linux.go b/pkg/cri/sbserver/podsandbox/sandbox_run_linux.go index 576d66c79..3c75187a3 100644 --- a/pkg/cri/sbserver/podsandbox/sandbox_run_linux.go +++ b/pkg/cri/sbserver/podsandbox/sandbox_run_linux.go @@ -32,6 +32,7 @@ import ( "github.com/containerd/containerd/pkg/cri/annotations" customopts "github.com/containerd/containerd/pkg/cri/opts" "github.com/containerd/containerd/pkg/userns" + "github.com/containerd/containerd/snapshots" ) func (c *Controller) sandboxContainerSpec(id string, config *runtime.PodSandboxConfig, @@ -92,6 +93,25 @@ func (c *Controller) sandboxContainerSpec(id string, config *runtime.PodSandboxC specOpts = append(specOpts, customopts.WithoutNamespace(runtimespec.IPCNamespace)) } + usernsOpts := nsOptions.GetUsernsOptions() + uids, gids, err := parseUsernsIDs(usernsOpts) + var usernsEnabled bool + if err != nil { + return nil, fmt.Errorf("user namespace configuration: %w", err) + } + + if usernsOpts != nil { + switch mode := usernsOpts.GetMode(); mode { + case runtime.NamespaceMode_NODE: + specOpts = append(specOpts, customopts.WithoutNamespace(runtimespec.UserNamespace)) + case runtime.NamespaceMode_POD: + specOpts = append(specOpts, oci.WithUserNamespace(uids, gids)) + usernsEnabled = true + default: + return nil, fmt.Errorf("unsupported user namespace mode: %q", mode) + } + } + // It's fine to generate the spec before the sandbox /dev/shm // is actually created. sandboxDevShm := c.getSandboxDevShm(id) @@ -100,9 +120,9 @@ func (c *Controller) sandboxContainerSpec(id string, config *runtime.PodSandboxC } // Remove the default /dev/shm mount from defaultMounts, it is added in oci/mounts.go. specOpts = append(specOpts, oci.WithoutMounts(devShm)) - // In future the when user-namespace is enabled, the `nosuid, nodev, noexec` flags are - // required, otherwise the remount will fail with EPERM. Just use them unconditionally, - // they are nice to have anyways. + // When user-namespace is enabled, the `nosuid, nodev, noexec` flags are + // required, otherwise the remount will fail with EPERM. Just use them + // unconditionally, they are nice to have anyways. specOpts = append(specOpts, oci.WithMounts([]runtimespec.Mount{ { Source: sandboxDevShm, @@ -146,10 +166,7 @@ func (c *Controller) sandboxContainerSpec(id string, config *runtime.PodSandboxC if c.config.EnableUnprivilegedPorts && !ipUnprivilegedPortStart { sysctls["net.ipv4.ip_unprivileged_port_start"] = "0" } - // TODO (rata): We need to set this only if the pod will - // **not** use user namespaces either. - // This will be done when user namespaces is ported to sbserver. - if c.config.EnableUnprivilegedICMP && !pingGroupRange && !userns.RunningInUserNS() { + if c.config.EnableUnprivilegedICMP && !pingGroupRange && !userns.RunningInUserNS() && !usernsEnabled { sysctls["net.ipv4.ping_group_range"] = "0 2147483647" } } @@ -326,3 +343,10 @@ func (c *Controller) cleanupSandboxFiles(id string, config *runtime.PodSandboxCo } return nil } + +// sandboxSnapshotterOpts generates any platform specific snapshotter options +// for a sandbox container. +func sandboxSnapshotterOpts(config *runtime.PodSandboxConfig) ([]snapshots.Opt, error) { + nsOpts := config.GetLinux().GetSecurityContext().GetNamespaceOptions() + return snapshotterRemapOpts(nsOpts) +} diff --git a/pkg/cri/sbserver/podsandbox/sandbox_run_linux_test.go b/pkg/cri/sbserver/podsandbox/sandbox_run_linux_test.go index 82a914917..850d49571 100644 --- a/pkg/cri/sbserver/podsandbox/sandbox_run_linux_test.go +++ b/pkg/cri/sbserver/podsandbox/sandbox_run_linux_test.go @@ -106,6 +106,17 @@ func getRunPodSandboxTestData() (*runtime.PodSandboxConfig, *imagespec.ImageConf func TestLinuxSandboxContainerSpec(t *testing.T) { testID := "test-id" nsPath := "test-cni" + idMap := runtime.IDMapping{ + HostId: 1000, + ContainerId: 1000, + Length: 10, + } + expIDMap := runtimespec.LinuxIDMapping{ + HostID: 1000, + ContainerID: 1000, + Size: 10, + } + for _, test := range []struct { desc string configChange func(*runtime.PodSandboxConfig) @@ -134,6 +145,27 @@ func TestLinuxSandboxContainerSpec(t *testing.T) { assert.Contains(t, spec.Linux.Sysctl["net.ipv4.ping_group_range"], "0 2147483647") }, }, + { + desc: "spec shouldn't have ping_group_range if userns are in use", + configChange: func(c *runtime.PodSandboxConfig) { + c.Linux.SecurityContext = &runtime.LinuxSandboxSecurityContext{ + NamespaceOptions: &runtime.NamespaceOption{ + UsernsOptions: &runtime.UserNamespace{ + Mode: runtime.NamespaceMode_POD, + Uids: []*runtime.IDMapping{&idMap}, + Gids: []*runtime.IDMapping{&idMap}, + }, + }, + } + }, + specCheck: func(t *testing.T, spec *runtimespec.Spec) { + require.NotNil(t, spec.Linux) + assert.Contains(t, spec.Linux.Namespaces, runtimespec.LinuxNamespace{ + Type: runtimespec.UserNamespace, + }) + assert.NotContains(t, spec.Linux.Sysctl["net.ipv4.ping_group_range"], "0 2147483647") + }, + }, { desc: "host namespace", configChange: func(c *runtime.PodSandboxConfig) { @@ -164,6 +196,113 @@ func TestLinuxSandboxContainerSpec(t *testing.T) { assert.NotContains(t, spec.Linux.Sysctl["net.ipv4.ping_group_range"], "0 2147483647") }, }, + { + desc: "user namespace", + configChange: func(c *runtime.PodSandboxConfig) { + c.Linux.SecurityContext = &runtime.LinuxSandboxSecurityContext{ + NamespaceOptions: &runtime.NamespaceOption{ + UsernsOptions: &runtime.UserNamespace{ + Mode: runtime.NamespaceMode_POD, + Uids: []*runtime.IDMapping{&idMap}, + Gids: []*runtime.IDMapping{&idMap}, + }, + }, + } + }, + specCheck: func(t *testing.T, spec *runtimespec.Spec) { + require.NotNil(t, spec.Linux) + assert.Contains(t, spec.Linux.Namespaces, runtimespec.LinuxNamespace{ + Type: runtimespec.UserNamespace, + }) + require.Equal(t, spec.Linux.UIDMappings, []runtimespec.LinuxIDMapping{expIDMap}) + require.Equal(t, spec.Linux.GIDMappings, []runtimespec.LinuxIDMapping{expIDMap}) + + }, + }, + { + desc: "user namespace mode node and mappings", + configChange: func(c *runtime.PodSandboxConfig) { + c.Linux.SecurityContext = &runtime.LinuxSandboxSecurityContext{ + NamespaceOptions: &runtime.NamespaceOption{ + UsernsOptions: &runtime.UserNamespace{ + Mode: runtime.NamespaceMode_NODE, + Uids: []*runtime.IDMapping{&idMap}, + Gids: []*runtime.IDMapping{&idMap}, + }, + }, + } + }, + expectErr: true, + }, + { + desc: "user namespace with several mappings", + configChange: func(c *runtime.PodSandboxConfig) { + c.Linux.SecurityContext = &runtime.LinuxSandboxSecurityContext{ + NamespaceOptions: &runtime.NamespaceOption{ + UsernsOptions: &runtime.UserNamespace{ + Mode: runtime.NamespaceMode_NODE, + Uids: []*runtime.IDMapping{&idMap, &idMap}, + Gids: []*runtime.IDMapping{&idMap, &idMap}, + }, + }, + } + }, + expectErr: true, + }, + { + desc: "user namespace with uneven mappings", + configChange: func(c *runtime.PodSandboxConfig) { + c.Linux.SecurityContext = &runtime.LinuxSandboxSecurityContext{ + NamespaceOptions: &runtime.NamespaceOption{ + UsernsOptions: &runtime.UserNamespace{ + Mode: runtime.NamespaceMode_NODE, + Uids: []*runtime.IDMapping{&idMap, &idMap}, + Gids: []*runtime.IDMapping{&idMap}, + }, + }, + } + }, + expectErr: true, + }, + { + desc: "user namespace mode container", + configChange: func(c *runtime.PodSandboxConfig) { + c.Linux.SecurityContext = &runtime.LinuxSandboxSecurityContext{ + NamespaceOptions: &runtime.NamespaceOption{ + UsernsOptions: &runtime.UserNamespace{ + Mode: runtime.NamespaceMode_CONTAINER, + }, + }, + } + }, + expectErr: true, + }, + { + desc: "user namespace mode target", + configChange: func(c *runtime.PodSandboxConfig) { + c.Linux.SecurityContext = &runtime.LinuxSandboxSecurityContext{ + NamespaceOptions: &runtime.NamespaceOption{ + UsernsOptions: &runtime.UserNamespace{ + Mode: runtime.NamespaceMode_TARGET, + }, + }, + } + }, + expectErr: true, + }, + { + desc: "user namespace unknown mode", + configChange: func(c *runtime.PodSandboxConfig) { + c.Linux.SecurityContext = &runtime.LinuxSandboxSecurityContext{ + NamespaceOptions: &runtime.NamespaceOption{ + UsernsOptions: &runtime.UserNamespace{ + Mode: runtime.NamespaceMode(100), + }, + }, + } + }, + expectErr: true, + }, { desc: "should set supplemental groups correctly", configChange: func(c *runtime.PodSandboxConfig) { diff --git a/pkg/cri/sbserver/podsandbox/sandbox_run_other.go b/pkg/cri/sbserver/podsandbox/sandbox_run_other.go index 1a8d4d162..15691a189 100644 --- a/pkg/cri/sbserver/podsandbox/sandbox_run_other.go +++ b/pkg/cri/sbserver/podsandbox/sandbox_run_other.go @@ -21,6 +21,7 @@ package podsandbox import ( "github.com/containerd/containerd/oci" "github.com/containerd/containerd/pkg/cri/annotations" + "github.com/containerd/containerd/snapshots" imagespec "github.com/opencontainers/image-spec/specs-go/v1" runtimespec "github.com/opencontainers/runtime-spec/specs-go" runtime "k8s.io/cri-api/pkg/apis/runtime/v1" @@ -48,3 +49,9 @@ func (c *Controller) setupSandboxFiles(id string, config *runtime.PodSandboxConf func (c *Controller) cleanupSandboxFiles(id string, config *runtime.PodSandboxConfig) error { return nil } + +// sandboxSnapshotterOpts generates any platform specific snapshotter options +// for a sandbox container. +func sandboxSnapshotterOpts(config *runtime.PodSandboxConfig) ([]snapshots.Opt, error) { + return []snapshots.Opt{}, nil +} diff --git a/pkg/cri/sbserver/podsandbox/sandbox_run_windows.go b/pkg/cri/sbserver/podsandbox/sandbox_run_windows.go index 5e201030a..9b2adc029 100644 --- a/pkg/cri/sbserver/podsandbox/sandbox_run_windows.go +++ b/pkg/cri/sbserver/podsandbox/sandbox_run_windows.go @@ -27,6 +27,7 @@ import ( "github.com/containerd/containerd/pkg/cri/annotations" customopts "github.com/containerd/containerd/pkg/cri/opts" + "github.com/containerd/containerd/snapshots" ) func (c *Controller) sandboxContainerSpec(id string, config *runtime.PodSandboxConfig, @@ -101,3 +102,8 @@ func (c *Controller) setupSandboxFiles(id string, config *runtime.PodSandboxConf func (c *Controller) cleanupSandboxFiles(id string, config *runtime.PodSandboxConfig) error { return nil } + +// No sandbox snapshotter options needed for windows. +func sandboxSnapshotterOpts(config *runtime.PodSandboxConfig) ([]snapshots.Opt, error) { + return []snapshots.Opt{}, nil +} diff --git a/pkg/cri/sbserver/sandbox_run.go b/pkg/cri/sbserver/sandbox_run.go index d09b38dcd..4b3e77159 100644 --- a/pkg/cri/sbserver/sandbox_run.go +++ b/pkg/cri/sbserver/sandbox_run.go @@ -23,6 +23,7 @@ import ( "fmt" "math" "path/filepath" + goruntime "runtime" "strings" "time" @@ -94,6 +95,7 @@ func (c *criService) RunPodSandbox(ctx context.Context, r *runtime.RunPodSandbox sandboxInfo.Runtime.Name = ociRuntime.Type + runtimeStart := time.Now() // Retrieve runtime options runtimeOpts, err := generateRuntimeOptions(ociRuntime) if err != nil { @@ -142,8 +144,27 @@ func (c *criService) RunPodSandbox(ctx context.Context, r *runtime.RunPodSandbox } }() + userNsEnabled := false + if goruntime.GOOS != "windows" { + usernsOpts := config.GetLinux().GetSecurityContext().GetNamespaceOptions().GetUsernsOptions() + if usernsOpts != nil && usernsOpts.GetMode() == runtime.NamespaceMode_POD { + userNsEnabled = true + } + } + // Setup the network namespace if host networking wasn't requested. - if !hostNetwork(config) { + if !hostNetwork(config) && !userNsEnabled { + // XXX: We do c&p of this code later for the podNetwork && userNsEnabled case too. + // We can't move this to a function, as the defer calls need to be executed if other + // errors are returned in this function. So, we would need more refactors to move + // this code to a function and the idea was to not change the current code for + // !userNsEnabled case, therefore doing it would defeat the purpose. + // + // The difference between the cases is the use of netns.NewNetNS() vs + // netns.NewNetNSFromPID(). + // + // To simplify this, in the future, we should just remove this case (podNetwork && + // !userNsEnabled) and just keep the other case (podNetwork && userNsEnabled). netStart := time.Now() // If it is not in host network namespace then create a namespace and set the sandbox // handle. NetNSPath in sandbox metadata and NetNS is non empty only for non host network @@ -222,8 +243,6 @@ func (c *criService) RunPodSandbox(ctx context.Context, r *runtime.RunPodSandbox return nil, fmt.Errorf("unable to update extensions for sandbox %q: %w", id, err) } - runtimeStart := time.Now() - if err := controller.Create(ctx, id, sb.WithOptions(config), sb.WithNetNSPath(sandbox.NetNSPath)); err != nil { return nil, fmt.Errorf("failed to create sandbox %q: %w", id, err) } @@ -245,6 +264,88 @@ func (c *criService) RunPodSandbox(ctx context.Context, r *runtime.RunPodSandbox return nil, fmt.Errorf("failed to start sandbox %q: %w", id, err) } + if !hostNetwork(config) && userNsEnabled { + // If userns is enabled, then the netns was created by the OCI runtime + // on controller.Start(). The OCI runtime needs to create the netns + // because, if userns is in use, the netns needs to be owned by the + // userns. So, let the OCI runtime just handle this for us. + // If the netns is not owned by the userns several problems will happen. + // For instance, the container will lack permission (even if + // capabilities are present) to modify the netns or, even worse, the OCI + // runtime will fail to mount sysfs: + // https://github.com/torvalds/linux/commit/7dc5dbc879bd0779924b5132a48b731a0bc04a1e#diff-4839664cd0c8eab716e064323c7cd71fR1164 + // + // Note we do this after controller.Start(), as before that we + // can't get the PID for the sandbox that we need for the netns. + // Doing a controller.Status() call before that fails (can't + // find the sandbox) so we can't get the PID. + netStart := time.Now() + + // If it is not in host network namespace then create a namespace and set the sandbox + // handle. NetNSPath in sandbox metadata and NetNS is non empty only for non host network + // namespaces. If the pod is in host network namespace then both are empty and should not + // be used. + var netnsMountDir = "/var/run/netns" + if c.config.NetNSMountsUnderStateDir { + netnsMountDir = filepath.Join(c.config.StateDir, "netns") + } + + sandbox.NetNS, err = netns.NewNetNSFromPID(netnsMountDir, ctrl.Pid) + if err != nil { + return nil, fmt.Errorf("failed to create network namespace for sandbox %q: %w", id, err) + } + + // Update network namespace in the store, which is used to generate the container's spec + sandbox.NetNSPath = sandbox.NetNS.GetPath() + defer func() { + // Remove the network namespace only if all the resource cleanup is done + if retErr != nil && cleanupErr == nil { + if cleanupErr = sandbox.NetNS.Remove(); cleanupErr != nil { + log.G(ctx).WithError(cleanupErr).Errorf("Failed to remove network namespace %s for sandbox %q", sandbox.NetNSPath, id) + return + } + sandbox.NetNSPath = "" + } + }() + + if err := sandboxInfo.AddExtension(podsandbox.MetadataKey, &sandbox.Metadata); err != nil { + return nil, fmt.Errorf("unable to save sandbox %q to store: %w", id, err) + } + // Save sandbox metadata to store + if sandboxInfo, err = c.client.SandboxStore().Update(ctx, sandboxInfo, "extensions"); err != nil { + return nil, fmt.Errorf("unable to update extensions for sandbox %q: %w", id, err) + } + + // Define this defer to teardownPodNetwork prior to the setupPodNetwork function call. + // This is because in setupPodNetwork the resource is allocated even if it returns error, unlike other resource + // creation functions. + defer func() { + // Remove the network namespace only if all the resource cleanup is done. + if retErr != nil && cleanupErr == nil { + deferCtx, deferCancel := util.DeferContext() + defer deferCancel() + // Teardown network if an error is returned. + if cleanupErr = c.teardownPodNetwork(deferCtx, sandbox); cleanupErr != nil { + log.G(ctx).WithError(cleanupErr).Errorf("Failed to destroy network for sandbox %q", id) + } + + } + }() + + // Setup network for sandbox. + // Certain VM based solutions like clear containers (Issue containerd/cri-containerd#524) + // rely on the assumption that CRI shim will not be querying the network namespace to check the + // network states such as IP. + // In future runtime implementation should avoid relying on CRI shim implementation details. + // In this case however caching the IP will add a subtle performance enhancement by avoiding + // calls to network namespace of the pod to query the IP of the veth interface on every + // SandboxStatus request. + if err := c.setupPodNetwork(ctx, &sandbox); err != nil { + return nil, fmt.Errorf("failed to setup network for sandbox %q: %w", id, err) + } + sandboxCreateNetworkTimer.UpdateSince(netStart) + } + // TODO: get rid of this. sandbox object should no longer have Container field. if ociRuntime.SandboxMode == string(criconfig.ModePodSandbox) { container, err := c.client.LoadContainer(ctx, id) diff --git a/snapshotter_opts_unix.go b/snapshotter_opts_unix.go index 4739e192f..2dff9b424 100644 --- a/snapshotter_opts_unix.go +++ b/snapshotter_opts_unix.go @@ -31,7 +31,7 @@ const ( // WithRemapperLabels creates the labels used by any supporting snapshotter // to shift the filesystem ownership (user namespace mapping) automatically; currently -// supported by the fuse-overlayfs snapshotter +// supported by the fuse-overlayfs and overlay snapshotters func WithRemapperLabels(ctrUID, hostUID, ctrGID, hostGID, length uint32) snapshots.Opt { return snapshots.WithLabels(map[string]string{ snapshots.LabelSnapshotUIDMapping: fmt.Sprintf("%d:%d:%d", ctrUID, hostUID, length),