From 97dfa7f55697650225c45aa4cf8e7082ecd168af Mon Sep 17 00:00:00 2001 From: Rodrigo Campos Date: Mon, 23 Jan 2023 18:35:28 +0100 Subject: [PATCH 01/11] cri/server: Pass down uidMappings to OCI runtime When the kubelet sends the uid/gid mappings for a mount, just pass them down to the OCI runtime. OCI runtimes support this since runc 1.2 and crun 1.8.1. And whenever we add mounts (container mounts or image spec volumes) and userns are requested by the kubelet, we use those mappings in the mounts so the mounts are idmapped correctly. If no userns is used, we don't send any mappings which just keeps the current behavior. Signed-off-by: Rodrigo Campos --- pkg/cri/opts/spec_linux_opts.go | 23 +++++++++++++++++++++++ pkg/cri/server/container_create.go | 15 +++++++++++++-- pkg/cri/server/container_create_linux.go | 22 ++++++++++++++++++++++ pkg/cri/server/container_create_test.go | 3 ++- 4 files changed, 60 insertions(+), 3 deletions(-) diff --git a/pkg/cri/opts/spec_linux_opts.go b/pkg/cri/opts/spec_linux_opts.go index 8af03daf7..a052a7b54 100644 --- a/pkg/cri/opts/spec_linux_opts.go +++ b/pkg/cri/opts/spec_linux_opts.go @@ -166,11 +166,34 @@ func WithMounts(osi osinterface.OS, config *runtime.ContainerConfig, extra []*ru return fmt.Errorf("idmap mounts not yet supported, but they were requested for: %q", src) } + var uidMapping []runtimespec.LinuxIDMapping + if mount.UidMappings != nil { + for _, mapping := range mount.UidMappings { + uidMapping = append(uidMapping, runtimespec.LinuxIDMapping{ + HostID: mapping.HostId, + ContainerID: mapping.ContainerId, + Size: mapping.Length, + }) + } + } + var gidMapping []runtimespec.LinuxIDMapping + if mount.GidMappings != nil { + for _, mapping := range mount.GidMappings { + gidMapping = append(gidMapping, runtimespec.LinuxIDMapping{ + HostID: mapping.HostId, + ContainerID: mapping.ContainerId, + Size: mapping.Length, + }) + } + } + s.Mounts = append(s.Mounts, runtimespec.Mount{ Source: src, Destination: dst, Type: "bind", Options: options, + UIDMappings: uidMapping, + GIDMappings: gidMapping, }) } return nil diff --git a/pkg/cri/server/container_create.go b/pkg/cri/server/container_create.go index 4cc8defae..15efb3e10 100644 --- a/pkg/cri/server/container_create.go +++ b/pkg/cri/server/container_create.go @@ -143,7 +143,7 @@ func (c *criService) CreateContainer(ctx context.Context, r *runtime.CreateConta var volumeMounts []*runtime.Mount if !c.config.IgnoreImageDefinedVolumes { // Create container image volumes mounts. - volumeMounts = c.volumeMounts(containerRootDir, config.GetMounts(), &image.ImageSpec.Config) + volumeMounts = c.volumeMounts(containerRootDir, config, &image.ImageSpec.Config) } else if len(image.ImageSpec.Config.Volumes) != 0 { log.G(ctx).Debugf("Ignoring volumes defined in image %v because IgnoreImageDefinedVolumes is set", image.ID) } @@ -318,10 +318,19 @@ func (c *criService) CreateContainer(ctx context.Context, r *runtime.CreateConta // volumeMounts sets up image volumes for container. Rely on the removal of container // root directory to do cleanup. Note that image volume will be skipped, if there is criMounts // specified with the same destination. -func (c *criService) volumeMounts(containerRootDir string, criMounts []*runtime.Mount, config *imagespec.ImageConfig) []*runtime.Mount { +func (c *criService) volumeMounts(containerRootDir string, containerConfig *runtime.ContainerConfig, config *imagespec.ImageConfig) []*runtime.Mount { if len(config.Volumes) == 0 { return nil } + var uidMappings, gidMappings []*runtime.IDMapping + if goruntime.GOOS != "windows" { + if usernsOpts := containerConfig.GetLinux().GetSecurityContext().GetNamespaceOptions().GetUsernsOptions(); usernsOpts != nil { + uidMappings = usernsOpts.GetUids() + gidMappings = usernsOpts.GetGids() + } + } + + criMounts := containerConfig.GetMounts() var mounts []*runtime.Mount for dst := range config.Volumes { if isInCRIMounts(dst, criMounts) { @@ -343,6 +352,8 @@ func (c *criService) volumeMounts(containerRootDir string, criMounts []*runtime. ContainerPath: dst, HostPath: src, SelinuxRelabel: true, + UidMappings: uidMappings, + GidMappings: gidMappings, }) } return mounts diff --git a/pkg/cri/server/container_create_linux.go b/pkg/cri/server/container_create_linux.go index d01c2c2e6..1d7c66ccf 100644 --- a/pkg/cri/server/container_create_linux.go +++ b/pkg/cri/server/container_create_linux.go @@ -62,6 +62,12 @@ const ( func (c *criService) containerMounts(sandboxID string, config *runtime.ContainerConfig) []*runtime.Mount { var mounts []*runtime.Mount securityContext := config.GetLinux().GetSecurityContext() + var uidMappings, gidMappings []*runtime.IDMapping + if usernsOpts := securityContext.GetNamespaceOptions().GetUsernsOptions(); usernsOpts != nil { + uidMappings = usernsOpts.GetUids() + gidMappings = usernsOpts.GetGids() + } + if !isInCRIMounts(etcHostname, config.GetMounts()) { // /etc/hostname is added since 1.1.6, 1.2.4 and 1.3. // For in-place upgrade, the old sandbox doesn't have the hostname file, @@ -75,6 +81,8 @@ func (c *criService) containerMounts(sandboxID string, config *runtime.Container HostPath: hostpath, Readonly: securityContext.GetReadonlyRootfs(), SelinuxRelabel: true, + UidMappings: uidMappings, + GidMappings: gidMappings, }) } } @@ -85,6 +93,8 @@ func (c *criService) containerMounts(sandboxID string, config *runtime.Container HostPath: c.getSandboxHosts(sandboxID), Readonly: securityContext.GetReadonlyRootfs(), SelinuxRelabel: true, + UidMappings: uidMappings, + GidMappings: gidMappings, }) } @@ -96,6 +106,8 @@ func (c *criService) containerMounts(sandboxID string, config *runtime.Container HostPath: c.getResolvPath(sandboxID), Readonly: securityContext.GetReadonlyRootfs(), SelinuxRelabel: true, + UidMappings: uidMappings, + GidMappings: gidMappings, }) } @@ -109,6 +121,16 @@ func (c *criService) containerMounts(sandboxID string, config *runtime.Container HostPath: sandboxDevShm, Readonly: false, SelinuxRelabel: sandboxDevShm != devShm, + // XXX: tmpfs support for idmap mounts got merged in + // Linux 6.3. + // Our CI runs with 5.15 kernels, so disabling idmap + // mounts for this case makes the CI happy (the other fs + // used support idmap mounts in 5.15 kernels). + // We can enable this at a later stage, but as this + // tmpfs mount is exposed empty to the container (no + // prepopulated files) and using the hostIPC with userns + // is blocked by k8s, we can just avoid using the + // mappings and it should work fine. }) } return mounts diff --git a/pkg/cri/server/container_create_test.go b/pkg/cri/server/container_create_test.go index df65106cd..94bf0c4ae 100644 --- a/pkg/cri/server/container_create_test.go +++ b/pkg/cri/server/container_create_test.go @@ -279,8 +279,9 @@ func TestVolumeMounts(t *testing.T) { config := &imagespec.ImageConfig{ Volumes: test.imageVolumes, } + containerConfig := &runtime.ContainerConfig{Mounts: test.criMounts} c := newTestCRIService() - got := c.volumeMounts(testContainerRootDir, test.criMounts, config) + got := c.volumeMounts(testContainerRootDir, containerConfig, config) assert.Len(t, got, len(test.expectedMountDest)) for _, dest := range test.expectedMountDest { found := false From 10cb112e4a6d9ecf5772caca824af3988be087e7 Mon Sep 17 00:00:00 2001 From: Rodrigo Campos Date: Wed, 19 Jul 2023 15:55:26 +0200 Subject: [PATCH 02/11] cri/server: Add tests for ContainerMounts() Signed-off-by: Rodrigo Campos --- pkg/cri/server/container_create_linux_test.go | 52 +++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/pkg/cri/server/container_create_linux_test.go b/pkg/cri/server/container_create_linux_test.go index 31aab83b9..79f9a6e94 100644 --- a/pkg/cri/server/container_create_linux_test.go +++ b/pkg/cri/server/container_create_linux_test.go @@ -459,6 +459,14 @@ func TestContainerAndSandboxPrivileged(t *testing.T) { func TestContainerMounts(t *testing.T) { const testSandboxID = "test-id" + idmap := []*runtime.IDMapping{ + { + ContainerId: 0, + HostId: 100, + Length: 1, + }, + } + for _, test := range []struct { desc string statFn func(string) (os.FileInfo, error) @@ -528,6 +536,50 @@ func TestContainerMounts(t *testing.T) { }, }, }, + { + desc: "should setup uidMappings/gidMappings when userns is used", + securityContext: &runtime.LinuxContainerSecurityContext{ + NamespaceOptions: &runtime.NamespaceOption{ + UsernsOptions: &runtime.UserNamespace{ + Mode: runtime.NamespaceMode_POD, + Uids: idmap, + Gids: idmap, + }, + }, + }, + expectedMounts: []*runtime.Mount{ + { + ContainerPath: "/etc/hostname", + HostPath: filepath.Join(testRootDir, sandboxesDir, testSandboxID, "hostname"), + Readonly: false, + SelinuxRelabel: true, + UidMappings: idmap, + GidMappings: idmap, + }, + { + ContainerPath: "/etc/hosts", + HostPath: filepath.Join(testRootDir, sandboxesDir, testSandboxID, "hosts"), + Readonly: false, + SelinuxRelabel: true, + UidMappings: idmap, + GidMappings: idmap, + }, + { + ContainerPath: resolvConfPath, + HostPath: filepath.Join(testRootDir, sandboxesDir, testSandboxID, "resolv.conf"), + Readonly: false, + SelinuxRelabel: true, + UidMappings: idmap, + GidMappings: idmap, + }, + { + ContainerPath: "/dev/shm", + HostPath: filepath.Join(testStateDir, sandboxesDir, testSandboxID, "shm"), + Readonly: false, + SelinuxRelabel: true, + }, + }, + }, { desc: "should use host /dev/shm when host ipc is set", securityContext: &runtime.LinuxContainerSecurityContext{ From e0b2b17de312f1e893d7a26515b36545279e888d Mon Sep 17 00:00:00 2001 From: Rodrigo Campos Date: Wed, 19 Jul 2023 15:55:54 +0200 Subject: [PATCH 03/11] cri/server: Add tests for the linux-specific parts of VolumeMounts() Signed-off-by: Rodrigo Campos --- pkg/cri/server/container_create_linux_test.go | 138 ++++++++++++++++++ 1 file changed, 138 insertions(+) diff --git a/pkg/cri/server/container_create_linux_test.go b/pkg/cri/server/container_create_linux_test.go index 79f9a6e94..cb07d8c60 100644 --- a/pkg/cri/server/container_create_linux_test.go +++ b/pkg/cri/server/container_create_linux_test.go @@ -2311,3 +2311,141 @@ containerEdits: }) } } + +// TestLinuxVolumeMounts tests the linux-specific parts of VolumeMounts. +func TestLinuxVolumeMounts(t *testing.T) { + testContainerRootDir := "test-container-root" + idmap := []*runtime.IDMapping{ + { + ContainerId: 0, + HostId: 100, + Length: 1, + }, + } + + for _, test := range []struct { + desc string + criMounts []*runtime.Mount + imageVolumes map[string]struct{} + usernsEnabled bool + expectedMountDest []string + expectedMappings []*runtime.IDMapping + }{ + { + desc: "should skip image volumes if already mounted by CRI", + usernsEnabled: true, + criMounts: []*runtime.Mount{ + { + ContainerPath: "/test-volume-1", + HostPath: "/test-hostpath-1", + }, + }, + imageVolumes: map[string]struct{}{ + "/test-volume-1": {}, + "/test-volume-2": {}, + }, + expectedMountDest: []string{ + "/test-volume-2", + }, + expectedMappings: idmap, + }, + { + desc: "should include mappings for image volumes", + usernsEnabled: true, + imageVolumes: map[string]struct{}{ + "/test-volume-1/": {}, + "/test-volume-2/": {}, + }, + expectedMountDest: []string{ + "/test-volume-2/", + "/test-volume-2/", + }, + expectedMappings: idmap, + }, + { + desc: "should convert rel imageVolume paths to abs paths", + imageVolumes: map[string]struct{}{ + "test-volume-1/": {}, + "./test-volume-2/": {}, + "../../test-volume-3/": {}, + }, + expectedMountDest: []string{ + "/test-volume-1", + "/test-volume-2", + "/test-volume-3", + }, + }, + { + desc: "should convert rel imageVolume paths to abs paths and add userns mappings", + usernsEnabled: true, + imageVolumes: map[string]struct{}{ + "test-volume-1/": {}, + "./test-volume-2/": {}, + "../../test-volume-3/": {}, + }, + expectedMountDest: []string{ + "/test-volume-1", + "/test-volume-2", + "/test-volume-3", + }, + expectedMappings: idmap, + }, + { + desc: "doesn't include mappings for image volumes if userns is disabled", + imageVolumes: map[string]struct{}{ + "/test-volume-1/": {}, + "/test-volume-2/": {}, + }, + expectedMountDest: []string{ + "/test-volume-2/", + "/test-volume-2/", + }, + }, + } { + test := test + t.Run(test.desc, func(t *testing.T) { + config := &imagespec.ImageConfig{ + Volumes: test.imageVolumes, + } + containerConfig := &runtime.ContainerConfig{ + Mounts: test.criMounts, + } + + if test.usernsEnabled { + containerConfig.Linux = &runtime.LinuxContainerConfig{ + SecurityContext: &runtime.LinuxContainerSecurityContext{ + NamespaceOptions: &runtime.NamespaceOption{ + UsernsOptions: &runtime.UserNamespace{ + Mode: runtime.NamespaceMode_POD, + Uids: idmap, + Gids: idmap, + }, + }, + }, + } + } + + c := newTestCRIService() + got := c.volumeMounts(testContainerRootDir, containerConfig, config) + assert.Len(t, got, len(test.expectedMountDest)) + for _, dest := range test.expectedMountDest { + found := false + for _, m := range got { + if m.ContainerPath != dest { + continue + } + found = true + assert.Equal(t, + filepath.Dir(m.HostPath), + filepath.Join(testContainerRootDir, "volumes")) + + if test.expectedMappings != nil { + assert.Equal(t, test.expectedMappings, m.UidMappings) + assert.Equal(t, test.expectedMappings, m.GidMappings) + } + } + assert.True(t, found) + } + }) + } +} From ab5b43fe80ca9418c1afdda4b808c8cc1566183d Mon Sep 17 00:00:00 2001 From: Rodrigo Campos Date: Fri, 7 Jul 2023 19:02:07 +0200 Subject: [PATCH 04/11] cri/sbserver: Pass down UID/GID mappings to OCI runtime Signed-off-by: Rodrigo Campos --- pkg/cri/sbserver/container_create.go | 41 ++++++- pkg/cri/sbserver/container_create_test.go | 139 ++++++++++++++++++++-- 2 files changed, 170 insertions(+), 10 deletions(-) diff --git a/pkg/cri/sbserver/container_create.go b/pkg/cri/sbserver/container_create.go index f3a241369..ad523a520 100644 --- a/pkg/cri/sbserver/container_create.go +++ b/pkg/cri/sbserver/container_create.go @@ -157,7 +157,7 @@ func (c *criService) CreateContainer(ctx context.Context, r *runtime.CreateConta var volumeMounts []*runtime.Mount if !c.config.IgnoreImageDefinedVolumes { // Create container image volumes mounts. - volumeMounts = c.volumeMounts(platform, containerRootDir, config.GetMounts(), &image.ImageSpec.Config) + volumeMounts = c.volumeMounts(platform, containerRootDir, config, &image.ImageSpec.Config) } else if len(image.ImageSpec.Config.Volumes) != 0 { log.G(ctx).Debugf("Ignoring volumes defined in image %v because IgnoreImageDefinedVolumes is set", image.ID) } @@ -341,7 +341,17 @@ func (c *criService) CreateContainer(ctx context.Context, r *runtime.CreateConta // volumeMounts sets up image volumes for container. Rely on the removal of container // root directory to do cleanup. Note that image volume will be skipped, if there is criMounts // specified with the same destination. -func (c *criService) volumeMounts(platform platforms.Platform, containerRootDir string, criMounts []*runtime.Mount, config *imagespec.ImageConfig) []*runtime.Mount { +func (c *criService) volumeMounts(platform platforms.Platform, containerRootDir string, containerConfig *runtime.ContainerConfig, config *imagespec.ImageConfig) []*runtime.Mount { + var uidMappings, gidMappings []*runtime.IDMapping + if platform.OS == "linux" { + if usernsOpts := containerConfig.GetLinux().GetSecurityContext().GetNamespaceOptions().GetUsernsOptions(); usernsOpts != nil { + uidMappings = usernsOpts.GetUids() + gidMappings = usernsOpts.GetGids() + } + } + + criMounts := containerConfig.GetMounts() + if len(config.Volumes) == 0 { return nil } @@ -371,6 +381,8 @@ func (c *criService) volumeMounts(platform platforms.Platform, containerRootDir ContainerPath: dst, HostPath: src, SelinuxRelabel: true, + UidMappings: uidMappings, + GidMappings: gidMappings, }) } return mounts @@ -966,11 +978,17 @@ func (c *criService) buildDarwinSpec( return specOpts, nil } -// containerMounts sets up necessary container system file mounts +// linuxContainerMounts sets up necessary container system file mounts // including /dev/shm, /etc/hosts and /etc/resolv.conf. func (c *criService) linuxContainerMounts(sandboxID string, config *runtime.ContainerConfig) []*runtime.Mount { var mounts []*runtime.Mount securityContext := config.GetLinux().GetSecurityContext() + var uidMappings, gidMappings []*runtime.IDMapping + if usernsOpts := securityContext.GetNamespaceOptions().GetUsernsOptions(); usernsOpts != nil { + uidMappings = usernsOpts.GetUids() + gidMappings = usernsOpts.GetGids() + } + if !isInCRIMounts(etcHostname, config.GetMounts()) { // /etc/hostname is added since 1.1.6, 1.2.4 and 1.3. // For in-place upgrade, the old sandbox doesn't have the hostname file, @@ -984,6 +1002,8 @@ func (c *criService) linuxContainerMounts(sandboxID string, config *runtime.Cont HostPath: hostpath, Readonly: securityContext.GetReadonlyRootfs(), SelinuxRelabel: true, + UidMappings: uidMappings, + GidMappings: gidMappings, }) } } @@ -994,6 +1014,8 @@ func (c *criService) linuxContainerMounts(sandboxID string, config *runtime.Cont HostPath: c.getSandboxHosts(sandboxID), Readonly: securityContext.GetReadonlyRootfs(), SelinuxRelabel: true, + UidMappings: uidMappings, + GidMappings: gidMappings, }) } @@ -1005,6 +1027,8 @@ func (c *criService) linuxContainerMounts(sandboxID string, config *runtime.Cont HostPath: c.getResolvPath(sandboxID), Readonly: securityContext.GetReadonlyRootfs(), SelinuxRelabel: true, + UidMappings: uidMappings, + GidMappings: gidMappings, }) } @@ -1018,6 +1042,17 @@ func (c *criService) linuxContainerMounts(sandboxID string, config *runtime.Cont HostPath: sandboxDevShm, Readonly: false, SelinuxRelabel: sandboxDevShm != devShm, + // XXX: tmpfs support for idmap mounts got merged in + // Linux 6.3. + // Our Ubuntu 22.04 CI runs with 5.15 kernels, so + // disabling idmap mounts for this case makes the CI + // happy (the other fs used support idmap mounts in 5.15 + // kernels). + // We can enable this at a later stage, but as this + // tmpfs mount is exposed empty to the container (no + // prepopulated files) and using the hostIPC with userns + // is blocked by k8s, we can just avoid using the + // mappings and it should work fine. }) } return mounts diff --git a/pkg/cri/sbserver/container_create_test.go b/pkg/cri/sbserver/container_create_test.go index 3f6cb067c..bbc21f209 100644 --- a/pkg/cri/sbserver/container_create_test.go +++ b/pkg/cri/sbserver/container_create_test.go @@ -231,12 +231,22 @@ func TestContainerSpecCommand(t *testing.T) { func TestVolumeMounts(t *testing.T) { testContainerRootDir := "test-container-root" + idmap := []*runtime.IDMapping{ + { + ContainerId: 0, + HostId: 100, + Length: 1, + }, + } + for _, test := range []struct { desc string platform platforms.Platform criMounts []*runtime.Mount + usernsEnabled bool imageVolumes map[string]struct{} expectedMountDest []string + expectedMappings []*runtime.IDMapping }{ { desc: "should setup rw mount for image volumes", @@ -297,25 +307,88 @@ func TestVolumeMounts(t *testing.T) { "/abs/test-volume-4", }, }, + { + desc: "should include mappings for image volumes on Linux", + platform: platforms.Platform{OS: "linux"}, + usernsEnabled: true, + imageVolumes: map[string]struct{}{ + "/test-volume-1/": {}, + "/test-volume-2/": {}, + }, + expectedMountDest: []string{ + "/test-volume-2/", + "/test-volume-2/", + }, + expectedMappings: idmap, + }, + { + desc: "should NOT include mappings for image volumes on Linux if !userns", + platform: platforms.Platform{OS: "linux"}, + usernsEnabled: false, + imageVolumes: map[string]struct{}{ + "/test-volume-1/": {}, + "/test-volume-2/": {}, + }, + expectedMountDest: []string{ + "/test-volume-2/", + "/test-volume-2/", + }, + }, + { + desc: "should convert rel imageVolume paths to abs paths and add userns mappings", + platform: platforms.Platform{OS: "linux"}, + usernsEnabled: true, + imageVolumes: map[string]struct{}{ + "test-volume-1/": {}, + "C:/test-volume-2/": {}, + "../../test-volume-3/": {}, + }, + expectedMountDest: []string{ + "/test-volume-1", + "/C:/test-volume-2", + "/test-volume-3", + }, + expectedMappings: idmap, + }, } { test := test t.Run(test.desc, func(t *testing.T) { config := &imagespec.ImageConfig{ Volumes: test.imageVolumes, } + containerConfig := &runtime.ContainerConfig{Mounts: test.criMounts} + if test.usernsEnabled { + containerConfig.Linux = &runtime.LinuxContainerConfig{ + SecurityContext: &runtime.LinuxContainerSecurityContext{ + NamespaceOptions: &runtime.NamespaceOption{ + UsernsOptions: &runtime.UserNamespace{ + Mode: runtime.NamespaceMode_POD, + Uids: idmap, + Gids: idmap, + }, + }, + }, + } + } + c := newTestCRIService() - got := c.volumeMounts(test.platform, testContainerRootDir, test.criMounts, config) + got := c.volumeMounts(test.platform, testContainerRootDir, containerConfig, config) assert.Len(t, got, len(test.expectedMountDest)) for _, dest := range test.expectedMountDest { found := false for _, m := range got { - if m.ContainerPath == dest { - found = true - assert.Equal(t, - filepath.Dir(m.HostPath), - filepath.Join(testContainerRootDir, "volumes")) - break + if m.ContainerPath != dest { + continue } + found = true + assert.Equal(t, + filepath.Dir(m.HostPath), + filepath.Join(testContainerRootDir, "volumes")) + if test.expectedMappings != nil { + assert.Equal(t, test.expectedMappings, m.UidMappings) + assert.Equal(t, test.expectedMappings, m.GidMappings) + } + break } assert.True(t, found) } @@ -481,6 +554,14 @@ func TestBaseRuntimeSpec(t *testing.T) { func TestLinuxContainerMounts(t *testing.T) { const testSandboxID = "test-id" + idmap := []*runtime.IDMapping{ + { + ContainerId: 0, + HostId: 100, + Length: 1, + }, + } + for _, test := range []struct { desc string statFn func(string) (os.FileInfo, error) @@ -550,6 +631,50 @@ func TestLinuxContainerMounts(t *testing.T) { }, }, }, + { + desc: "should setup uidMappings/gidMappings when userns is used", + securityContext: &runtime.LinuxContainerSecurityContext{ + NamespaceOptions: &runtime.NamespaceOption{ + UsernsOptions: &runtime.UserNamespace{ + Mode: runtime.NamespaceMode_POD, + Uids: idmap, + Gids: idmap, + }, + }, + }, + expectedMounts: []*runtime.Mount{ + { + ContainerPath: "/etc/hostname", + HostPath: filepath.Join(testRootDir, sandboxesDir, testSandboxID, "hostname"), + Readonly: false, + SelinuxRelabel: true, + UidMappings: idmap, + GidMappings: idmap, + }, + { + ContainerPath: "/etc/hosts", + HostPath: filepath.Join(testRootDir, sandboxesDir, testSandboxID, "hosts"), + Readonly: false, + SelinuxRelabel: true, + UidMappings: idmap, + GidMappings: idmap, + }, + { + ContainerPath: resolvConfPath, + HostPath: filepath.Join(testRootDir, sandboxesDir, testSandboxID, "resolv.conf"), + Readonly: false, + SelinuxRelabel: true, + UidMappings: idmap, + GidMappings: idmap, + }, + { + ContainerPath: "/dev/shm", + HostPath: filepath.Join(testStateDir, sandboxesDir, testSandboxID, "shm"), + Readonly: false, + SelinuxRelabel: true, + }, + }, + }, { desc: "should use host /dev/shm when host ipc is set", securityContext: &runtime.LinuxContainerSecurityContext{ From 24aa808fe26c8d557f1981531009e3e306d0d22c Mon Sep 17 00:00:00 2001 From: Rodrigo Campos Date: Fri, 21 Jul 2023 13:30:51 +0200 Subject: [PATCH 05/11] integration: Add userns test with volumes Signed-off-by: Rodrigo Campos --- integration/main_test.go | 15 +++ integration/pod_userns_linux_test.go | 143 ++++++++++++++++++++++++++- 2 files changed, 153 insertions(+), 5 deletions(-) diff --git a/integration/main_test.go b/integration/main_test.go index 24f6ac93b..4c9e733ae 100644 --- a/integration/main_test.go +++ b/integration/main_test.go @@ -298,6 +298,21 @@ func WithVolumeMount(hostPath, containerPath string) ContainerOpts { } } +func WithIDMapVolumeMount(hostPath, containerPath string, uidMaps, gidMaps []*runtime.IDMapping) ContainerOpts { + return func(c *runtime.ContainerConfig) { + hostPath, _ = filepath.Abs(hostPath) + containerPath, _ = filepath.Abs(containerPath) + mount := &runtime.Mount{ + HostPath: hostPath, + ContainerPath: containerPath, + SelinuxRelabel: selinux.GetEnabled(), + UidMappings: uidMaps, + GidMappings: gidMaps, + } + c.Mounts = append(c.Mounts, mount) + } +} + func WithWindowsUsername(username string) ContainerOpts { return func(c *runtime.ContainerConfig) { if c.Windows == nil { diff --git a/integration/pod_userns_linux_test.go b/integration/pod_userns_linux_test.go index e0a561433..9553a48cc 100644 --- a/integration/pod_userns_linux_test.go +++ b/integration/pod_userns_linux_test.go @@ -19,7 +19,9 @@ package integration import ( "fmt" "os" + "os/user" "path/filepath" + "strings" "syscall" "testing" "time" @@ -28,17 +30,121 @@ import ( "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" exec "golang.org/x/sys/execabs" + "golang.org/x/sys/unix" runtime "k8s.io/cri-api/pkg/apis/runtime/v1" ) +const ( + defaultRoot = "/var/lib/containerd-test" +) + +func supportsUserNS() bool { + if _, err := os.Stat("/proc/self/ns/user"); os.IsNotExist(err) { + return false + } + return true +} + +func supportsIDMap(path string) bool { + treeFD, err := unix.OpenTree(-1, path, uint(unix.OPEN_TREE_CLONE|unix.OPEN_TREE_CLOEXEC)) + if err != nil { + return false + } + defer unix.Close(treeFD) + + // We want to test if idmap mounts are supported. + // So we use just some random mapping, it doesn't really matter which one. + // For the helper command, we just need something that is alive while we + // test this, a sleep 5 will do it. + cmd := exec.Command("sleep", "5") + cmd.SysProcAttr = &syscall.SysProcAttr{ + Cloneflags: syscall.CLONE_NEWUSER, + UidMappings: []syscall.SysProcIDMap{{ContainerID: 0, HostID: 65536, Size: 65536}}, + GidMappings: []syscall.SysProcIDMap{{ContainerID: 0, HostID: 65536, Size: 65536}}, + } + if err := cmd.Start(); err != nil { + return false + } + defer func() { + _ = cmd.Process.Kill() + _ = cmd.Wait() + }() + + usernsFD := fmt.Sprintf("/proc/%d/ns/user", cmd.Process.Pid) + var usernsFile *os.File + if usernsFile, err = os.Open(usernsFD); err != nil { + return false + } + defer usernsFile.Close() + + attr := unix.MountAttr{ + Attr_set: unix.MOUNT_ATTR_IDMAP, + Userns_fd: uint64(usernsFile.Fd()), + } + if err := unix.MountSetattr(treeFD, "", unix.AT_EMPTY_PATH, &attr); err != nil { + return false + } + + return true +} + +// traversePath gives 755 permissions for all elements in tPath below +// os.TempDir() and errors out if elements above it don't have read+exec +// permissions for others. tPath MUST be a descendant of os.TempDir(). The path +// returned by testing.TempDir() usually is. +func traversePath(tPath string) error { + // Check the assumption that the argument is under os.TempDir(). + tempBase := os.TempDir() + if !strings.HasPrefix(tPath, tempBase) { + return fmt.Errorf("traversePath: %q is not a descendant of %q", tPath, tempBase) + } + + var path string + for _, p := range strings.SplitAfter(tPath, "/") { + path = path + p + stats, err := os.Stat(path) + if err != nil { + return err + } + + perm := stats.Mode().Perm() + if perm&0o5 == 0o5 { + continue + } + if strings.HasPrefix(tempBase, path) { + return fmt.Errorf("traversePath: directory %q MUST have read+exec permissions for others", path) + } + + if err := os.Chmod(path, perm|0o755); err != nil { + return err + } + } + + return nil +} + func TestPodUserNS(t *testing.T) { containerID := uint32(0) hostID := uint32(65536) size := uint32(65536) + idmap := []*runtime.IDMapping{ + { + ContainerId: containerID, + HostId: hostID, + Length: size, + }, + } + + volumeHostPath := t.TempDir() + if err := traversePath(volumeHostPath); err != nil { + t.Fatalf("failed to setup volume host path: %v", err) + } + for name, test := range map[string]struct { sandboxOpts []PodSandboxOpts containerOpts []ContainerOpts checkOutput func(t *testing.T, output string) + hostVolumes bool // whether to config uses host Volumes expectErr bool }{ "userns uid mapping": { @@ -85,6 +191,31 @@ func TestPodUserNS(t *testing.T) { assert.Contains(t, output, "=0=0=") }, }, + "volumes permissions": { + sandboxOpts: []PodSandboxOpts{ + WithPodUserNs(containerID, hostID, size), + }, + hostVolumes: true, + containerOpts: []ContainerOpts{ + WithUserNamespace(containerID, hostID, size), + WithIDMapVolumeMount(volumeHostPath, "/mnt", idmap, idmap), + // Prints numeric UID and GID for path. + // For example, if UID and GID is 0 it will print: =0=0= + // We add the "=" signs so we use can assert.Contains() and be sure + // the UID/GID is 0 and not things like 100 (that contain 0). + // We can't use assert.Equal() easily as it contains timestamp, etc. + WithCommand("stat", "-c", "'=%u=%g='", "/mnt/"), + }, + checkOutput: func(t *testing.T, output string) { + // The UID and GID should be the current user if chown/remap is done correctly. + uid := "0" + user, err := user.Current() + if user != nil && err == nil { + uid = user.Uid + } + assert.Contains(t, output, "="+uid+"="+uid+"=") + }, + }, "fails with several mappings": { sandboxOpts: []PodSandboxOpts{ WithPodUserNs(containerID, hostID, size), @@ -94,12 +225,14 @@ func TestPodUserNS(t *testing.T) { }, } { t.Run(name, func(t *testing.T) { - cmd := exec.Command("true") - cmd.SysProcAttr = &syscall.SysProcAttr{ - Cloneflags: syscall.CLONE_NEWUSER, + if !supportsUserNS() { + t.Skip("User namespaces are not supported") } - if err := cmd.Run(); err != nil { - t.Skip("skipping test: user namespaces are unavailable") + if !supportsIDMap(defaultRoot) { + t.Skipf("ID mappings are not supported on: %v", defaultRoot) + } + if test.hostVolumes && !supportsIDMap(volumeHostPath) { + t.Skipf("ID mappings are not supported host volume filesystem: %v", volumeHostPath) } testPodLogDir := t.TempDir() From e832605a80793b2a90c3fab2c0625409ed41bc99 Mon Sep 17 00:00:00 2001 From: Rodrigo Campos Date: Fri, 21 Jul 2023 13:31:33 +0200 Subject: [PATCH 06/11] integration: Simplify WithVolumeMount() Signed-off-by: Rodrigo Campos --- integration/main_test.go | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/integration/main_test.go b/integration/main_test.go index 4c9e733ae..53c5d7ce4 100644 --- a/integration/main_test.go +++ b/integration/main_test.go @@ -286,16 +286,7 @@ func WithWindowsResources(r *runtime.WindowsContainerResources) ContainerOpts { } func WithVolumeMount(hostPath, containerPath string) ContainerOpts { - return func(c *runtime.ContainerConfig) { - hostPath, _ = filepath.Abs(hostPath) - containerPath, _ = filepath.Abs(containerPath) - mount := &runtime.Mount{ - HostPath: hostPath, - ContainerPath: containerPath, - SelinuxRelabel: selinux.GetEnabled(), - } - c.Mounts = append(c.Mounts, mount) - } + return WithIDMapVolumeMount(hostPath, containerPath, nil, nil) } func WithIDMapVolumeMount(hostPath, containerPath string, uidMaps, gidMaps []*runtime.IDMapping) ContainerOpts { From a81f80884bd5bb38e70a9af6a3160e3bcec93cdb Mon Sep 17 00:00:00 2001 From: Rodrigo Campos Date: Thu, 13 Apr 2023 11:11:40 +0200 Subject: [PATCH 07/11] Revert "cri: Throw an error if idmap mounts is requested" This reverts commit 7e6ab84884f3ea52ee9f59699967d5578f957261. Signed-off-by: Rodrigo Campos --- pkg/cri/opts/spec_linux_opts.go | 3 --- 1 file changed, 3 deletions(-) diff --git a/pkg/cri/opts/spec_linux_opts.go b/pkg/cri/opts/spec_linux_opts.go index a052a7b54..791f6e4f8 100644 --- a/pkg/cri/opts/spec_linux_opts.go +++ b/pkg/cri/opts/spec_linux_opts.go @@ -162,9 +162,6 @@ func WithMounts(osi osinterface.OS, config *runtime.ContainerConfig, extra []*ru return fmt.Errorf("relabel %q with %q failed: %w", src, mountLabel, err) } } - if mount.UidMappings != nil || mount.GidMappings != nil { - return fmt.Errorf("idmap mounts not yet supported, but they were requested for: %q", src) - } var uidMapping []runtimespec.LinuxIDMapping if mount.UidMappings != nil { From fce1b950768968666f86da9a20dfa5265922924d Mon Sep 17 00:00:00 2001 From: Rodrigo Campos Date: Thu, 24 Aug 2023 12:27:52 +0200 Subject: [PATCH 08/11] go.mod: Update runtime spec to include features.MountExtensions Future patches will use that field. Signed-off-by: Rodrigo Campos --- go.mod | 2 +- go.sum | 4 ++-- integration/client/go.mod | 2 +- integration/client/go.sum | 3 ++- .../specs-go/features/features.go | 24 +++++++++++++++---- .../runtime-spec/specs-go/version.go | 2 +- vendor/modules.txt | 2 +- 7 files changed, 27 insertions(+), 12 deletions(-) diff --git a/go.mod b/go.mod index cc2821c1d..4eeb30ffd 100644 --- a/go.mod +++ b/go.mod @@ -46,7 +46,7 @@ require ( github.com/opencontainers/go-digest v1.0.0 github.com/opencontainers/image-spec v1.1.0-rc4 github.com/opencontainers/runc v1.1.9 - github.com/opencontainers/runtime-spec v1.1.0 + github.com/opencontainers/runtime-spec v1.1.1-0.20230823135140-4fec88fd00a4 github.com/opencontainers/runtime-tools v0.9.1-0.20221107090550-2e043c6bd626 github.com/opencontainers/selinux v1.11.0 github.com/pelletier/go-toml v1.9.5 diff --git a/go.sum b/go.sum index c44d9779c..312814754 100644 --- a/go.sum +++ b/go.sum @@ -799,8 +799,8 @@ github.com/opencontainers/runtime-spec v1.0.2/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/ github.com/opencontainers/runtime-spec v1.0.3-0.20200929063507-e6143ca7d51d/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0= github.com/opencontainers/runtime-spec v1.0.3-0.20210326190908-1c3f411f0417/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0= github.com/opencontainers/runtime-spec v1.0.3-0.20220825212826-86290f6a00fb/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0= -github.com/opencontainers/runtime-spec v1.1.0 h1:HHUyrt9mwHUjtasSbXSMvs4cyFxh+Bll4AjJ9odEGpg= -github.com/opencontainers/runtime-spec v1.1.0/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0= +github.com/opencontainers/runtime-spec v1.1.1-0.20230823135140-4fec88fd00a4 h1:EctkgBjZ1y4q+sibyuuIgiKpa0QSd2elFtSSdNvBVow= +github.com/opencontainers/runtime-spec v1.1.1-0.20230823135140-4fec88fd00a4/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0= github.com/opencontainers/runtime-tools v0.0.0-20181011054405-1d69bd0f9c39/go.mod h1:r3f7wjNzSs2extwzU3Y+6pKfobzPh+kKFJ3ofN+3nfs= github.com/opencontainers/runtime-tools v0.9.1-0.20221107090550-2e043c6bd626 h1:DmNGcqH3WDbV5k8OJ+esPWbqUOX5rMLR2PMvziDMJi0= github.com/opencontainers/runtime-tools v0.9.1-0.20221107090550-2e043c6bd626/go.mod h1:BRHJJd0E+cx42OybVYSgUvZmU0B8P9gZuRXlZUP7TKI= diff --git a/integration/client/go.mod b/integration/client/go.mod index a6e8b9f74..2b6775b89 100644 --- a/integration/client/go.mod +++ b/integration/client/go.mod @@ -14,7 +14,7 @@ require ( github.com/containerd/typeurl/v2 v2.1.1 github.com/opencontainers/go-digest v1.0.0 github.com/opencontainers/image-spec v1.1.0-rc4 - github.com/opencontainers/runtime-spec v1.1.0 + github.com/opencontainers/runtime-spec v1.1.1-0.20230823135140-4fec88fd00a4 github.com/stretchr/testify v1.8.4 go.opentelemetry.io/otel v1.14.0 go.opentelemetry.io/otel/sdk v1.14.0 diff --git a/integration/client/go.sum b/integration/client/go.sum index c845e3655..8cd6d0371 100644 --- a/integration/client/go.sum +++ b/integration/client/go.sum @@ -1469,8 +1469,9 @@ github.com/opencontainers/runtime-spec v1.0.3-0.20200929063507-e6143ca7d51d/go.m github.com/opencontainers/runtime-spec v1.0.3-0.20210326190908-1c3f411f0417/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0= github.com/opencontainers/runtime-spec v1.0.3-0.20220825212826-86290f6a00fb/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0= github.com/opencontainers/runtime-spec v1.1.0-rc.2/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0= -github.com/opencontainers/runtime-spec v1.1.0 h1:HHUyrt9mwHUjtasSbXSMvs4cyFxh+Bll4AjJ9odEGpg= github.com/opencontainers/runtime-spec v1.1.0/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0= +github.com/opencontainers/runtime-spec v1.1.1-0.20230823135140-4fec88fd00a4 h1:EctkgBjZ1y4q+sibyuuIgiKpa0QSd2elFtSSdNvBVow= +github.com/opencontainers/runtime-spec v1.1.1-0.20230823135140-4fec88fd00a4/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0= github.com/opencontainers/runtime-tools v0.0.0-20181011054405-1d69bd0f9c39/go.mod h1:r3f7wjNzSs2extwzU3Y+6pKfobzPh+kKFJ3ofN+3nfs= github.com/opencontainers/runtime-tools v0.9.0/go.mod h1:r3f7wjNzSs2extwzU3Y+6pKfobzPh+kKFJ3ofN+3nfs= github.com/opencontainers/runtime-tools v0.9.1-0.20221107090550-2e043c6bd626 h1:DmNGcqH3WDbV5k8OJ+esPWbqUOX5rMLR2PMvziDMJi0= diff --git a/vendor/github.com/opencontainers/runtime-spec/specs-go/features/features.go b/vendor/github.com/opencontainers/runtime-spec/specs-go/features/features.go index 230e88f56..39009c79d 100644 --- a/vendor/github.com/opencontainers/runtime-spec/specs-go/features/features.go +++ b/vendor/github.com/opencontainers/runtime-spec/specs-go/features/features.go @@ -36,11 +36,12 @@ type Linux struct { // Nil value means "unknown", not "no support for any capability". Capabilities []string `json:"capabilities,omitempty"` - Cgroup *Cgroup `json:"cgroup,omitempty"` - Seccomp *Seccomp `json:"seccomp,omitempty"` - Apparmor *Apparmor `json:"apparmor,omitempty"` - Selinux *Selinux `json:"selinux,omitempty"` - IntelRdt *IntelRdt `json:"intelRdt,omitempty"` + Cgroup *Cgroup `json:"cgroup,omitempty"` + Seccomp *Seccomp `json:"seccomp,omitempty"` + Apparmor *Apparmor `json:"apparmor,omitempty"` + Selinux *Selinux `json:"selinux,omitempty"` + IntelRdt *IntelRdt `json:"intelRdt,omitempty"` + MountExtensions *MountExtensions `json:"mountExtensions,omitempty"` } // Cgroup represents the "cgroup" field. @@ -123,3 +124,16 @@ type IntelRdt struct { // Nil value means "unknown", not "false". Enabled *bool `json:"enabled,omitempty"` } + +// MountExtensions represents the "mountExtensions" field. +type MountExtensions struct { + // IDMap represents the status of idmap mounts support. + IDMap *IDMap `json:"idmap,omitempty"` +} + +type IDMap struct { + // Enabled represents whether idmap mounts supports is compiled in. + // Unrelated to whether the host supports it or not. + // Nil value means "unknown", not "false". + Enabled *bool `json:"enabled,omitempty"` +} diff --git a/vendor/github.com/opencontainers/runtime-spec/specs-go/version.go b/vendor/github.com/opencontainers/runtime-spec/specs-go/version.go index b3fca349c..35358c2c5 100644 --- a/vendor/github.com/opencontainers/runtime-spec/specs-go/version.go +++ b/vendor/github.com/opencontainers/runtime-spec/specs-go/version.go @@ -11,7 +11,7 @@ const ( VersionPatch = 0 // VersionDev indicates development branch. Releases will be empty string. - VersionDev = "" + VersionDev = "+dev" ) // Version is the specification version that the package types support. diff --git a/vendor/modules.txt b/vendor/modules.txt index 9746f82ae..83f0c469e 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -343,7 +343,7 @@ github.com/opencontainers/image-spec/specs-go/v1 # github.com/opencontainers/runc v1.1.9 ## explicit; go 1.17 github.com/opencontainers/runc/libcontainer/user -# github.com/opencontainers/runtime-spec v1.1.0 +# github.com/opencontainers/runtime-spec v1.1.1-0.20230823135140-4fec88fd00a4 ## explicit github.com/opencontainers/runtime-spec/specs-go github.com/opencontainers/runtime-spec/specs-go/features From 2e13d39546cf1e2710d561589a9b9dfe0ec7b6d5 Mon Sep 17 00:00:00 2001 From: Rodrigo Campos Date: Thu, 17 Aug 2023 12:31:21 +0200 Subject: [PATCH 09/11] pkg/process: Only use idmap mounts if runc supports it runc, as mandated by the runtime-spec, ignores unknown fields in the config.json. This is unfortunate for cases where we _must_ enable that feature or fail. For example, if we want to start a container with user namespaces and volumes, using the uidMappings/gidMappings field is needed so the UID/GIDs in the volume don't end up with garbage. However, if we don't fail when runc will ignore these fields (because they are unknown to runc), we will just start a container without using the mappings and the UID/GIDs the container will persist to volumes the hostUID/GID, that can change if the container is re-scheduled by Kubernetes. This will end up in volumes having "garbage" and unmapped UIDs that the container can no longer change. So, let's avoid this entirely by just checking that runc supports idmap mounts if the container we are about to create needs them. Please note that the "runc features" subcommand is only run when we are using idmap mounts. If idmap mounts are not used, the subcommand is not run and therefore this should not affect containers that don't use idmap mounts in any way. Signed-off-by: Rodrigo Campos --- integration/pod_userns_linux_test.go | 25 ++++++++++++ pkg/process/init.go | 58 ++++++++++++++++++++++++++++ pkg/process/utils.go | 24 ++++++++++++ 3 files changed, 107 insertions(+) diff --git a/integration/pod_userns_linux_test.go b/integration/pod_userns_linux_test.go index 9553a48cc..0980ec664 100644 --- a/integration/pod_userns_linux_test.go +++ b/integration/pod_userns_linux_test.go @@ -17,6 +17,8 @@ package integration import ( + "context" + "errors" "fmt" "os" "os/user" @@ -27,6 +29,7 @@ import ( "time" "github.com/containerd/containerd/integration/images" + runc "github.com/containerd/go-runc" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" exec "golang.org/x/sys/execabs" @@ -234,6 +237,9 @@ func TestPodUserNS(t *testing.T) { if test.hostVolumes && !supportsIDMap(volumeHostPath) { t.Skipf("ID mappings are not supported host volume filesystem: %v", volumeHostPath) } + if err := supportsRuncIDMap(); err != nil { + t.Skipf("OCI runtime doesn't support idmap mounts: %v", err) + } testPodLogDir := t.TempDir() sandboxOpts := append(test.sandboxOpts, WithPodLogDirectory(testPodLogDir)) @@ -297,3 +303,22 @@ func TestPodUserNS(t *testing.T) { }) } } + +func supportsRuncIDMap() error { + var r runc.Runc + features, err := r.Features(context.Background()) + if err != nil { + // If the features command is not implemented, then runc is too old. + return fmt.Errorf("features command failed: %w", err) + } + + if features.Linux.MountExtensions == nil || features.Linux.MountExtensions.IDMap == nil { + return errors.New("missing `mountExtensions.idmap` entry in `features` command") + + } + if enabled := features.Linux.MountExtensions.IDMap.Enabled; enabled == nil || !*enabled { + return errors.New("idmap mounts not supported") + } + + return nil +} diff --git a/pkg/process/init.go b/pkg/process/init.go index 00343f854..13de95a57 100644 --- a/pkg/process/init.go +++ b/pkg/process/init.go @@ -21,6 +21,7 @@ package process import ( "context" "encoding/json" + "errors" "fmt" "io" "os" @@ -140,6 +141,13 @@ func (p *Init) Create(ctx context.Context, r *CreateConfig) error { if socket != nil { opts.ConsoleSocket = socket } + + // runc ignores silently features it doesn't know about, so for things that this is + // problematic let's check if this runc version supports them. + if err := p.validateRuncFeatures(ctx, r.Bundle); err != nil { + return fmt.Errorf("failed to detect OCI runtime features: %w", err) + } + if err := p.runtime.Create(ctx, r.ID, r.Bundle, opts); err != nil { return p.runtimeError(err, "OCI runtime create failed") } @@ -173,6 +181,56 @@ func (p *Init) Create(ctx context.Context, r *CreateConfig) error { return nil } +func (p *Init) validateRuncFeatures(ctx context.Context, bundle string) error { + // TODO: We should remove the logic from here and rebase on #8509. + // This way we can avoid the call to readConfig() here and the call to p.runtime.Features() + // in validateIDMapMounts(). + // But that PR is not yet merged nor it is clear if it will be refactored. + // Do this contained hack for now. + spec, err := readConfig(bundle) + if err != nil { + return fmt.Errorf("failed to read config: %w", err) + } + + if err := p.validateIDMapMounts(ctx, spec); err != nil { + return fmt.Errorf("OCI runtime doesn't support idmap mounts: %w", err) + } + + return nil +} + +func (p *Init) validateIDMapMounts(ctx context.Context, spec *specs.Spec) error { + var used bool + for _, m := range spec.Mounts { + if m.UIDMappings != nil || m.GIDMappings != nil { + used = true + break + } + } + + if !used { + return nil + } + + // From here onwards, we require idmap mounts. So if we fail to check, we return an error. + features, err := p.runtime.Features(ctx) + if err != nil { + // If the features command is not implemented, then runc is too old. + return fmt.Errorf("features command failed: %w", err) + + } + + if features.Linux.MountExtensions == nil || features.Linux.MountExtensions.IDMap == nil { + return errors.New("missing `mountExtensions.idmap` entry in `features` command") + } + + if enabled := features.Linux.MountExtensions.IDMap.Enabled; enabled == nil || !*enabled { + return errors.New("idmap mounts not supported") + } + + return nil +} + func (p *Init) openStdin(path string) error { sc, err := fifo.OpenFifo(context.Background(), path, unix.O_WRONLY|unix.O_NONBLOCK, 0) if err != nil { diff --git a/pkg/process/utils.go b/pkg/process/utils.go index 34b1f633d..58f9a21d2 100644 --- a/pkg/process/utils.go +++ b/pkg/process/utils.go @@ -21,6 +21,7 @@ package process import ( "context" "encoding/json" + "errors" "fmt" "io" "os" @@ -31,6 +32,7 @@ import ( "github.com/containerd/containerd/errdefs" runc "github.com/containerd/go-runc" + specs "github.com/opencontainers/runtime-spec/specs-go" "golang.org/x/sys/unix" ) @@ -39,6 +41,8 @@ const ( RuncRoot = "/run/containerd/runc" // InitPidFile name of the file that contains the init pid InitPidFile = "init.pid" + // configFile is the name of the runc config file + configFile = "config.json" ) // safePid is a thread safe wrapper for pid. @@ -184,3 +188,23 @@ func stateName(v interface{}) string { } panic(fmt.Errorf("invalid state %v", v)) } + +func readConfig(path string) (spec *specs.Spec, err error) { + cfg := filepath.Join(path, configFile) + f, err := os.Open(cfg) + if err != nil { + if os.IsNotExist(err) { + return nil, fmt.Errorf("JSON specification file %s not found", cfg) + } + return nil, err + } + defer f.Close() + + if err = json.NewDecoder(f).Decode(&spec); err != nil { + return nil, fmt.Errorf("failed to parse config: %w", err) + } + if spec == nil { + return nil, errors.New("config cannot be null") + } + return spec, nil +} From 967313049f3746f9125d385061bffeaf3cac911d Mon Sep 17 00:00:00 2001 From: Rodrigo Campos Date: Thu, 7 Sep 2023 12:33:28 +0200 Subject: [PATCH 10/11] doc: Add documentation about CRI user namespaces Signed-off-by: Rodrigo Campos --- RELEASES.md | 2 +- docs/user-namespaces/README.md | 146 +++++++++++++++++++++++ docs/user-namespaces/config.json | 199 +++++++++++++++++++++++++++++++ 3 files changed, 346 insertions(+), 1 deletion(-) create mode 100644 docs/user-namespaces/README.md create mode 100644 docs/user-namespaces/config.json diff --git a/RELEASES.md b/RELEASES.md index 086ba6f1f..8c8964a9c 100644 --- a/RELEASES.md +++ b/RELEASES.md @@ -461,4 +461,4 @@ more quickly. | [NRI in CRI Support](https://github.com/containerd/containerd/pull/6019) | containerd v1.7 | containerd v2.0 | | [gRPC Shim](https://github.com/containerd/containerd/pull/8052) | containerd v1.7 | containerd v2.0 | | [CRI Runtime Specific Snapshotter](https://github.com/containerd/containerd/pull/6899) | containerd v1.7 | containerd v2.0 | -| [CRI Support for User Namespaces](https://github.com/containerd/containerd/pull/7679) | containerd v1.7 | containerd v2.0 | +| [CRI Support for User Namespaces](./docs/user-namespaces/README.md) | containerd v1.7 | containerd v2.0 | diff --git a/docs/user-namespaces/README.md b/docs/user-namespaces/README.md new file mode 100644 index 000000000..92e83c454 --- /dev/null +++ b/docs/user-namespaces/README.md @@ -0,0 +1,146 @@ +# Support for user namespaces + +Kubernetes supports running pods with user namespace since v1.25. This document explains the +containerd support for this feature. + +## What are user namespaces? + +A user namespace isolates the user running inside the container from the one in the host. + +A process running as root in a container can run as a different (non-root) user in the host; in +other words, the process has full privileges for operations inside the user namespace, but is +unprivileged for operations outside the namespace. + +You can use this feature to reduce the damage a compromised container can do to the host or other +pods in the same node. There are several security vulnerabilities rated either HIGH or CRITICAL that +were not exploitable when user namespaces is active. It is expected user namespace will mitigate +some future vulnerabilities too. + +See [the kubernetes documentation][kube-intro] for a high-level introduction to +user namespaces. + +[kube-intro]: https://kubernetes.io/docs/concepts/workloads/pods/user-namespaces/#introduction + +## Stack requirements + +The Kubernetes implementation was redesigned in 1.27, so the requirements are different for versions +pre and post Kubernetes 1.27. + +Please note that if you try to use user namespaces with containerd 1.6 or older, the `hostUsers: +false` setting in your pod.spec will be **silently ignored**. + +### Kubernetes 1.25 and 1.26 + + * Containerd 1.7 or greater + * runc 1.1 or greater + +### Kubernetes 1.27 and greater + + * Linux 6.3 or greater + * Containerd 2.0 or greater + * You can use runc or crun as the OCI runtime: + * runc 1.2 or greater + * crun 1.9 or greater + +Furthermore, all the file-systems used by the volumes in the pod need kernel-support for idmap +mounts. Some popular file-systems that support idmap mounts in Linux 6.3 are: `btrfs`, `ext4`, `xfs`, +`fat`, `tmpfs`, `overlayfs`. + +The kubelet is in charge of populating some files to the containers (like configmap, secrets, etc.). +The file-system used in that path needs to support idmap mounts too. See [the Kubernetes +documentation][kube-req] for more info on that. + + +[kube-req]: https://kubernetes.io/docs/concepts/workloads/pods/user-namespaces/#before-you-begin + +## Creating a Kubernetes pod with user namespaces + +First check your containerd, Linux and Kubernetes versions. If those are okay, then there is no +special configuration needed on conntainerd. You can just follow the steps in the [Kubernetes +website][kube-example]. + +[kube-example]: https://kubernetes.io/docs/tasks/configure-pod-container/user-namespaces/ + +# Limitations + +You can check the limitations Kubernetes has [here][kube-limitations]. Note that different +Kubernetes versions have different limitations, be sure to check the site for the Kubernetes version +you are using. + +Different containerd versions have different limitations too, those are highlighted in this section. + +[kube-limitations]: https://kubernetes.io/docs/concepts/workloads/pods/user-namespaces/#limitations + +### containerd 1.7 + +One limitation present in containerd 1.7 is that it needs to change the ownership of every file and +directory inside the container image, during Pod startup. This means it has a storage overhead (the +size of the container image is duplicated each time a pod is created) and can significantly impact +the container startup latency. + +You can mitigate this limitation by switching `/sys/module/overlay/parameters/metacopy` to `Y`. This +will significantly reduce the storage and performance overhead, as only the inode for each file of +the container image will be duplicated, but not the content of the file. This means it will use less +storage and it will be faster. However, it is not a panacea. + +If you change the metacopy param, make sure to do it in a way that is persistant across reboots. You +should also be aware that this setting will be used for all containers, not just containers with +user namespaces enabled. This will affect all the snapshots that you take manually (if you happen to +do that). In that case, make sure to use the same value of `/sys/module/overlay/parameters/metacopy` +when creating and restoring the snapshot. + +### containerd 2.0 + +The storage and latency limitation from containerd 1.7 are not present in container 2.0 and above, +if you use the overlay snapshotter (this is used by default). It will not use more storage at all, +and there is no startup latency. + +This is achieved by using the kernel feature idmap mounts with the container rootfs (the container +image). This allows an overlay file-system to expose the image with different UID/GID without copying +the files nor the inodes, just using a bind-mount. + +You can check if you are using idmap mounts for the container image if you create a pod with user +namespaces, exec into it and run: + +``` +mount | grep overlay +``` + +You should see a reference to the idmap mount in the `lowerdir` parameter, in this case we can see +`idmapped` used there: + +``` +overlay on / type overlay (rw,relatime,lowerdir=/tmp/ovl-idmapped823885363/0,upperdir=/var/lib/containerd/io.containerd.snapshotter.v1.overlayfs/snapshots/1018/fs,workdir=/var/lib/containerd/io.containerd.snapshotter.v1.overlayfs/snapshots/1018/work) +``` + +## Creating a container with user namespaces with `ctr` + +You can also create a container with user namespaces using `ctr`. This is more low-level, be warned. + +Create an OCI bundle as explained [here][runc-bundle]. Then, change the UID/GID to 65536: + +``` +sudo chown -R 65536:65536 rootfs/ +``` + +Copy [this config.json](./config.json) and replace `XXX-path-to-rootfs` with the +absolute path to the rootfs you just chowned. + +Then create and start the container with: + +``` +sudo ctr create --config /config.json userns-test +sudo ctr t start userns-test +``` + +This will open a shell inside the container. You can run this, to verify you are inside a user +namespace: + +``` +root@runc:/# cat /proc/self/uid_map + 0 65536 65536 +``` + +The output should be exactly the same. + +[runc-bundle]: https://github.com/opencontainers/runc#creating-an-oci-bundle diff --git a/docs/user-namespaces/config.json b/docs/user-namespaces/config.json new file mode 100644 index 000000000..246876388 --- /dev/null +++ b/docs/user-namespaces/config.json @@ -0,0 +1,199 @@ +{ + "ociVersion": "1.0.2-dev", + "process": { + "terminal": true, + "user": { + "uid": 0, + "gid": 0 + }, + "args": [ + "bash" + ], + "env": [ + "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", + "TERM=xterm" + ], + "cwd": "/", + "capabilities": { + "bounding": [ + "CAP_AUDIT_WRITE", + "CAP_KILL", + "CAP_NET_BIND_SERVICE" + ], + "effective": [ + "CAP_AUDIT_WRITE", + "CAP_KILL", + "CAP_NET_BIND_SERVICE" + ], + "inheritable": [ + "CAP_AUDIT_WRITE", + "CAP_KILL", + "CAP_NET_BIND_SERVICE" + ], + "permitted": [ + "CAP_AUDIT_WRITE", + "CAP_KILL", + "CAP_NET_BIND_SERVICE" + ], + "ambient": [ + "CAP_AUDIT_WRITE", + "CAP_KILL", + "CAP_NET_BIND_SERVICE" + ] + }, + "rlimits": [ + { + "type": "RLIMIT_NOFILE", + "hard": 1024, + "soft": 1024 + } + ], + "noNewPrivileges": true + }, + "root": { + "path": "XXX-path-to-rootfs" + }, + "hostname": "runc", + "mounts": [ + { + "destination": "/proc", + "type": "proc", + "source": "proc" + }, + { + "destination": "/dev", + "type": "tmpfs", + "source": "tmpfs", + "options": [ + "nosuid", + "strictatime", + "mode=755", + "size=65536k" + ] + }, + { + "destination": "/dev/pts", + "type": "devpts", + "source": "devpts", + "options": [ + "nosuid", + "noexec", + "newinstance", + "ptmxmode=0666", + "mode=0620", + "gid=5" + ] + }, + { + "destination": "/dev/shm", + "type": "tmpfs", + "source": "shm", + "options": [ + "nosuid", + "noexec", + "nodev", + "mode=1777", + "size=65536k" + ] + }, + { + "destination": "/dev/mqueue", + "type": "mqueue", + "source": "mqueue", + "options": [ + "nosuid", + "noexec", + "nodev" + ] + }, + { + "destination": "/sys", + "type": "sysfs", + "source": "sysfs", + "options": [ + "nosuid", + "noexec", + "nodev", + "ro" + ] + }, + { + "destination": "/sys/fs/cgroup", + "type": "cgroup", + "source": "cgroup", + "options": [ + "nosuid", + "noexec", + "nodev", + "relatime", + "ro" + ] + } + ], + "linux": { + "uidMappings": [ + { + "containerID": 0, + "hostID": 65536, + "size": 65536 + } + ], + "gidMappings": [ + { + "containerID": 0, + "hostID": 65536, + "size": 65536 + } + ], + "resources": { + "devices": [ + { + "allow": false, + "access": "rwm" + } + ] + }, + "namespaces": [ + { + "type": "pid" + }, + { + "type": "network" + }, + { + "type": "ipc" + }, + { + "type": "uts" + }, + { + "type": "mount" + }, + { + "type": "cgroup" + }, + { + "type": "user" + } + ], + "maskedPaths": [ + "/proc/acpi", + "/proc/asound", + "/proc/kcore", + "/proc/keys", + "/proc/latency_stats", + "/proc/timer_list", + "/proc/timer_stats", + "/proc/sched_debug", + "/sys/firmware", + "/proc/scsi" + ], + "readonlyPaths": [ + "/proc/bus", + "/proc/fs", + "/proc/irq", + "/proc/sys", + "/proc/sysrq-trigger" + ] + } +} From 83240a4f77623a55d344c28c0d3715e4baf868fa Mon Sep 17 00:00:00 2001 From: Rodrigo Campos Date: Thu, 7 Sep 2023 15:28:28 +0200 Subject: [PATCH 11/11] Bump crun to 1.9 crun 1.9 was just released with fixes and exposes idmap mounts support via the "features" sub-command. We use that feature to throw a clear error to users (if they request idmap mounts and the OCI runtime doesn't support it), but also to skip tests on CI when the OCI runtime doesn't support it. Let's bump it so the CI runs the tests with crun. Signed-off-by: Rodrigo Campos --- script/setup/crun-version | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/script/setup/crun-version b/script/setup/crun-version index 88d3ee790..2e0e38c63 100644 --- a/script/setup/crun-version +++ b/script/setup/crun-version @@ -1 +1 @@ -1.8.7 +1.9