From 97dfa7f55697650225c45aa4cf8e7082ecd168af Mon Sep 17 00:00:00 2001 From: Rodrigo Campos Date: Mon, 23 Jan 2023 18:35:28 +0100 Subject: [PATCH] cri/server: Pass down uidMappings to OCI runtime When the kubelet sends the uid/gid mappings for a mount, just pass them down to the OCI runtime. OCI runtimes support this since runc 1.2 and crun 1.8.1. And whenever we add mounts (container mounts or image spec volumes) and userns are requested by the kubelet, we use those mappings in the mounts so the mounts are idmapped correctly. If no userns is used, we don't send any mappings which just keeps the current behavior. Signed-off-by: Rodrigo Campos --- pkg/cri/opts/spec_linux_opts.go | 23 +++++++++++++++++++++++ pkg/cri/server/container_create.go | 15 +++++++++++++-- pkg/cri/server/container_create_linux.go | 22 ++++++++++++++++++++++ pkg/cri/server/container_create_test.go | 3 ++- 4 files changed, 60 insertions(+), 3 deletions(-) diff --git a/pkg/cri/opts/spec_linux_opts.go b/pkg/cri/opts/spec_linux_opts.go index 8af03daf7..a052a7b54 100644 --- a/pkg/cri/opts/spec_linux_opts.go +++ b/pkg/cri/opts/spec_linux_opts.go @@ -166,11 +166,34 @@ func WithMounts(osi osinterface.OS, config *runtime.ContainerConfig, extra []*ru return fmt.Errorf("idmap mounts not yet supported, but they were requested for: %q", src) } + var uidMapping []runtimespec.LinuxIDMapping + if mount.UidMappings != nil { + for _, mapping := range mount.UidMappings { + uidMapping = append(uidMapping, runtimespec.LinuxIDMapping{ + HostID: mapping.HostId, + ContainerID: mapping.ContainerId, + Size: mapping.Length, + }) + } + } + var gidMapping []runtimespec.LinuxIDMapping + if mount.GidMappings != nil { + for _, mapping := range mount.GidMappings { + gidMapping = append(gidMapping, runtimespec.LinuxIDMapping{ + HostID: mapping.HostId, + ContainerID: mapping.ContainerId, + Size: mapping.Length, + }) + } + } + s.Mounts = append(s.Mounts, runtimespec.Mount{ Source: src, Destination: dst, Type: "bind", Options: options, + UIDMappings: uidMapping, + GIDMappings: gidMapping, }) } return nil diff --git a/pkg/cri/server/container_create.go b/pkg/cri/server/container_create.go index 4cc8defae..15efb3e10 100644 --- a/pkg/cri/server/container_create.go +++ b/pkg/cri/server/container_create.go @@ -143,7 +143,7 @@ func (c *criService) CreateContainer(ctx context.Context, r *runtime.CreateConta var volumeMounts []*runtime.Mount if !c.config.IgnoreImageDefinedVolumes { // Create container image volumes mounts. - volumeMounts = c.volumeMounts(containerRootDir, config.GetMounts(), &image.ImageSpec.Config) + volumeMounts = c.volumeMounts(containerRootDir, config, &image.ImageSpec.Config) } else if len(image.ImageSpec.Config.Volumes) != 0 { log.G(ctx).Debugf("Ignoring volumes defined in image %v because IgnoreImageDefinedVolumes is set", image.ID) } @@ -318,10 +318,19 @@ func (c *criService) CreateContainer(ctx context.Context, r *runtime.CreateConta // volumeMounts sets up image volumes for container. Rely on the removal of container // root directory to do cleanup. Note that image volume will be skipped, if there is criMounts // specified with the same destination. -func (c *criService) volumeMounts(containerRootDir string, criMounts []*runtime.Mount, config *imagespec.ImageConfig) []*runtime.Mount { +func (c *criService) volumeMounts(containerRootDir string, containerConfig *runtime.ContainerConfig, config *imagespec.ImageConfig) []*runtime.Mount { if len(config.Volumes) == 0 { return nil } + var uidMappings, gidMappings []*runtime.IDMapping + if goruntime.GOOS != "windows" { + if usernsOpts := containerConfig.GetLinux().GetSecurityContext().GetNamespaceOptions().GetUsernsOptions(); usernsOpts != nil { + uidMappings = usernsOpts.GetUids() + gidMappings = usernsOpts.GetGids() + } + } + + criMounts := containerConfig.GetMounts() var mounts []*runtime.Mount for dst := range config.Volumes { if isInCRIMounts(dst, criMounts) { @@ -343,6 +352,8 @@ func (c *criService) volumeMounts(containerRootDir string, criMounts []*runtime. ContainerPath: dst, HostPath: src, SelinuxRelabel: true, + UidMappings: uidMappings, + GidMappings: gidMappings, }) } return mounts diff --git a/pkg/cri/server/container_create_linux.go b/pkg/cri/server/container_create_linux.go index d01c2c2e6..1d7c66ccf 100644 --- a/pkg/cri/server/container_create_linux.go +++ b/pkg/cri/server/container_create_linux.go @@ -62,6 +62,12 @@ const ( func (c *criService) containerMounts(sandboxID string, config *runtime.ContainerConfig) []*runtime.Mount { var mounts []*runtime.Mount securityContext := config.GetLinux().GetSecurityContext() + var uidMappings, gidMappings []*runtime.IDMapping + if usernsOpts := securityContext.GetNamespaceOptions().GetUsernsOptions(); usernsOpts != nil { + uidMappings = usernsOpts.GetUids() + gidMappings = usernsOpts.GetGids() + } + if !isInCRIMounts(etcHostname, config.GetMounts()) { // /etc/hostname is added since 1.1.6, 1.2.4 and 1.3. // For in-place upgrade, the old sandbox doesn't have the hostname file, @@ -75,6 +81,8 @@ func (c *criService) containerMounts(sandboxID string, config *runtime.Container HostPath: hostpath, Readonly: securityContext.GetReadonlyRootfs(), SelinuxRelabel: true, + UidMappings: uidMappings, + GidMappings: gidMappings, }) } } @@ -85,6 +93,8 @@ func (c *criService) containerMounts(sandboxID string, config *runtime.Container HostPath: c.getSandboxHosts(sandboxID), Readonly: securityContext.GetReadonlyRootfs(), SelinuxRelabel: true, + UidMappings: uidMappings, + GidMappings: gidMappings, }) } @@ -96,6 +106,8 @@ func (c *criService) containerMounts(sandboxID string, config *runtime.Container HostPath: c.getResolvPath(sandboxID), Readonly: securityContext.GetReadonlyRootfs(), SelinuxRelabel: true, + UidMappings: uidMappings, + GidMappings: gidMappings, }) } @@ -109,6 +121,16 @@ func (c *criService) containerMounts(sandboxID string, config *runtime.Container HostPath: sandboxDevShm, Readonly: false, SelinuxRelabel: sandboxDevShm != devShm, + // XXX: tmpfs support for idmap mounts got merged in + // Linux 6.3. + // Our CI runs with 5.15 kernels, so disabling idmap + // mounts for this case makes the CI happy (the other fs + // used support idmap mounts in 5.15 kernels). + // We can enable this at a later stage, but as this + // tmpfs mount is exposed empty to the container (no + // prepopulated files) and using the hostIPC with userns + // is blocked by k8s, we can just avoid using the + // mappings and it should work fine. }) } return mounts diff --git a/pkg/cri/server/container_create_test.go b/pkg/cri/server/container_create_test.go index df65106cd..94bf0c4ae 100644 --- a/pkg/cri/server/container_create_test.go +++ b/pkg/cri/server/container_create_test.go @@ -279,8 +279,9 @@ func TestVolumeMounts(t *testing.T) { config := &imagespec.ImageConfig{ Volumes: test.imageVolumes, } + containerConfig := &runtime.ContainerConfig{Mounts: test.criMounts} c := newTestCRIService() - got := c.volumeMounts(testContainerRootDir, test.criMounts, config) + got := c.volumeMounts(testContainerRootDir, containerConfig, config) assert.Len(t, got, len(test.expectedMountDest)) for _, dest := range test.expectedMountDest { found := false