From e0f8c04dad213975bf55dcfa71986be6d8b489e2 Mon Sep 17 00:00:00 2001 From: Mikko Ylinen Date: Tue, 9 Mar 2021 09:33:03 +0200 Subject: [PATCH] cri: Devices ownership from SecurityContext CRI container runtimes mount devices (set via kubernetes device plugins) to containers by taking the host user/group IDs (uid/gid) to the corresponding container device. This triggers a problem when trying to run those containers with non-zero (root uid/gid = 0) uid/gid set via runAsUser/runAsGroup: the container process has no permission to use the device even when its gid is permissive to non-root users because the container user does not belong to that group. It is possible to workaround the problem by manually adding the device gid(s) to supplementalGroups. However, this is also problematic because the device gid(s) may have different values depending on the workers' distro/version in the cluster. This patch suggests to take RunAsUser/RunAsGroup set via SecurityContext as the device UID/GID, respectively. The feature must be enabled by setting device_ownership_from_security_context runtime config value to true (valid on Linux only). Signed-off-by: Mikko Ylinen --- oci/spec_opts_unix.go | 2 +- pkg/cri/config/config.go | 3 + pkg/cri/opts/spec_linux.go | 44 ++++++++++++- pkg/cri/server/container_create_linux.go | 4 +- pkg/cri/server/container_create_linux_test.go | 66 +++++++++++++++++++ 5 files changed, 115 insertions(+), 4 deletions(-) diff --git a/oci/spec_opts_unix.go b/oci/spec_opts_unix.go index 609684a75..9d03091aa 100644 --- a/oci/spec_opts_unix.go +++ b/oci/spec_opts_unix.go @@ -37,7 +37,7 @@ func WithHostDevices(_ context.Context, _ Client, _ *containers.Container, s *Sp return nil } -// WithDevices recursively adds devices from the passed in path and associated cgroup rules for that device. +// WithDevices recursively adds devices from the passed in path. // If devicePath is a dir it traverses the dir to add all devices in that dir. // If devicePath is not a dir, it attempts to add the single device. func WithDevices(devicePath, containerPath, permissions string) SpecOpts { diff --git a/pkg/cri/config/config.go b/pkg/cri/config/config.go index b1c04add3..286738e3c 100644 --- a/pkg/cri/config/config.go +++ b/pkg/cri/config/config.go @@ -258,6 +258,9 @@ type PluginConfig struct { // present in /sys/fs/cgroup/cgroup.controllers. // This helps with running rootless mode + cgroup v2 + systemd but without hugetlb delegation. DisableHugetlbController bool `toml:"disable_hugetlb_controller" json:"disableHugetlbController"` + // DeviceOwnershipFromSecurityContext changes the default behavior of setting container devices uid/gid + // from CRI's SecurityContext (RunAsUser/RunAsGroup) instead of taking host's uid/gid. Defaults to false. + DeviceOwnershipFromSecurityContext bool `toml:"device_ownership_from_security_context" json:"device_ownership_from_security_context"` // IgnoreImageDefinedVolumes ignores volumes defined by the image. Useful for better resource // isolation, security and early detection of issues in the mount configuration when using // ReadOnlyRootFilesystem since containers won't silently mount a temporary volume. diff --git a/pkg/cri/opts/spec_linux.go b/pkg/cri/opts/spec_linux.go index bc30d0caa..995a36669 100644 --- a/pkg/cri/opts/spec_linux.go +++ b/pkg/cri/opts/spec_linux.go @@ -285,8 +285,30 @@ func ensureSharedOrSlave(path string, lookupMount func(string) (mount.Info, erro return errors.Errorf("path %q is mounted on %q but it is not a shared or slave mount", path, mountInfo.Mountpoint) } +// getDeviceUserGroupID() is used to find the right uid/gid +// value for the device node created in the container namespace. +// The runtime executes mknod() and chmod()s the created +// device with the values returned here. +// +// On Linux, uid and gid are sufficient and the user/groupname do not +// need to be resolved. +// +// TODO(mythi): In case of user namespaces, the runtime simply bind +// mounts the devices from the host. Additional logic is needed +// to check that the runtimes effective UID/GID on the host has the +// permissions to access the device node and/or the right user namespace +// mappings are created. +// +// Ref: https://github.com/kubernetes/kubernetes/issues/92211 +func getDeviceUserGroupID(runAsVal *runtime.Int64Value) uint32 { + if runAsVal != nil { + return uint32(runAsVal.GetValue()) + } + return 0 +} + // WithDevices sets the provided devices onto the container spec -func WithDevices(osi osinterface.OS, config *runtime.ContainerConfig) oci.SpecOpts { +func WithDevices(osi osinterface.OS, config *runtime.ContainerConfig, enableDeviceOwnershipFromSecurityContext bool) oci.SpecOpts { return func(ctx context.Context, client oci.Client, c *containers.Container, s *runtimespec.Spec) (err error) { if s.Linux == nil { s.Linux = &runtimespec.Linux{} @@ -295,6 +317,8 @@ func WithDevices(osi osinterface.OS, config *runtime.ContainerConfig) oci.SpecOp s.Linux.Resources = &runtimespec.LinuxResources{} } + oldDevices := len(s.Linux.Devices) + for _, device := range config.GetDevices() { path, err := osi.ResolveSymbolicLink(device.HostPath) if err != nil { @@ -306,6 +330,24 @@ func WithDevices(osi osinterface.OS, config *runtime.ContainerConfig) oci.SpecOp return err } } + + if enableDeviceOwnershipFromSecurityContext { + UID := getDeviceUserGroupID(config.GetLinux().GetSecurityContext().GetRunAsUser()) + GID := getDeviceUserGroupID(config.GetLinux().GetSecurityContext().GetRunAsGroup()) + // Loop all new devices added by oci.WithDevices() to update their + // dev.UID/dev.GID. + // + // non-zero UID/GID from SecurityContext is used to override host's + // device UID/GID for the container. + for idx := oldDevices; idx < len(s.Linux.Devices); idx++ { + if UID != 0 { + *s.Linux.Devices[idx].UID = UID + } + if GID != 0 { + *s.Linux.Devices[idx].GID = GID + } + } + } return nil } } diff --git a/pkg/cri/server/container_create_linux.go b/pkg/cri/server/container_create_linux.go index d662c8add..d1168d8f5 100644 --- a/pkg/cri/server/container_create_linux.go +++ b/pkg/cri/server/container_create_linux.go @@ -222,11 +222,11 @@ func (c *criService) containerSpec( specOpts = append(specOpts, oci.WithHostDevices, oci.WithAllDevicesAllowed) } else { // add requested devices by the config as host devices are not automatically added - specOpts = append(specOpts, customopts.WithDevices(c.os, config), + specOpts = append(specOpts, customopts.WithDevices(c.os, config, c.config.DeviceOwnershipFromSecurityContext), customopts.WithCapabilities(securityContext, c.allCaps)) } } else { // not privileged - specOpts = append(specOpts, customopts.WithDevices(c.os, config), + specOpts = append(specOpts, customopts.WithDevices(c.os, config, c.config.DeviceOwnershipFromSecurityContext), customopts.WithCapabilities(securityContext, c.allCaps)) } diff --git a/pkg/cri/server/container_create_linux_test.go b/pkg/cri/server/container_create_linux_test.go index 43af0aae6..4cdc9bbdb 100644 --- a/pkg/cri/server/container_create_linux_test.go +++ b/pkg/cri/server/container_create_linux_test.go @@ -1304,6 +1304,72 @@ func TestGenerateUserString(t *testing.T) { } } +func TestNonRootUserAndDevices(t *testing.T) { + testPid := uint32(1234) + c := newTestCRIService() + testSandboxID := "sandbox-id" + testContainerName := "container-name" + containerConfig, sandboxConfig, imageConfig, _ := getCreateContainerTestData() + + hostDevicesRaw, err := oci.HostDevices() + assert.NoError(t, err) + + testDevice := hostDevicesRaw[0] + + for desc, test := range map[string]struct { + uid, gid *runtime.Int64Value + deviceOwnershipFromSecurityContext bool + expectedDeviceUID uint32 + expectedDeviceGID uint32 + }{ + "expect non-root container's Devices Uid/Gid to be the same as the device Uid/Gid on the host when deviceOwnershipFromSecurityContext is disabled": { + uid: &runtime.Int64Value{Value: 1}, + gid: &runtime.Int64Value{Value: 10}, + expectedDeviceUID: *testDevice.UID, + expectedDeviceGID: *testDevice.GID, + }, + "expect root container's Devices Uid/Gid to be the same as the device Uid/Gid on the host when deviceOwnershipFromSecurityContext is disabled": { + uid: &runtime.Int64Value{Value: 0}, + gid: &runtime.Int64Value{Value: 0}, + expectedDeviceUID: *testDevice.UID, + expectedDeviceGID: *testDevice.GID, + }, + "expect non-root container's Devices Uid/Gid to be the same as RunAsUser/RunAsGroup when deviceOwnershipFromSecurityContext is enabled": { + uid: &runtime.Int64Value{Value: 1}, + gid: &runtime.Int64Value{Value: 10}, + deviceOwnershipFromSecurityContext: true, + expectedDeviceUID: 1, + expectedDeviceGID: 10, + }, + "expect root container's Devices Uid/Gid to be the same as the device Uid/Gid on the host when deviceOwnershipFromSecurityContext is enabled": { + uid: &runtime.Int64Value{Value: 0}, + gid: &runtime.Int64Value{Value: 0}, + deviceOwnershipFromSecurityContext: true, + expectedDeviceUID: *testDevice.UID, + expectedDeviceGID: *testDevice.GID, + }, + } { + t.Logf("TestCase %q", desc) + + c.config.DeviceOwnershipFromSecurityContext = test.deviceOwnershipFromSecurityContext + containerConfig.Linux.SecurityContext.RunAsUser = test.uid + containerConfig.Linux.SecurityContext.RunAsGroup = test.gid + containerConfig.Devices = []*runtime.Device{ + { + ContainerPath: testDevice.Path, + HostPath: testDevice.Path, + Permissions: "r", + }, + } + + spec, err := c.containerSpec(t.Name(), testSandboxID, testPid, "", testContainerName, testImageName, containerConfig, sandboxConfig, imageConfig, nil, config.Runtime{}) + assert.NoError(t, err) + + assert.Equal(t, test.expectedDeviceUID, *spec.Linux.Devices[0].UID) + assert.Equal(t, test.expectedDeviceGID, *spec.Linux.Devices[0].GID) + } +} + func TestPrivilegedDevices(t *testing.T) { testPid := uint32(1234) c := newTestCRIService()