diff --git a/oci/spec_opts_unix.go b/oci/spec_opts_unix.go index 609684a75..9d03091aa 100644 --- a/oci/spec_opts_unix.go +++ b/oci/spec_opts_unix.go @@ -37,7 +37,7 @@ func WithHostDevices(_ context.Context, _ Client, _ *containers.Container, s *Sp return nil } -// WithDevices recursively adds devices from the passed in path and associated cgroup rules for that device. +// WithDevices recursively adds devices from the passed in path. // If devicePath is a dir it traverses the dir to add all devices in that dir. // If devicePath is not a dir, it attempts to add the single device. func WithDevices(devicePath, containerPath, permissions string) SpecOpts { diff --git a/pkg/cri/config/config.go b/pkg/cri/config/config.go index b1c04add3..286738e3c 100644 --- a/pkg/cri/config/config.go +++ b/pkg/cri/config/config.go @@ -258,6 +258,9 @@ type PluginConfig struct { // present in /sys/fs/cgroup/cgroup.controllers. // This helps with running rootless mode + cgroup v2 + systemd but without hugetlb delegation. DisableHugetlbController bool `toml:"disable_hugetlb_controller" json:"disableHugetlbController"` + // DeviceOwnershipFromSecurityContext changes the default behavior of setting container devices uid/gid + // from CRI's SecurityContext (RunAsUser/RunAsGroup) instead of taking host's uid/gid. Defaults to false. + DeviceOwnershipFromSecurityContext bool `toml:"device_ownership_from_security_context" json:"device_ownership_from_security_context"` // IgnoreImageDefinedVolumes ignores volumes defined by the image. Useful for better resource // isolation, security and early detection of issues in the mount configuration when using // ReadOnlyRootFilesystem since containers won't silently mount a temporary volume. diff --git a/pkg/cri/opts/spec_linux.go b/pkg/cri/opts/spec_linux.go index bc30d0caa..995a36669 100644 --- a/pkg/cri/opts/spec_linux.go +++ b/pkg/cri/opts/spec_linux.go @@ -285,8 +285,30 @@ func ensureSharedOrSlave(path string, lookupMount func(string) (mount.Info, erro return errors.Errorf("path %q is mounted on %q but it is not a shared or slave mount", path, mountInfo.Mountpoint) } +// getDeviceUserGroupID() is used to find the right uid/gid +// value for the device node created in the container namespace. +// The runtime executes mknod() and chmod()s the created +// device with the values returned here. +// +// On Linux, uid and gid are sufficient and the user/groupname do not +// need to be resolved. +// +// TODO(mythi): In case of user namespaces, the runtime simply bind +// mounts the devices from the host. Additional logic is needed +// to check that the runtimes effective UID/GID on the host has the +// permissions to access the device node and/or the right user namespace +// mappings are created. +// +// Ref: https://github.com/kubernetes/kubernetes/issues/92211 +func getDeviceUserGroupID(runAsVal *runtime.Int64Value) uint32 { + if runAsVal != nil { + return uint32(runAsVal.GetValue()) + } + return 0 +} + // WithDevices sets the provided devices onto the container spec -func WithDevices(osi osinterface.OS, config *runtime.ContainerConfig) oci.SpecOpts { +func WithDevices(osi osinterface.OS, config *runtime.ContainerConfig, enableDeviceOwnershipFromSecurityContext bool) oci.SpecOpts { return func(ctx context.Context, client oci.Client, c *containers.Container, s *runtimespec.Spec) (err error) { if s.Linux == nil { s.Linux = &runtimespec.Linux{} @@ -295,6 +317,8 @@ func WithDevices(osi osinterface.OS, config *runtime.ContainerConfig) oci.SpecOp s.Linux.Resources = &runtimespec.LinuxResources{} } + oldDevices := len(s.Linux.Devices) + for _, device := range config.GetDevices() { path, err := osi.ResolveSymbolicLink(device.HostPath) if err != nil { @@ -306,6 +330,24 @@ func WithDevices(osi osinterface.OS, config *runtime.ContainerConfig) oci.SpecOp return err } } + + if enableDeviceOwnershipFromSecurityContext { + UID := getDeviceUserGroupID(config.GetLinux().GetSecurityContext().GetRunAsUser()) + GID := getDeviceUserGroupID(config.GetLinux().GetSecurityContext().GetRunAsGroup()) + // Loop all new devices added by oci.WithDevices() to update their + // dev.UID/dev.GID. + // + // non-zero UID/GID from SecurityContext is used to override host's + // device UID/GID for the container. + for idx := oldDevices; idx < len(s.Linux.Devices); idx++ { + if UID != 0 { + *s.Linux.Devices[idx].UID = UID + } + if GID != 0 { + *s.Linux.Devices[idx].GID = GID + } + } + } return nil } } diff --git a/pkg/cri/server/container_create_linux.go b/pkg/cri/server/container_create_linux.go index d662c8add..d1168d8f5 100644 --- a/pkg/cri/server/container_create_linux.go +++ b/pkg/cri/server/container_create_linux.go @@ -222,11 +222,11 @@ func (c *criService) containerSpec( specOpts = append(specOpts, oci.WithHostDevices, oci.WithAllDevicesAllowed) } else { // add requested devices by the config as host devices are not automatically added - specOpts = append(specOpts, customopts.WithDevices(c.os, config), + specOpts = append(specOpts, customopts.WithDevices(c.os, config, c.config.DeviceOwnershipFromSecurityContext), customopts.WithCapabilities(securityContext, c.allCaps)) } } else { // not privileged - specOpts = append(specOpts, customopts.WithDevices(c.os, config), + specOpts = append(specOpts, customopts.WithDevices(c.os, config, c.config.DeviceOwnershipFromSecurityContext), customopts.WithCapabilities(securityContext, c.allCaps)) } diff --git a/pkg/cri/server/container_create_linux_test.go b/pkg/cri/server/container_create_linux_test.go index 43af0aae6..4cdc9bbdb 100644 --- a/pkg/cri/server/container_create_linux_test.go +++ b/pkg/cri/server/container_create_linux_test.go @@ -1304,6 +1304,72 @@ func TestGenerateUserString(t *testing.T) { } } +func TestNonRootUserAndDevices(t *testing.T) { + testPid := uint32(1234) + c := newTestCRIService() + testSandboxID := "sandbox-id" + testContainerName := "container-name" + containerConfig, sandboxConfig, imageConfig, _ := getCreateContainerTestData() + + hostDevicesRaw, err := oci.HostDevices() + assert.NoError(t, err) + + testDevice := hostDevicesRaw[0] + + for desc, test := range map[string]struct { + uid, gid *runtime.Int64Value + deviceOwnershipFromSecurityContext bool + expectedDeviceUID uint32 + expectedDeviceGID uint32 + }{ + "expect non-root container's Devices Uid/Gid to be the same as the device Uid/Gid on the host when deviceOwnershipFromSecurityContext is disabled": { + uid: &runtime.Int64Value{Value: 1}, + gid: &runtime.Int64Value{Value: 10}, + expectedDeviceUID: *testDevice.UID, + expectedDeviceGID: *testDevice.GID, + }, + "expect root container's Devices Uid/Gid to be the same as the device Uid/Gid on the host when deviceOwnershipFromSecurityContext is disabled": { + uid: &runtime.Int64Value{Value: 0}, + gid: &runtime.Int64Value{Value: 0}, + expectedDeviceUID: *testDevice.UID, + expectedDeviceGID: *testDevice.GID, + }, + "expect non-root container's Devices Uid/Gid to be the same as RunAsUser/RunAsGroup when deviceOwnershipFromSecurityContext is enabled": { + uid: &runtime.Int64Value{Value: 1}, + gid: &runtime.Int64Value{Value: 10}, + deviceOwnershipFromSecurityContext: true, + expectedDeviceUID: 1, + expectedDeviceGID: 10, + }, + "expect root container's Devices Uid/Gid to be the same as the device Uid/Gid on the host when deviceOwnershipFromSecurityContext is enabled": { + uid: &runtime.Int64Value{Value: 0}, + gid: &runtime.Int64Value{Value: 0}, + deviceOwnershipFromSecurityContext: true, + expectedDeviceUID: *testDevice.UID, + expectedDeviceGID: *testDevice.GID, + }, + } { + t.Logf("TestCase %q", desc) + + c.config.DeviceOwnershipFromSecurityContext = test.deviceOwnershipFromSecurityContext + containerConfig.Linux.SecurityContext.RunAsUser = test.uid + containerConfig.Linux.SecurityContext.RunAsGroup = test.gid + containerConfig.Devices = []*runtime.Device{ + { + ContainerPath: testDevice.Path, + HostPath: testDevice.Path, + Permissions: "r", + }, + } + + spec, err := c.containerSpec(t.Name(), testSandboxID, testPid, "", testContainerName, testImageName, containerConfig, sandboxConfig, imageConfig, nil, config.Runtime{}) + assert.NoError(t, err) + + assert.Equal(t, test.expectedDeviceUID, *spec.Linux.Devices[0].UID) + assert.Equal(t, test.expectedDeviceGID, *spec.Linux.Devices[0].GID) + } +} + func TestPrivilegedDevices(t *testing.T) { testPid := uint32(1234) c := newTestCRIService()