cri: Devices ownership from SecurityContext

CRI container runtimes mount devices (set via kubernetes device plugins)
to containers by taking the host user/group IDs (uid/gid) to the
corresponding container device.

This triggers a problem when trying to run those containers with
non-zero (root uid/gid = 0) uid/gid set via runAsUser/runAsGroup:
the container process has no permission to use the device even when
its gid is permissive to non-root users because the container user
does not belong to that group.

It is possible to workaround the problem by manually adding the device
gid(s) to supplementalGroups. However, this is also problematic because
the device gid(s) may have different values depending on the workers'
distro/version in the cluster.

This patch suggests to take RunAsUser/RunAsGroup set via SecurityContext
as the device UID/GID, respectively. The feature must be enabled by
setting device_ownership_from_security_context runtime config value to
true (valid on Linux only).

Signed-off-by: Mikko Ylinen <mikko.ylinen@intel.com>
This commit is contained in:
Mikko Ylinen 2021-03-09 09:33:03 +02:00
parent af1a0908d0
commit e0f8c04dad
5 changed files with 115 additions and 4 deletions

View File

@ -37,7 +37,7 @@ func WithHostDevices(_ context.Context, _ Client, _ *containers.Container, s *Sp
return nil
}
// WithDevices recursively adds devices from the passed in path and associated cgroup rules for that device.
// WithDevices recursively adds devices from the passed in path.
// If devicePath is a dir it traverses the dir to add all devices in that dir.
// If devicePath is not a dir, it attempts to add the single device.
func WithDevices(devicePath, containerPath, permissions string) SpecOpts {

View File

@ -258,6 +258,9 @@ type PluginConfig struct {
// present in /sys/fs/cgroup/cgroup.controllers.
// This helps with running rootless mode + cgroup v2 + systemd but without hugetlb delegation.
DisableHugetlbController bool `toml:"disable_hugetlb_controller" json:"disableHugetlbController"`
// DeviceOwnershipFromSecurityContext changes the default behavior of setting container devices uid/gid
// from CRI's SecurityContext (RunAsUser/RunAsGroup) instead of taking host's uid/gid. Defaults to false.
DeviceOwnershipFromSecurityContext bool `toml:"device_ownership_from_security_context" json:"device_ownership_from_security_context"`
// IgnoreImageDefinedVolumes ignores volumes defined by the image. Useful for better resource
// isolation, security and early detection of issues in the mount configuration when using
// ReadOnlyRootFilesystem since containers won't silently mount a temporary volume.

View File

@ -285,8 +285,30 @@ func ensureSharedOrSlave(path string, lookupMount func(string) (mount.Info, erro
return errors.Errorf("path %q is mounted on %q but it is not a shared or slave mount", path, mountInfo.Mountpoint)
}
// getDeviceUserGroupID() is used to find the right uid/gid
// value for the device node created in the container namespace.
// The runtime executes mknod() and chmod()s the created
// device with the values returned here.
//
// On Linux, uid and gid are sufficient and the user/groupname do not
// need to be resolved.
//
// TODO(mythi): In case of user namespaces, the runtime simply bind
// mounts the devices from the host. Additional logic is needed
// to check that the runtimes effective UID/GID on the host has the
// permissions to access the device node and/or the right user namespace
// mappings are created.
//
// Ref: https://github.com/kubernetes/kubernetes/issues/92211
func getDeviceUserGroupID(runAsVal *runtime.Int64Value) uint32 {
if runAsVal != nil {
return uint32(runAsVal.GetValue())
}
return 0
}
// WithDevices sets the provided devices onto the container spec
func WithDevices(osi osinterface.OS, config *runtime.ContainerConfig) oci.SpecOpts {
func WithDevices(osi osinterface.OS, config *runtime.ContainerConfig, enableDeviceOwnershipFromSecurityContext bool) oci.SpecOpts {
return func(ctx context.Context, client oci.Client, c *containers.Container, s *runtimespec.Spec) (err error) {
if s.Linux == nil {
s.Linux = &runtimespec.Linux{}
@ -295,6 +317,8 @@ func WithDevices(osi osinterface.OS, config *runtime.ContainerConfig) oci.SpecOp
s.Linux.Resources = &runtimespec.LinuxResources{}
}
oldDevices := len(s.Linux.Devices)
for _, device := range config.GetDevices() {
path, err := osi.ResolveSymbolicLink(device.HostPath)
if err != nil {
@ -306,6 +330,24 @@ func WithDevices(osi osinterface.OS, config *runtime.ContainerConfig) oci.SpecOp
return err
}
}
if enableDeviceOwnershipFromSecurityContext {
UID := getDeviceUserGroupID(config.GetLinux().GetSecurityContext().GetRunAsUser())
GID := getDeviceUserGroupID(config.GetLinux().GetSecurityContext().GetRunAsGroup())
// Loop all new devices added by oci.WithDevices() to update their
// dev.UID/dev.GID.
//
// non-zero UID/GID from SecurityContext is used to override host's
// device UID/GID for the container.
for idx := oldDevices; idx < len(s.Linux.Devices); idx++ {
if UID != 0 {
*s.Linux.Devices[idx].UID = UID
}
if GID != 0 {
*s.Linux.Devices[idx].GID = GID
}
}
}
return nil
}
}

View File

@ -222,11 +222,11 @@ func (c *criService) containerSpec(
specOpts = append(specOpts, oci.WithHostDevices, oci.WithAllDevicesAllowed)
} else {
// add requested devices by the config as host devices are not automatically added
specOpts = append(specOpts, customopts.WithDevices(c.os, config),
specOpts = append(specOpts, customopts.WithDevices(c.os, config, c.config.DeviceOwnershipFromSecurityContext),
customopts.WithCapabilities(securityContext, c.allCaps))
}
} else { // not privileged
specOpts = append(specOpts, customopts.WithDevices(c.os, config),
specOpts = append(specOpts, customopts.WithDevices(c.os, config, c.config.DeviceOwnershipFromSecurityContext),
customopts.WithCapabilities(securityContext, c.allCaps))
}

View File

@ -1304,6 +1304,72 @@ func TestGenerateUserString(t *testing.T) {
}
}
func TestNonRootUserAndDevices(t *testing.T) {
testPid := uint32(1234)
c := newTestCRIService()
testSandboxID := "sandbox-id"
testContainerName := "container-name"
containerConfig, sandboxConfig, imageConfig, _ := getCreateContainerTestData()
hostDevicesRaw, err := oci.HostDevices()
assert.NoError(t, err)
testDevice := hostDevicesRaw[0]
for desc, test := range map[string]struct {
uid, gid *runtime.Int64Value
deviceOwnershipFromSecurityContext bool
expectedDeviceUID uint32
expectedDeviceGID uint32
}{
"expect non-root container's Devices Uid/Gid to be the same as the device Uid/Gid on the host when deviceOwnershipFromSecurityContext is disabled": {
uid: &runtime.Int64Value{Value: 1},
gid: &runtime.Int64Value{Value: 10},
expectedDeviceUID: *testDevice.UID,
expectedDeviceGID: *testDevice.GID,
},
"expect root container's Devices Uid/Gid to be the same as the device Uid/Gid on the host when deviceOwnershipFromSecurityContext is disabled": {
uid: &runtime.Int64Value{Value: 0},
gid: &runtime.Int64Value{Value: 0},
expectedDeviceUID: *testDevice.UID,
expectedDeviceGID: *testDevice.GID,
},
"expect non-root container's Devices Uid/Gid to be the same as RunAsUser/RunAsGroup when deviceOwnershipFromSecurityContext is enabled": {
uid: &runtime.Int64Value{Value: 1},
gid: &runtime.Int64Value{Value: 10},
deviceOwnershipFromSecurityContext: true,
expectedDeviceUID: 1,
expectedDeviceGID: 10,
},
"expect root container's Devices Uid/Gid to be the same as the device Uid/Gid on the host when deviceOwnershipFromSecurityContext is enabled": {
uid: &runtime.Int64Value{Value: 0},
gid: &runtime.Int64Value{Value: 0},
deviceOwnershipFromSecurityContext: true,
expectedDeviceUID: *testDevice.UID,
expectedDeviceGID: *testDevice.GID,
},
} {
t.Logf("TestCase %q", desc)
c.config.DeviceOwnershipFromSecurityContext = test.deviceOwnershipFromSecurityContext
containerConfig.Linux.SecurityContext.RunAsUser = test.uid
containerConfig.Linux.SecurityContext.RunAsGroup = test.gid
containerConfig.Devices = []*runtime.Device{
{
ContainerPath: testDevice.Path,
HostPath: testDevice.Path,
Permissions: "r",
},
}
spec, err := c.containerSpec(t.Name(), testSandboxID, testPid, "", testContainerName, testImageName, containerConfig, sandboxConfig, imageConfig, nil, config.Runtime{})
assert.NoError(t, err)
assert.Equal(t, test.expectedDeviceUID, *spec.Linux.Devices[0].UID)
assert.Equal(t, test.expectedDeviceGID, *spec.Linux.Devices[0].GID)
}
}
func TestPrivilegedDevices(t *testing.T) {
testPid := uint32(1234)
c := newTestCRIService()