*: should align pipe's owner with init process

The containerd-shim creates pipes and passes them to the init container as
stdin, stdout, and stderr for logging purposes. By default, these pipes are
owned by the root user (UID/GID: 0/0). The init container can access them
directly through inheritance.

However, if the init container attempts to open any files pointing to these
pipes (e.g., /proc/1/fd/2, /dev/stderr), it will encounter a permission issue
since it is not the owner. To avoid this, we need to align the ownership of
the pipes with the init process.

Fixes: #10598

Signed-off-by: Wei Fu <fuweid89@gmail.com>
This commit is contained in:
Wei Fu 2024-10-28 19:00:53 +00:00 committed by k8s-infra-cherrypick-robot
parent 6e51f71621
commit cf07f28ee2
6 changed files with 245 additions and 0 deletions

View File

@ -38,6 +38,7 @@ type ImageList struct {
VolumeOwnership string
ArgsEscaped string
DockerSchema1 string
Nginx string
}
var (
@ -57,6 +58,7 @@ func initImages(imageListFile string) {
VolumeOwnership: "ghcr.io/containerd/volume-ownership:2.1",
ArgsEscaped: "cplatpublic.azurecr.io/args-escaped-test-image-ns:1.0",
DockerSchema1: "registry.k8s.io/busybox@sha256:4bdd623e848417d96127e16037743f0cd8b528c026e9175e22a84f639eca58ff",
Nginx: "ghcr.io/containerd/nginx:1.27.0",
}
if imageListFile != "" {
@ -96,6 +98,8 @@ const (
ArgsEscaped
// DockerSchema1 image with docker schema 1
DockerSchema1
// Nginx image
Nginx
)
func initImageMap(imageList ImageList) map[int]string {
@ -108,6 +112,7 @@ func initImageMap(imageList ImageList) map[int]string {
images[VolumeOwnership] = imageList.VolumeOwnership
images[ArgsEscaped] = imageList.ArgsEscaped
images[DockerSchema1] = imageList.DockerSchema1
images[Nginx] = imageList.Nginx
return images
}

View File

@ -471,6 +471,24 @@ func WithDevice(containerPath, hostPath, permissions string) ContainerOpts {
}
}
// WithSELinuxOptions allows to set SELinux option for container.
func WithSELinuxOptions(user, role, typ, level string) ContainerOpts {
return func(c *runtime.ContainerConfig) {
if c.Linux == nil {
c.Linux = &runtime.LinuxContainerConfig{}
}
if c.Linux.SecurityContext == nil {
c.Linux.SecurityContext = &runtime.LinuxContainerSecurityContext{}
}
c.Linux.SecurityContext.SelinuxOptions = &runtime.SELinuxOption{
User: user,
Role: role,
Type: typ,
Level: level,
}
}
}
// ContainerConfig creates a container config given a name and image name
// and additional container config options
func ContainerConfig(name, image string, opts ...ContainerOpts) *runtime.ContainerConfig {

View File

@ -304,6 +304,125 @@ func TestPodUserNS(t *testing.T) {
}
}
// TestIssue10598 tests a case[1] that init processes in container should be able
// to open /dev/stdout or /dev/stderr if init processes are running in their
// user namespace instead of root user.
//
// The shim server creates pipe for init processes' standard output. By default,
// the owner of pipe is the same to shim server (root user). Let's say, the init
// process is running with uid=1000/gid=1000 user. Init processes inherits the
// pipe created by shim server so that it can just write data into that pipe.
// However, if that init process tries to open /dev/stderr, the kernel will
// return no permission error.
//
// The following output is from retsnoop[2].
//
// → do_open
// → inode_permission
// → generic_permission
// ↔ make_vfsuid [0] 0.500us
// ↔ make_vfsuid [0] 6.501us
// ↔ from_kuid [0xffffffff] 0.700us
// ← generic_permission [-EACCES] 13.501us
//
// Since uid_map/gid_map doesn't cover uid=0/gid=0, the kernel can't convert
// uid=0 into valid uid in that uid_map. So, `from_kuid` returns invalid uid
// value and then `do_open` returns EACCES error.
//
// [1]: https://github.com/containerd/containerd/issues/10598
// [2]: https://github.com/anakryiko/retsnoop
func TestIssue10598(t *testing.T) {
if !supportsUserNS() {
t.Skip("User namespaces are not supported")
}
if !supportsIDMap(defaultRoot) {
t.Skipf("ID mappings are not supported on: %v", defaultRoot)
}
if err := supportsRuncIDMap(); err != nil {
t.Skipf("OCI runtime doesn't support idmap mounts: %v", err)
}
testPodLogDir := t.TempDir()
containerID := uint32(0)
hostID := uint32(65536)
size := uint32(65536)
t.Log("Create a sandbox with userns")
sandboxOpts := []PodSandboxOpts{
WithPodUserNs(containerID, hostID, size),
WithPodLogDirectory(testPodLogDir),
}
sbConfig := PodSandboxConfig("issue10598", "userns", sandboxOpts...)
sb, err := runtimeService.RunPodSandbox(sbConfig, *runtimeHandler)
require.NoError(t, err)
// Make sure the sandbox is cleaned up.
defer func() {
assert.NoError(t, runtimeService.StopPodSandbox(sb))
assert.NoError(t, runtimeService.RemovePodSandbox(sb))
}()
t.Log("Create a container for userns")
containerName := "nginx-userns"
testImage := images.Get(images.Nginx)
EnsureImageExists(t, testImage)
containerOpts := []ContainerOpts{
WithUserNamespace(containerID, hostID, size),
WithLogPath(containerName),
// The SELinux policy enforced by container-selinux prevents
// NGINX from opening the /proc/self/fd/2 pipe. This scenario
// is not intended to verify SELinux behavior in the user namespace
// but rather to confirm the ownership of the standard output
// file descriptor. The following option demonstrates how to
// disable the restrictive SELinux rule for the NGINX process.
WithSELinuxOptions(
"unconfined_u",
"unconfined_r",
"container_runtime_t",
"s0",
),
}
cnConfig := ContainerConfig(
containerName,
testImage,
containerOpts...,
)
cn, err := runtimeService.CreateContainer(sb, cnConfig, sbConfig)
require.NoError(t, err)
t.Log("Start the container")
require.NoError(t, runtimeService.StartContainer(cn))
t.Log("Wait for container to start")
require.NoError(t, Eventually(func() (bool, error) {
content, err := os.ReadFile(filepath.Join(testPodLogDir, containerName))
if err != nil {
return false, err
}
s, err := runtimeService.ContainerStatus(cn)
if err != nil {
return false, err
}
if state := s.GetState(); state != runtime.ContainerState_CONTAINER_RUNNING {
return false, fmt.Errorf("%s is not running\nstate: %s\nlog: %s",
containerName, state, string(content))
}
started := strings.Contains(string(content), "start worker processes")
if started {
t.Log(string(content))
}
return started, nil
}, time.Second, 30*time.Second))
}
func supportsRuncIDMap() error {
var r runc.Runc
features, err := r.Features(context.Background())

View File

@ -129,6 +129,12 @@ func (c *criService) StartContainer(ctx context.Context, r *runtime.StartContain
containerd.WithTaskAPIEndpoint(endpoint.Address, endpoint.Version))
}
ioOwnerTaskOpts, err := updateContainerIOOwner(ctx, container, config)
if err != nil {
return nil, fmt.Errorf("failed to update container IO owner: %w", err)
}
taskOpts = append(taskOpts, ioOwnerTaskOpts...)
task, err := container.NewTask(ctx, ioCreation, taskOpts...)
if err != nil {
return nil, fmt.Errorf("failed to create containerd task: %w", err)

View File

@ -0,0 +1,66 @@
/*
Copyright The containerd Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package server
import (
"context"
"fmt"
containerd "github.com/containerd/containerd/v2/client"
"github.com/containerd/containerd/v2/internal/userns"
runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
)
// updateContainerIOOwner updates I/O files' owner to align with initial processe's UID/GID.
func updateContainerIOOwner(ctx context.Context, cntr containerd.Container, config *runtime.ContainerConfig) ([]containerd.NewTaskOpts, error) {
if config.GetLinux() == nil {
return nil, nil
}
// FIXME(fuweid): Ideally, the pipe owner should be aligned with process owner.
// No matter what user namespace container uses, it should work well. However,
// it breaks the sig-node conformance case - [when querying /stats/summary should report resource usage through the stats api].
// In order to keep compatible, the change should apply to user namespace only.
if config.GetLinux().GetSecurityContext().GetNamespaceOptions().GetUsernsOptions() == nil {
return nil, nil
}
spec, err := cntr.Spec(ctx)
if err != nil {
return nil, fmt.Errorf("failed to get spec: %w", err)
}
if spec.Linux == nil || spec.Process == nil {
return nil, fmt.Errorf("invalid linux platform oci runtime spec")
}
hostID, err := userns.IDMap{
UidMap: spec.Linux.UIDMappings,
GidMap: spec.Linux.GIDMappings,
}.ToHost(userns.User{
Uid: spec.Process.User.UID,
Gid: spec.Process.User.GID,
})
if err != nil {
return nil, fmt.Errorf("failed to do idmap to get host ID: %w", err)
}
return []containerd.NewTaskOpts{
containerd.WithUIDOwner(hostID.Uid),
containerd.WithGIDOwner(hostID.Gid),
}, nil
}

View File

@ -0,0 +1,31 @@
//go:build !linux
/*
Copyright The containerd Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package server
import (
"context"
containerd "github.com/containerd/containerd/v2/client"
runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
)
// updateContainerIOOwner updates I/O files' owner to align with initial processe's UID/GID.
func updateContainerIOOwner(ctx context.Context, cntr containerd.Container, config *runtime.ContainerConfig) ([]containerd.NewTaskOpts, error) {
return nil, nil
}