*: should align pipe's owner with init process

The containerd-shim creates pipes and passes them to the init container as stdin, stdout, and stderr for logging purposes. By default, these pipes are owned by the root user (UID/GID: 0/0). The init container can access them directly through inheritance. However, if the init container attempts to open any files pointing to these pipes (e.g., /proc/1/fd/2, /dev/stderr), it will encounter a permission issue since it is not the owner. To avoid this, we need to align the ownership of the pipes with the init process. Fixes: #10598 Signed-off-by: Wei Fu <fuweid89@gmail.com>
2024-10-28 19:00:53 +00:00 · 2024-10-28 19:00:53 +00:00 · cf07f28ee2
commit cf07f28ee2
parent 6e51f71621
6 changed files with 245 additions and 0 deletions
--- a/integration/images/image_list.go
+++ b/integration/images/image_list.go
@ -38,6 +38,7 @@ type ImageList struct {
 	VolumeOwnership  string
 	ArgsEscaped      string
 	DockerSchema1    string
+	Nginx            string
 }

 var (
@ -57,6 +58,7 @@ func initImages(imageListFile string) {
 		VolumeOwnership:  "ghcr.io/containerd/volume-ownership:2.1",
 		ArgsEscaped:      "cplatpublic.azurecr.io/args-escaped-test-image-ns:1.0",
 		DockerSchema1:    "registry.k8s.io/busybox@sha256:4bdd623e848417d96127e16037743f0cd8b528c026e9175e22a84f639eca58ff",
+		Nginx:            "ghcr.io/containerd/nginx:1.27.0",
 	}

 	if imageListFile != "" {
@ -96,6 +98,8 @@ const (
 	ArgsEscaped
 	// DockerSchema1 image with docker schema 1
 	DockerSchema1
+	// Nginx image
+	Nginx
 )

 func initImageMap(imageList ImageList) map[int]string {
@ -108,6 +112,7 @@ func initImageMap(imageList ImageList) map[int]string {
 	images[VolumeOwnership] = imageList.VolumeOwnership
 	images[ArgsEscaped] = imageList.ArgsEscaped
 	images[DockerSchema1] = imageList.DockerSchema1
+	images[Nginx] = imageList.Nginx
 	return images
 }

--- a/integration/main_test.go
+++ b/integration/main_test.go
@ -471,6 +471,24 @@ func WithDevice(containerPath, hostPath, permissions string) ContainerOpts {
 	}
 }

+// WithSELinuxOptions allows to set SELinux option for container.
+func WithSELinuxOptions(user, role, typ, level string) ContainerOpts {
+	return func(c *runtime.ContainerConfig) {
+		if c.Linux == nil {
+			c.Linux = &runtime.LinuxContainerConfig{}
+		}
+		if c.Linux.SecurityContext == nil {
+			c.Linux.SecurityContext = &runtime.LinuxContainerSecurityContext{}
+		}
+		c.Linux.SecurityContext.SelinuxOptions = &runtime.SELinuxOption{
+			User:  user,
+			Role:  role,
+			Type:  typ,
+			Level: level,
+		}
+	}
+}
+
 // ContainerConfig creates a container config given a name and image name
 // and additional container config options
 func ContainerConfig(name, image string, opts ...ContainerOpts) *runtime.ContainerConfig {
--- a/integration/pod_userns_linux_test.go
+++ b/integration/pod_userns_linux_test.go
@ -304,6 +304,125 @@ func TestPodUserNS(t *testing.T) {
 	}
 }

+// TestIssue10598 tests a case[1] that init processes in container should be able
+// to open /dev/stdout or /dev/stderr if init processes are running in their
+// user namespace instead of root user.
+//
+// The shim server creates pipe for init processes' standard output. By default,
+// the owner of pipe is the same to shim server (root user). Let's say, the init
+// process is running with uid=1000/gid=1000 user. Init processes inherits the
+// pipe created by shim server so that it can just write data into that pipe.
+// However, if that init process tries to open /dev/stderr, the kernel will
+// return no permission error.
+//
+// The following output is from retsnoop[2].
+//
+//	→ do_open
+//	         → inode_permission
+//	             → generic_permission
+//	                 ↔ make_vfsuid      [0]                     0.500us
+//	                 ↔ make_vfsuid      [0]                     6.501us
+//	                 ↔ from_kuid        [0xffffffff]            0.700us
+//	             ← generic_permission   [-EACCES]              13.501us
+//
+// Since uid_map/gid_map doesn't cover uid=0/gid=0, the kernel can't convert
+// uid=0 into valid uid in that uid_map. So, `from_kuid` returns invalid uid
+// value and then `do_open` returns EACCES error.
+//
+// [1]: https://github.com/containerd/containerd/issues/10598
+// [2]: https://github.com/anakryiko/retsnoop
+func TestIssue10598(t *testing.T) {
+	if !supportsUserNS() {
+		t.Skip("User namespaces are not supported")
+	}
+	if !supportsIDMap(defaultRoot) {
+		t.Skipf("ID mappings are not supported on: %v", defaultRoot)
+	}
+	if err := supportsRuncIDMap(); err != nil {
+		t.Skipf("OCI runtime doesn't support idmap mounts: %v", err)
+	}
+
+	testPodLogDir := t.TempDir()
+
+	containerID := uint32(0)
+	hostID := uint32(65536)
+	size := uint32(65536)
+
+	t.Log("Create a sandbox with userns")
+	sandboxOpts := []PodSandboxOpts{
+		WithPodUserNs(containerID, hostID, size),
+		WithPodLogDirectory(testPodLogDir),
+	}
+	sbConfig := PodSandboxConfig("issue10598", "userns", sandboxOpts...)
+	sb, err := runtimeService.RunPodSandbox(sbConfig, *runtimeHandler)
+	require.NoError(t, err)
+
+	// Make sure the sandbox is cleaned up.
+	defer func() {
+		assert.NoError(t, runtimeService.StopPodSandbox(sb))
+		assert.NoError(t, runtimeService.RemovePodSandbox(sb))
+	}()
+
+	t.Log("Create a container for userns")
+
+	containerName := "nginx-userns"
+	testImage := images.Get(images.Nginx)
+
+	EnsureImageExists(t, testImage)
+
+	containerOpts := []ContainerOpts{
+		WithUserNamespace(containerID, hostID, size),
+		WithLogPath(containerName),
+		// The SELinux policy enforced by container-selinux prevents
+		// NGINX from opening the /proc/self/fd/2 pipe. This scenario
+		// is not intended to verify SELinux behavior in the user namespace
+		// but rather to confirm the ownership of the standard output
+		// file descriptor. The following option demonstrates how to
+		// disable the restrictive SELinux rule for the NGINX process.
+		WithSELinuxOptions(
+			"unconfined_u",
+			"unconfined_r",
+			"container_runtime_t",
+			"s0",
+		),
+	}
+
+	cnConfig := ContainerConfig(
+		containerName,
+		testImage,
+		containerOpts...,
+	)
+	cn, err := runtimeService.CreateContainer(sb, cnConfig, sbConfig)
+	require.NoError(t, err)
+
+	t.Log("Start the container")
+	require.NoError(t, runtimeService.StartContainer(cn))
+
+	t.Log("Wait for container to start")
+	require.NoError(t, Eventually(func() (bool, error) {
+		content, err := os.ReadFile(filepath.Join(testPodLogDir, containerName))
+		if err != nil {
+			return false, err
+		}
+
+		s, err := runtimeService.ContainerStatus(cn)
+		if err != nil {
+			return false, err
+		}
+
+		if state := s.GetState(); state != runtime.ContainerState_CONTAINER_RUNNING {
+			return false, fmt.Errorf("%s is not running\nstate: %s\nlog: %s",
+				containerName, state, string(content))
+		}
+
+		started := strings.Contains(string(content), "start worker processes")
+		if started {
+			t.Log(string(content))
+		}
+		return started, nil
+	}, time.Second, 30*time.Second))
+}
+
 func supportsRuncIDMap() error {
 	var r runc.Runc
 	features, err := r.Features(context.Background())
--- a/internal/cri/server/container_start.go
+++ b/internal/cri/server/container_start.go
@ -129,6 +129,12 @@ func (c *criService) StartContainer(ctx context.Context, r *runtime.StartContain
 			containerd.WithTaskAPIEndpoint(endpoint.Address, endpoint.Version))
 	}

+	ioOwnerTaskOpts, err := updateContainerIOOwner(ctx, container, config)
+	if err != nil {
+		return nil, fmt.Errorf("failed to update container IO owner: %w", err)
+	}
+	taskOpts = append(taskOpts, ioOwnerTaskOpts...)
+
 	task, err := container.NewTask(ctx, ioCreation, taskOpts...)
 	if err != nil {
 		return nil, fmt.Errorf("failed to create containerd task: %w", err)
--- a/internal/cri/server/container_start_linux.go
+++ b/internal/cri/server/container_start_linux.go
@ -0,0 +1,66 @@
+/*
+   Copyright The containerd Authors.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+*/
+
+package server
+
+import (
+	"context"
+	"fmt"
+
+	containerd "github.com/containerd/containerd/v2/client"
+	"github.com/containerd/containerd/v2/internal/userns"
+	runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
+)
+
+// updateContainerIOOwner updates I/O files' owner to align with initial processe's UID/GID.
+func updateContainerIOOwner(ctx context.Context, cntr containerd.Container, config *runtime.ContainerConfig) ([]containerd.NewTaskOpts, error) {
+	if config.GetLinux() == nil {
+		return nil, nil
+	}
+
+	// FIXME(fuweid): Ideally, the pipe owner should be aligned with process owner.
+	// No matter what user namespace container uses, it should work well. However,
+	// it breaks the sig-node conformance case - [when querying /stats/summary should report resource usage through the stats api].
+	// In order to keep compatible, the change should apply to user namespace only.
+	if config.GetLinux().GetSecurityContext().GetNamespaceOptions().GetUsernsOptions() == nil {
+		return nil, nil
+	}
+
+	spec, err := cntr.Spec(ctx)
+	if err != nil {
+		return nil, fmt.Errorf("failed to get spec: %w", err)
+	}
+
+	if spec.Linux == nil || spec.Process == nil {
+		return nil, fmt.Errorf("invalid linux platform oci runtime spec")
+	}
+
+	hostID, err := userns.IDMap{
+		UidMap: spec.Linux.UIDMappings,
+		GidMap: spec.Linux.GIDMappings,
+	}.ToHost(userns.User{
+		Uid: spec.Process.User.UID,
+		Gid: spec.Process.User.GID,
+	})
+	if err != nil {
+		return nil, fmt.Errorf("failed to do idmap to get host ID: %w", err)
+	}
+
+	return []containerd.NewTaskOpts{
+		containerd.WithUIDOwner(hostID.Uid),
+		containerd.WithGIDOwner(hostID.Gid),
+	}, nil
+}
--- a/internal/cri/server/container_start_other.go
+++ b/internal/cri/server/container_start_other.go
@ -0,0 +1,31 @@
+//go:build !linux
+
+/*
+   Copyright The containerd Authors.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+*/
+
+package server
+
+import (
+	"context"
+
+	containerd "github.com/containerd/containerd/v2/client"
+	runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
+)
+
+// updateContainerIOOwner updates I/O files' owner to align with initial processe's UID/GID.
+func updateContainerIOOwner(ctx context.Context, cntr containerd.Container, config *runtime.ContainerConfig) ([]containerd.NewTaskOpts, error) {
+	return nil, nil
+}