The containerd-shim creates pipes and passes them to the init container as stdin, stdout, and stderr for logging purposes. By default, these pipes are owned by the root user (UID/GID: 0/0). The init container can access them directly through inheritance. However, if the init container attempts to open any files pointing to these pipes (e.g., /proc/1/fd/2, /dev/stderr), it will encounter a permission issue since it is not the owner. To avoid this, we need to align the ownership of the pipes with the init process. Fixes: #10598 Signed-off-by: Wei Fu <fuweid89@gmail.com>
		
			
				
	
	
		
			444 lines
		
	
	
		
			13 KiB
		
	
	
	
		
			Go
		
	
	
	
	
	
			
		
		
	
	
			444 lines
		
	
	
		
			13 KiB
		
	
	
	
		
			Go
		
	
	
	
	
	
/*
 | 
						|
   Copyright The containerd Authors.
 | 
						|
 | 
						|
   Licensed under the Apache License, Version 2.0 (the "License");
 | 
						|
   you may not use this file except in compliance with the License.
 | 
						|
   You may obtain a copy of the License at
 | 
						|
 | 
						|
       http://www.apache.org/licenses/LICENSE-2.0
 | 
						|
 | 
						|
   Unless required by applicable law or agreed to in writing, software
 | 
						|
   distributed under the License is distributed on an "AS IS" BASIS,
 | 
						|
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 | 
						|
   See the License for the specific language governing permissions and
 | 
						|
   limitations under the License.
 | 
						|
*/
 | 
						|
 | 
						|
package integration
 | 
						|
 | 
						|
import (
 | 
						|
	"context"
 | 
						|
	"errors"
 | 
						|
	"fmt"
 | 
						|
	"os"
 | 
						|
	"os/exec"
 | 
						|
	"os/user"
 | 
						|
	"path/filepath"
 | 
						|
	"strings"
 | 
						|
	"syscall"
 | 
						|
	"testing"
 | 
						|
	"time"
 | 
						|
 | 
						|
	"github.com/containerd/containerd/v2/integration/images"
 | 
						|
	runc "github.com/containerd/go-runc"
 | 
						|
	"github.com/stretchr/testify/assert"
 | 
						|
	"github.com/stretchr/testify/require"
 | 
						|
	"golang.org/x/sys/unix"
 | 
						|
	runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
 | 
						|
)
 | 
						|
 | 
						|
// defaultRoot is the directory probed for idmapped-mount support before the
// user namespace tests run (presumably the containerd root directory used by
// the integration environment; verify against the test setup).
const defaultRoot = "/var/lib/containerd-test"
 | 
						|
 | 
						|
// supportsUserNS reports whether user namespaces appear to be available,
// judged by the presence of /proc/self/ns/user. Only a "does not exist" error
// counts as unsupported; any other stat outcome is treated as supported.
func supportsUserNS() bool {
	_, statErr := os.Stat("/proc/self/ns/user")
	return !os.IsNotExist(statErr)
}
 | 
						|
 | 
						|
func supportsIDMap(path string) bool {
 | 
						|
	treeFD, err := unix.OpenTree(-1, path, uint(unix.OPEN_TREE_CLONE|unix.OPEN_TREE_CLOEXEC))
 | 
						|
	if err != nil {
 | 
						|
		return false
 | 
						|
	}
 | 
						|
	defer unix.Close(treeFD)
 | 
						|
 | 
						|
	// We want to test if idmap mounts are supported.
 | 
						|
	// So we use just some random mapping, it doesn't really matter which one.
 | 
						|
	// For the helper command, we just need something that is alive while we
 | 
						|
	// test this, a sleep 5 will do it.
 | 
						|
	cmd := exec.Command("sleep", "5")
 | 
						|
	cmd.SysProcAttr = &syscall.SysProcAttr{
 | 
						|
		Cloneflags:  syscall.CLONE_NEWUSER,
 | 
						|
		UidMappings: []syscall.SysProcIDMap{{ContainerID: 0, HostID: 65536, Size: 65536}},
 | 
						|
		GidMappings: []syscall.SysProcIDMap{{ContainerID: 0, HostID: 65536, Size: 65536}},
 | 
						|
	}
 | 
						|
	if err := cmd.Start(); err != nil {
 | 
						|
		return false
 | 
						|
	}
 | 
						|
	defer func() {
 | 
						|
		_ = cmd.Process.Kill()
 | 
						|
		_ = cmd.Wait()
 | 
						|
	}()
 | 
						|
 | 
						|
	usernsFD := fmt.Sprintf("/proc/%d/ns/user", cmd.Process.Pid)
 | 
						|
	var usernsFile *os.File
 | 
						|
	if usernsFile, err = os.Open(usernsFD); err != nil {
 | 
						|
		return false
 | 
						|
	}
 | 
						|
	defer usernsFile.Close()
 | 
						|
 | 
						|
	attr := unix.MountAttr{
 | 
						|
		Attr_set:  unix.MOUNT_ATTR_IDMAP,
 | 
						|
		Userns_fd: uint64(usernsFile.Fd()),
 | 
						|
	}
 | 
						|
	if err := unix.MountSetattr(treeFD, "", unix.AT_EMPTY_PATH, &attr); err != nil {
 | 
						|
		return false
 | 
						|
	}
 | 
						|
 | 
						|
	return true
 | 
						|
}
 | 
						|
 | 
						|
// traversePath gives 755 permissions to the elements of tPath that lie below
// os.TempDir() and returns an error if an element above it lacks read+exec
// permissions for others.
//
// tPath MUST be located under os.TempDir() (the path returned by
// testing.TempDir() usually is); otherwise an error is returned. The location
// check is done component-wise, so a sibling directory that merely shares a
// string prefix with the temp dir (e.g. "/tmp-foo" for "/tmp") is rejected.
//
// Elements that already grant read+exec to others are left untouched. For the
// remaining ones, 0o755 is OR-ed into the existing permission bits.
func traversePath(tPath string) error {
	tempBase := os.TempDir()

	// Check the assumption that the argument is under os.TempDir(). A plain
	// string-prefix comparison would also accept siblings such as "/tmp-foo",
	// so compare the cleaned paths component by component instead.
	rel, err := filepath.Rel(tempBase, tPath)
	if err != nil || rel == ".." || strings.HasPrefix(rel, ".."+string(filepath.Separator)) {
		return fmt.Errorf("traversePath: %q is not a descendant of %q", tPath, tempBase)
	}

	// Walk tPath from the root downwards, one element at a time.
	var path string
	for _, p := range strings.SplitAfter(tPath, "/") {
		path += p
		stats, err := os.Stat(path)
		if err != nil {
			return err
		}

		perm := stats.Mode().Perm()
		if perm&0o5 == 0o5 {
			// Others can already read and traverse this element.
			continue
		}

		// Elements that are a prefix of the temp dir are not ours to modify.
		if strings.HasPrefix(tempBase, path) {
			return fmt.Errorf("traversePath: directory %q MUST have read+exec permissions for others", path)
		}

		if err := os.Chmod(path, perm|0o755); err != nil {
			return err
		}
	}

	return nil
}
 | 
						|
 | 
						|
func TestPodUserNS(t *testing.T) {
 | 
						|
	containerID := uint32(0)
 | 
						|
	hostID := uint32(65536)
 | 
						|
	size := uint32(65536)
 | 
						|
	idmap := []*runtime.IDMapping{
 | 
						|
		{
 | 
						|
			ContainerId: containerID,
 | 
						|
			HostId:      hostID,
 | 
						|
			Length:      size,
 | 
						|
		},
 | 
						|
	}
 | 
						|
 | 
						|
	volumeHostPath := t.TempDir()
 | 
						|
	if err := traversePath(volumeHostPath); err != nil {
 | 
						|
		t.Fatalf("failed to setup volume host path: %v", err)
 | 
						|
	}
 | 
						|
 | 
						|
	for name, test := range map[string]struct {
 | 
						|
		sandboxOpts   []PodSandboxOpts
 | 
						|
		containerOpts []ContainerOpts
 | 
						|
		checkOutput   func(t *testing.T, output string)
 | 
						|
		hostVolumes   bool // whether to config uses host Volumes
 | 
						|
		expectErr     bool
 | 
						|
	}{
 | 
						|
		"userns uid mapping": {
 | 
						|
			sandboxOpts: []PodSandboxOpts{
 | 
						|
				WithPodUserNs(containerID, hostID, size),
 | 
						|
			},
 | 
						|
			containerOpts: []ContainerOpts{
 | 
						|
				WithUserNamespace(containerID, hostID, size),
 | 
						|
				WithCommand("cat", "/proc/self/uid_map"),
 | 
						|
			},
 | 
						|
			checkOutput: func(t *testing.T, output string) {
 | 
						|
				// The output should contain the length of the userns requested.
 | 
						|
				assert.Contains(t, output, fmt.Sprint(size))
 | 
						|
			},
 | 
						|
		},
 | 
						|
		"userns gid mapping": {
 | 
						|
			sandboxOpts: []PodSandboxOpts{
 | 
						|
				WithPodUserNs(containerID, hostID, size),
 | 
						|
			},
 | 
						|
			containerOpts: []ContainerOpts{
 | 
						|
				WithUserNamespace(containerID, hostID, size),
 | 
						|
				WithCommand("cat", "/proc/self/gid_map"),
 | 
						|
			},
 | 
						|
			checkOutput: func(t *testing.T, output string) {
 | 
						|
				// The output should contain the length of the userns requested.
 | 
						|
				assert.Contains(t, output, fmt.Sprint(size))
 | 
						|
			},
 | 
						|
		},
 | 
						|
		"rootfs permissions": {
 | 
						|
			sandboxOpts: []PodSandboxOpts{
 | 
						|
				WithPodUserNs(containerID, hostID, size),
 | 
						|
			},
 | 
						|
			containerOpts: []ContainerOpts{
 | 
						|
				WithUserNamespace(containerID, hostID, size),
 | 
						|
				// Prints numeric UID and GID for path.
 | 
						|
				// For example, if UID and GID is 0 it will print: =0=0=
 | 
						|
				// We add the "=" signs so we use can assert.Contains() and be sure
 | 
						|
				// the UID/GID is 0 and not things like 100 (that contain 0).
 | 
						|
				// We can't use assert.Equal() easily as it contains timestamp, etc.
 | 
						|
				WithCommand("stat", "-c", "'=%u=%g='", "/root/"),
 | 
						|
			},
 | 
						|
			checkOutput: func(t *testing.T, output string) {
 | 
						|
				// The UID and GID should be 0 (root) if the chown/remap is done correctly.
 | 
						|
				assert.Contains(t, output, "=0=0=")
 | 
						|
			},
 | 
						|
		},
 | 
						|
		"volumes permissions": {
 | 
						|
			sandboxOpts: []PodSandboxOpts{
 | 
						|
				WithPodUserNs(containerID, hostID, size),
 | 
						|
			},
 | 
						|
			hostVolumes: true,
 | 
						|
			containerOpts: []ContainerOpts{
 | 
						|
				WithUserNamespace(containerID, hostID, size),
 | 
						|
				WithIDMapVolumeMount(volumeHostPath, "/mnt", idmap, idmap),
 | 
						|
				// Prints numeric UID and GID for path.
 | 
						|
				// For example, if UID and GID is 0 it will print: =0=0=
 | 
						|
				// We add the "=" signs so we use can assert.Contains() and be sure
 | 
						|
				// the UID/GID is 0 and not things like 100 (that contain 0).
 | 
						|
				// We can't use assert.Equal() easily as it contains timestamp, etc.
 | 
						|
				WithCommand("stat", "-c", "'=%u=%g='", "/mnt/"),
 | 
						|
			},
 | 
						|
			checkOutput: func(t *testing.T, output string) {
 | 
						|
				// The UID and GID should be the current user if chown/remap is done correctly.
 | 
						|
				uid := "0"
 | 
						|
				user, err := user.Current()
 | 
						|
				if user != nil && err == nil {
 | 
						|
					uid = user.Uid
 | 
						|
				}
 | 
						|
				assert.Contains(t, output, "="+uid+"="+uid+"=")
 | 
						|
			},
 | 
						|
		},
 | 
						|
		"fails with several mappings": {
 | 
						|
			sandboxOpts: []PodSandboxOpts{
 | 
						|
				WithPodUserNs(containerID, hostID, size),
 | 
						|
				WithPodUserNs(containerID*2, hostID*2, size*2),
 | 
						|
			},
 | 
						|
			expectErr: true,
 | 
						|
		},
 | 
						|
	} {
 | 
						|
		t.Run(name, func(t *testing.T) {
 | 
						|
			if !supportsUserNS() {
 | 
						|
				t.Skip("User namespaces are not supported")
 | 
						|
			}
 | 
						|
			if !supportsIDMap(defaultRoot) {
 | 
						|
				t.Skipf("ID mappings are not supported on: %v", defaultRoot)
 | 
						|
			}
 | 
						|
			if test.hostVolumes && !supportsIDMap(volumeHostPath) {
 | 
						|
				t.Skipf("ID mappings are not supported host volume filesystem: %v", volumeHostPath)
 | 
						|
			}
 | 
						|
			if err := supportsRuncIDMap(); err != nil {
 | 
						|
				t.Skipf("OCI runtime doesn't support idmap mounts: %v", err)
 | 
						|
			}
 | 
						|
 | 
						|
			testPodLogDir := t.TempDir()
 | 
						|
			sandboxOpts := append(test.sandboxOpts, WithPodLogDirectory(testPodLogDir))
 | 
						|
			t.Log("Create a sandbox with userns")
 | 
						|
			sbConfig := PodSandboxConfig("sandbox", "userns", sandboxOpts...)
 | 
						|
			sb, err := runtimeService.RunPodSandbox(sbConfig, *runtimeHandler)
 | 
						|
			if err != nil {
 | 
						|
				if !test.expectErr {
 | 
						|
					t.Fatalf("Unexpected RunPodSandbox error: %v", err)
 | 
						|
				}
 | 
						|
				return
 | 
						|
			}
 | 
						|
			// Make sure the sandbox is cleaned up.
 | 
						|
			defer func() {
 | 
						|
				assert.NoError(t, runtimeService.StopPodSandbox(sb))
 | 
						|
				assert.NoError(t, runtimeService.RemovePodSandbox(sb))
 | 
						|
			}()
 | 
						|
			if test.expectErr {
 | 
						|
				t.Fatalf("Expected RunPodSandbox to return error")
 | 
						|
			}
 | 
						|
 | 
						|
			var (
 | 
						|
				testImage     = images.Get(images.BusyBox)
 | 
						|
				containerName = "test-container"
 | 
						|
			)
 | 
						|
 | 
						|
			EnsureImageExists(t, testImage)
 | 
						|
 | 
						|
			containerOpts := append(test.containerOpts,
 | 
						|
				WithLogPath(containerName),
 | 
						|
			)
 | 
						|
			t.Log("Create a container for userns")
 | 
						|
			cnConfig := ContainerConfig(
 | 
						|
				containerName,
 | 
						|
				testImage,
 | 
						|
				containerOpts...,
 | 
						|
			)
 | 
						|
			cn, err := runtimeService.CreateContainer(sb, cnConfig, sbConfig)
 | 
						|
			require.NoError(t, err)
 | 
						|
 | 
						|
			t.Log("Start the container")
 | 
						|
			require.NoError(t, runtimeService.StartContainer(cn))
 | 
						|
 | 
						|
			t.Log("Wait for container to finish running")
 | 
						|
			require.NoError(t, Eventually(func() (bool, error) {
 | 
						|
				s, err := runtimeService.ContainerStatus(cn)
 | 
						|
				if err != nil {
 | 
						|
					return false, err
 | 
						|
				}
 | 
						|
				if s.GetState() == runtime.ContainerState_CONTAINER_EXITED {
 | 
						|
					return true, nil
 | 
						|
				}
 | 
						|
				return false, nil
 | 
						|
			}, time.Second, 30*time.Second))
 | 
						|
 | 
						|
			content, err := os.ReadFile(filepath.Join(testPodLogDir, containerName))
 | 
						|
			assert.NoError(t, err)
 | 
						|
 | 
						|
			t.Log("Running check function")
 | 
						|
			test.checkOutput(t, string(content))
 | 
						|
		})
 | 
						|
	}
 | 
						|
}
 | 
						|
 | 
						|
// TestIssue10598 tests a case[1] that init processes in container should be able
 | 
						|
// to open /dev/stdout or /dev/stderr if init processes are running in their
 | 
						|
// user namespace instead of root user.
 | 
						|
//
 | 
						|
// The shim server creates pipe for init processes' standard output. By default,
 | 
						|
// the owner of pipe is the same to shim server (root user). Let's say, the init
 | 
						|
// process is running with uid=1000/gid=1000 user. Init processes inherits the
 | 
						|
// pipe created by shim server so that it can just write data into that pipe.
 | 
						|
// However, if that init process tries to open /dev/stderr, the kernel will
 | 
						|
// return no permission error.
 | 
						|
//
 | 
						|
// The following output is from retsnoop[2].
 | 
						|
//
 | 
						|
//	→ do_open
 | 
						|
//	         → inode_permission
 | 
						|
//	             → generic_permission
 | 
						|
//	                 ↔ make_vfsuid      [0]                     0.500us
 | 
						|
//	                 ↔ make_vfsuid      [0]                     6.501us
 | 
						|
//	                 ↔ from_kuid        [0xffffffff]            0.700us
 | 
						|
//	             ← generic_permission   [-EACCES]              13.501us
 | 
						|
//
 | 
						|
// Since uid_map/gid_map doesn't cover uid=0/gid=0, the kernel can't convert
 | 
						|
// uid=0 into valid uid in that uid_map. So, `from_kuid` returns invalid uid
 | 
						|
// value and then `do_open` returns EACCES error.
 | 
						|
//
 | 
						|
// [1]: https://github.com/containerd/containerd/issues/10598
 | 
						|
// [2]: https://github.com/anakryiko/retsnoop
 | 
						|
func TestIssue10598(t *testing.T) {
 | 
						|
	if !supportsUserNS() {
 | 
						|
		t.Skip("User namespaces are not supported")
 | 
						|
	}
 | 
						|
	if !supportsIDMap(defaultRoot) {
 | 
						|
		t.Skipf("ID mappings are not supported on: %v", defaultRoot)
 | 
						|
	}
 | 
						|
	if err := supportsRuncIDMap(); err != nil {
 | 
						|
		t.Skipf("OCI runtime doesn't support idmap mounts: %v", err)
 | 
						|
	}
 | 
						|
 | 
						|
	testPodLogDir := t.TempDir()
 | 
						|
 | 
						|
	containerID := uint32(0)
 | 
						|
	hostID := uint32(65536)
 | 
						|
	size := uint32(65536)
 | 
						|
 | 
						|
	t.Log("Create a sandbox with userns")
 | 
						|
	sandboxOpts := []PodSandboxOpts{
 | 
						|
		WithPodUserNs(containerID, hostID, size),
 | 
						|
		WithPodLogDirectory(testPodLogDir),
 | 
						|
	}
 | 
						|
	sbConfig := PodSandboxConfig("issue10598", "userns", sandboxOpts...)
 | 
						|
	sb, err := runtimeService.RunPodSandbox(sbConfig, *runtimeHandler)
 | 
						|
	require.NoError(t, err)
 | 
						|
 | 
						|
	// Make sure the sandbox is cleaned up.
 | 
						|
	defer func() {
 | 
						|
		assert.NoError(t, runtimeService.StopPodSandbox(sb))
 | 
						|
		assert.NoError(t, runtimeService.RemovePodSandbox(sb))
 | 
						|
	}()
 | 
						|
 | 
						|
	t.Log("Create a container for userns")
 | 
						|
 | 
						|
	containerName := "nginx-userns"
 | 
						|
	testImage := images.Get(images.Nginx)
 | 
						|
 | 
						|
	EnsureImageExists(t, testImage)
 | 
						|
 | 
						|
	containerOpts := []ContainerOpts{
 | 
						|
		WithUserNamespace(containerID, hostID, size),
 | 
						|
		WithLogPath(containerName),
 | 
						|
		// The SELinux policy enforced by container-selinux prevents
 | 
						|
		// NGINX from opening the /proc/self/fd/2 pipe. This scenario
 | 
						|
		// is not intended to verify SELinux behavior in the user namespace
 | 
						|
		// but rather to confirm the ownership of the standard output
 | 
						|
		// file descriptor. The following option demonstrates how to
 | 
						|
		// disable the restrictive SELinux rule for the NGINX process.
 | 
						|
		WithSELinuxOptions(
 | 
						|
			"unconfined_u",
 | 
						|
			"unconfined_r",
 | 
						|
			"container_runtime_t",
 | 
						|
			"s0",
 | 
						|
		),
 | 
						|
	}
 | 
						|
 | 
						|
	cnConfig := ContainerConfig(
 | 
						|
		containerName,
 | 
						|
		testImage,
 | 
						|
		containerOpts...,
 | 
						|
	)
 | 
						|
	cn, err := runtimeService.CreateContainer(sb, cnConfig, sbConfig)
 | 
						|
	require.NoError(t, err)
 | 
						|
 | 
						|
	t.Log("Start the container")
 | 
						|
	require.NoError(t, runtimeService.StartContainer(cn))
 | 
						|
 | 
						|
	t.Log("Wait for container to start")
 | 
						|
	require.NoError(t, Eventually(func() (bool, error) {
 | 
						|
		content, err := os.ReadFile(filepath.Join(testPodLogDir, containerName))
 | 
						|
		if err != nil {
 | 
						|
			return false, err
 | 
						|
		}
 | 
						|
 | 
						|
		s, err := runtimeService.ContainerStatus(cn)
 | 
						|
		if err != nil {
 | 
						|
			return false, err
 | 
						|
		}
 | 
						|
 | 
						|
		if state := s.GetState(); state != runtime.ContainerState_CONTAINER_RUNNING {
 | 
						|
			return false, fmt.Errorf("%s is not running\nstate: %s\nlog: %s",
 | 
						|
				containerName, state, string(content))
 | 
						|
		}
 | 
						|
 | 
						|
		started := strings.Contains(string(content), "start worker processes")
 | 
						|
		if started {
 | 
						|
			t.Log(string(content))
 | 
						|
		}
 | 
						|
		return started, nil
 | 
						|
	}, time.Second, 30*time.Second))
 | 
						|
}
 | 
						|
 | 
						|
func supportsRuncIDMap() error {
 | 
						|
	var r runc.Runc
 | 
						|
	features, err := r.Features(context.Background())
 | 
						|
	if err != nil {
 | 
						|
		// If the features command is not implemented, then runc is too old.
 | 
						|
		return fmt.Errorf("features command failed: %w", err)
 | 
						|
	}
 | 
						|
 | 
						|
	if features.Linux.MountExtensions == nil || features.Linux.MountExtensions.IDMap == nil {
 | 
						|
		return errors.New("missing `mountExtensions.idmap` entry in `features` command")
 | 
						|
 | 
						|
	}
 | 
						|
	if enabled := features.Linux.MountExtensions.IDMap.Enabled; enabled == nil || !*enabled {
 | 
						|
		return errors.New("idmap mounts not supported")
 | 
						|
	}
 | 
						|
 | 
						|
	return nil
 | 
						|
}
 |