
The containerd-shim creates pipes and passes them to the init container as stdin, stdout, and stderr for logging purposes. By default, these pipes are owned by the root user (UID/GID: 0/0). The init container can access them directly through inheritance. However, if the init container attempts to open any files pointing to these pipes (e.g., /proc/1/fd/2, /dev/stderr), it will encounter a permission issue since it is not the owner. To avoid this, we need to align the ownership of the pipes with the init process. Fixes: #10598 Signed-off-by: Wei Fu <fuweid89@gmail.com>
444 lines
13 KiB
Go
444 lines
13 KiB
Go
/*
|
|
Copyright The containerd Authors.
|
|
|
|
Licensed under the Apache License, Version 2.0 (the "License");
|
|
you may not use this file except in compliance with the License.
|
|
You may obtain a copy of the License at
|
|
|
|
http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
Unless required by applicable law or agreed to in writing, software
|
|
distributed under the License is distributed on an "AS IS" BASIS,
|
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
See the License for the specific language governing permissions and
|
|
limitations under the License.
|
|
*/
|
|
|
|
package integration
|
|
|
|
import (
|
|
"context"
|
|
"errors"
|
|
"fmt"
|
|
"os"
|
|
"os/exec"
|
|
"os/user"
|
|
"path/filepath"
|
|
"strings"
|
|
"syscall"
|
|
"testing"
|
|
"time"
|
|
|
|
"github.com/containerd/containerd/v2/integration/images"
|
|
runc "github.com/containerd/go-runc"
|
|
"github.com/stretchr/testify/assert"
|
|
"github.com/stretchr/testify/require"
|
|
"golang.org/x/sys/unix"
|
|
runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
|
|
)
|
|
|
|
// defaultRoot is the directory the tests in this file probe for ID-mapped
// mount support before running.
// NOTE(review): presumably the root directory of the containerd instance under
// test — confirm against the integration test setup.
const defaultRoot = "/var/lib/containerd-test"
|
|
|
|
// supportsUserNS reports whether user namespaces look available, judged by the
// presence of /proc/self/ns/user.
//
// Only a "does not exist" error from os.Stat is treated as unsupported; any
// other outcome (including other stat errors) is reported as supported.
func supportsUserNS() bool {
	_, statErr := os.Stat("/proc/self/ns/user")
	return !os.IsNotExist(statErr)
}
|
|
|
|
func supportsIDMap(path string) bool {
|
|
treeFD, err := unix.OpenTree(-1, path, uint(unix.OPEN_TREE_CLONE|unix.OPEN_TREE_CLOEXEC))
|
|
if err != nil {
|
|
return false
|
|
}
|
|
defer unix.Close(treeFD)
|
|
|
|
// We want to test if idmap mounts are supported.
|
|
// So we use just some random mapping, it doesn't really matter which one.
|
|
// For the helper command, we just need something that is alive while we
|
|
// test this, a sleep 5 will do it.
|
|
cmd := exec.Command("sleep", "5")
|
|
cmd.SysProcAttr = &syscall.SysProcAttr{
|
|
Cloneflags: syscall.CLONE_NEWUSER,
|
|
UidMappings: []syscall.SysProcIDMap{{ContainerID: 0, HostID: 65536, Size: 65536}},
|
|
GidMappings: []syscall.SysProcIDMap{{ContainerID: 0, HostID: 65536, Size: 65536}},
|
|
}
|
|
if err := cmd.Start(); err != nil {
|
|
return false
|
|
}
|
|
defer func() {
|
|
_ = cmd.Process.Kill()
|
|
_ = cmd.Wait()
|
|
}()
|
|
|
|
usernsFD := fmt.Sprintf("/proc/%d/ns/user", cmd.Process.Pid)
|
|
var usernsFile *os.File
|
|
if usernsFile, err = os.Open(usernsFD); err != nil {
|
|
return false
|
|
}
|
|
defer usernsFile.Close()
|
|
|
|
attr := unix.MountAttr{
|
|
Attr_set: unix.MOUNT_ATTR_IDMAP,
|
|
Userns_fd: uint64(usernsFile.Fd()),
|
|
}
|
|
if err := unix.MountSetattr(treeFD, "", unix.AT_EMPTY_PATH, &attr); err != nil {
|
|
return false
|
|
}
|
|
|
|
return true
|
|
}
|
|
|
|
// traversePath gives 755 permissions for all elements in tPath below
// os.TempDir() and errors out if elements above it don't have read+exec
// permissions for others. tPath MUST be os.TempDir() itself or a descendant of
// it. The path returned by testing.TempDir() usually is.
//
// The descendant check is done per path component, so a sibling directory that
// merely shares a textual prefix with the temp dir (e.g. "/tmpfoo" vs "/tmp")
// is rejected instead of being chmod'ed.
//
// It returns an error when tPath is not under os.TempDir(), when any element
// cannot be stat'ed or chmod'ed, or when an element above the temp dir lacks
// read+exec permissions for others.
func traversePath(tPath string) error {
	// Check the assumption that the argument is under os.TempDir(). A plain
	// string prefix test is not enough here: it would also accept siblings
	// such as "/tmpfoo/x" for a temp dir of "/tmp".
	tempBase := filepath.Clean(os.TempDir())
	rel, err := filepath.Rel(tempBase, filepath.Clean(tPath))
	if err != nil || rel == ".." || strings.HasPrefix(rel, "../") {
		return fmt.Errorf("traversePath: %q is not a descendant of %q", tPath, tempBase)
	}

	// Walk tPath one element at a time. SplitAfter keeps the trailing "/" on
	// every element but the last, so path grows as "/", "/tmp/", "/tmp/x/", ...
	var path string
	for _, p := range strings.SplitAfter(tPath, "/") {
		path += p
		stats, err := os.Stat(path)
		if err != nil {
			return err
		}

		// Already readable and searchable by others: nothing to do.
		if stats.Mode().Perm()&0o5 == 0o5 {
			continue
		}
		// Because of the trailing "/" this matches the strict ancestors of the
		// temp dir (and the temp dir itself when it is the final element).
		// Those are not ours to modify, so fail instead.
		if strings.HasPrefix(tempBase, path) {
			return fmt.Errorf("traversePath: directory %q MUST have read+exec permissions for others", path)
		}

		// Pass the full mode rather than only the permission bits so that
		// setuid/setgid/sticky bits already present on the directory survive.
		if err := os.Chmod(path, stats.Mode()|0o755); err != nil {
			return err
		}
	}

	return nil
}
|
|
|
|
func TestPodUserNS(t *testing.T) {
|
|
containerID := uint32(0)
|
|
hostID := uint32(65536)
|
|
size := uint32(65536)
|
|
idmap := []*runtime.IDMapping{
|
|
{
|
|
ContainerId: containerID,
|
|
HostId: hostID,
|
|
Length: size,
|
|
},
|
|
}
|
|
|
|
volumeHostPath := t.TempDir()
|
|
if err := traversePath(volumeHostPath); err != nil {
|
|
t.Fatalf("failed to setup volume host path: %v", err)
|
|
}
|
|
|
|
for name, test := range map[string]struct {
|
|
sandboxOpts []PodSandboxOpts
|
|
containerOpts []ContainerOpts
|
|
checkOutput func(t *testing.T, output string)
|
|
hostVolumes bool // whether to config uses host Volumes
|
|
expectErr bool
|
|
}{
|
|
"userns uid mapping": {
|
|
sandboxOpts: []PodSandboxOpts{
|
|
WithPodUserNs(containerID, hostID, size),
|
|
},
|
|
containerOpts: []ContainerOpts{
|
|
WithUserNamespace(containerID, hostID, size),
|
|
WithCommand("cat", "/proc/self/uid_map"),
|
|
},
|
|
checkOutput: func(t *testing.T, output string) {
|
|
// The output should contain the length of the userns requested.
|
|
assert.Contains(t, output, fmt.Sprint(size))
|
|
},
|
|
},
|
|
"userns gid mapping": {
|
|
sandboxOpts: []PodSandboxOpts{
|
|
WithPodUserNs(containerID, hostID, size),
|
|
},
|
|
containerOpts: []ContainerOpts{
|
|
WithUserNamespace(containerID, hostID, size),
|
|
WithCommand("cat", "/proc/self/gid_map"),
|
|
},
|
|
checkOutput: func(t *testing.T, output string) {
|
|
// The output should contain the length of the userns requested.
|
|
assert.Contains(t, output, fmt.Sprint(size))
|
|
},
|
|
},
|
|
"rootfs permissions": {
|
|
sandboxOpts: []PodSandboxOpts{
|
|
WithPodUserNs(containerID, hostID, size),
|
|
},
|
|
containerOpts: []ContainerOpts{
|
|
WithUserNamespace(containerID, hostID, size),
|
|
// Prints numeric UID and GID for path.
|
|
// For example, if UID and GID is 0 it will print: =0=0=
|
|
// We add the "=" signs so we use can assert.Contains() and be sure
|
|
// the UID/GID is 0 and not things like 100 (that contain 0).
|
|
// We can't use assert.Equal() easily as it contains timestamp, etc.
|
|
WithCommand("stat", "-c", "'=%u=%g='", "/root/"),
|
|
},
|
|
checkOutput: func(t *testing.T, output string) {
|
|
// The UID and GID should be 0 (root) if the chown/remap is done correctly.
|
|
assert.Contains(t, output, "=0=0=")
|
|
},
|
|
},
|
|
"volumes permissions": {
|
|
sandboxOpts: []PodSandboxOpts{
|
|
WithPodUserNs(containerID, hostID, size),
|
|
},
|
|
hostVolumes: true,
|
|
containerOpts: []ContainerOpts{
|
|
WithUserNamespace(containerID, hostID, size),
|
|
WithIDMapVolumeMount(volumeHostPath, "/mnt", idmap, idmap),
|
|
// Prints numeric UID and GID for path.
|
|
// For example, if UID and GID is 0 it will print: =0=0=
|
|
// We add the "=" signs so we use can assert.Contains() and be sure
|
|
// the UID/GID is 0 and not things like 100 (that contain 0).
|
|
// We can't use assert.Equal() easily as it contains timestamp, etc.
|
|
WithCommand("stat", "-c", "'=%u=%g='", "/mnt/"),
|
|
},
|
|
checkOutput: func(t *testing.T, output string) {
|
|
// The UID and GID should be the current user if chown/remap is done correctly.
|
|
uid := "0"
|
|
user, err := user.Current()
|
|
if user != nil && err == nil {
|
|
uid = user.Uid
|
|
}
|
|
assert.Contains(t, output, "="+uid+"="+uid+"=")
|
|
},
|
|
},
|
|
"fails with several mappings": {
|
|
sandboxOpts: []PodSandboxOpts{
|
|
WithPodUserNs(containerID, hostID, size),
|
|
WithPodUserNs(containerID*2, hostID*2, size*2),
|
|
},
|
|
expectErr: true,
|
|
},
|
|
} {
|
|
t.Run(name, func(t *testing.T) {
|
|
if !supportsUserNS() {
|
|
t.Skip("User namespaces are not supported")
|
|
}
|
|
if !supportsIDMap(defaultRoot) {
|
|
t.Skipf("ID mappings are not supported on: %v", defaultRoot)
|
|
}
|
|
if test.hostVolumes && !supportsIDMap(volumeHostPath) {
|
|
t.Skipf("ID mappings are not supported host volume filesystem: %v", volumeHostPath)
|
|
}
|
|
if err := supportsRuncIDMap(); err != nil {
|
|
t.Skipf("OCI runtime doesn't support idmap mounts: %v", err)
|
|
}
|
|
|
|
testPodLogDir := t.TempDir()
|
|
sandboxOpts := append(test.sandboxOpts, WithPodLogDirectory(testPodLogDir))
|
|
t.Log("Create a sandbox with userns")
|
|
sbConfig := PodSandboxConfig("sandbox", "userns", sandboxOpts...)
|
|
sb, err := runtimeService.RunPodSandbox(sbConfig, *runtimeHandler)
|
|
if err != nil {
|
|
if !test.expectErr {
|
|
t.Fatalf("Unexpected RunPodSandbox error: %v", err)
|
|
}
|
|
return
|
|
}
|
|
// Make sure the sandbox is cleaned up.
|
|
defer func() {
|
|
assert.NoError(t, runtimeService.StopPodSandbox(sb))
|
|
assert.NoError(t, runtimeService.RemovePodSandbox(sb))
|
|
}()
|
|
if test.expectErr {
|
|
t.Fatalf("Expected RunPodSandbox to return error")
|
|
}
|
|
|
|
var (
|
|
testImage = images.Get(images.BusyBox)
|
|
containerName = "test-container"
|
|
)
|
|
|
|
EnsureImageExists(t, testImage)
|
|
|
|
containerOpts := append(test.containerOpts,
|
|
WithLogPath(containerName),
|
|
)
|
|
t.Log("Create a container for userns")
|
|
cnConfig := ContainerConfig(
|
|
containerName,
|
|
testImage,
|
|
containerOpts...,
|
|
)
|
|
cn, err := runtimeService.CreateContainer(sb, cnConfig, sbConfig)
|
|
require.NoError(t, err)
|
|
|
|
t.Log("Start the container")
|
|
require.NoError(t, runtimeService.StartContainer(cn))
|
|
|
|
t.Log("Wait for container to finish running")
|
|
require.NoError(t, Eventually(func() (bool, error) {
|
|
s, err := runtimeService.ContainerStatus(cn)
|
|
if err != nil {
|
|
return false, err
|
|
}
|
|
if s.GetState() == runtime.ContainerState_CONTAINER_EXITED {
|
|
return true, nil
|
|
}
|
|
return false, nil
|
|
}, time.Second, 30*time.Second))
|
|
|
|
content, err := os.ReadFile(filepath.Join(testPodLogDir, containerName))
|
|
assert.NoError(t, err)
|
|
|
|
t.Log("Running check function")
|
|
test.checkOutput(t, string(content))
|
|
})
|
|
}
|
|
}
|
|
|
|
// TestIssue10598 tests a case[1] that init processes in container should be able
|
|
// to open /dev/stdout or /dev/stderr if init processes are running in their
|
|
// user namespace instead of root user.
|
|
//
|
|
// The shim server creates pipe for init processes' standard output. By default,
|
|
// the owner of pipe is the same to shim server (root user). Let's say, the init
|
|
// process is running with uid=1000/gid=1000 user. Init processes inherits the
|
|
// pipe created by shim server so that it can just write data into that pipe.
|
|
// However, if that init process tries to open /dev/stderr, the kernel will
|
|
// return no permission error.
|
|
//
|
|
// The following output is from retsnoop[2].
|
|
//
|
|
// → do_open
|
|
// → inode_permission
|
|
// → generic_permission
|
|
// ↔ make_vfsuid [0] 0.500us
|
|
// ↔ make_vfsuid [0] 6.501us
|
|
// ↔ from_kuid [0xffffffff] 0.700us
|
|
// ← generic_permission [-EACCES] 13.501us
|
|
//
|
|
// Since uid_map/gid_map doesn't cover uid=0/gid=0, the kernel can't convert
|
|
// uid=0 into valid uid in that uid_map. So, `from_kuid` returns invalid uid
|
|
// value and then `do_open` returns EACCES error.
|
|
//
|
|
// [1]: https://github.com/containerd/containerd/issues/10598
|
|
// [2]: https://github.com/anakryiko/retsnoop
|
|
func TestIssue10598(t *testing.T) {
|
|
if !supportsUserNS() {
|
|
t.Skip("User namespaces are not supported")
|
|
}
|
|
if !supportsIDMap(defaultRoot) {
|
|
t.Skipf("ID mappings are not supported on: %v", defaultRoot)
|
|
}
|
|
if err := supportsRuncIDMap(); err != nil {
|
|
t.Skipf("OCI runtime doesn't support idmap mounts: %v", err)
|
|
}
|
|
|
|
testPodLogDir := t.TempDir()
|
|
|
|
containerID := uint32(0)
|
|
hostID := uint32(65536)
|
|
size := uint32(65536)
|
|
|
|
t.Log("Create a sandbox with userns")
|
|
sandboxOpts := []PodSandboxOpts{
|
|
WithPodUserNs(containerID, hostID, size),
|
|
WithPodLogDirectory(testPodLogDir),
|
|
}
|
|
sbConfig := PodSandboxConfig("issue10598", "userns", sandboxOpts...)
|
|
sb, err := runtimeService.RunPodSandbox(sbConfig, *runtimeHandler)
|
|
require.NoError(t, err)
|
|
|
|
// Make sure the sandbox is cleaned up.
|
|
defer func() {
|
|
assert.NoError(t, runtimeService.StopPodSandbox(sb))
|
|
assert.NoError(t, runtimeService.RemovePodSandbox(sb))
|
|
}()
|
|
|
|
t.Log("Create a container for userns")
|
|
|
|
containerName := "nginx-userns"
|
|
testImage := images.Get(images.Nginx)
|
|
|
|
EnsureImageExists(t, testImage)
|
|
|
|
containerOpts := []ContainerOpts{
|
|
WithUserNamespace(containerID, hostID, size),
|
|
WithLogPath(containerName),
|
|
// The SELinux policy enforced by container-selinux prevents
|
|
// NGINX from opening the /proc/self/fd/2 pipe. This scenario
|
|
// is not intended to verify SELinux behavior in the user namespace
|
|
// but rather to confirm the ownership of the standard output
|
|
// file descriptor. The following option demonstrates how to
|
|
// disable the restrictive SELinux rule for the NGINX process.
|
|
WithSELinuxOptions(
|
|
"unconfined_u",
|
|
"unconfined_r",
|
|
"container_runtime_t",
|
|
"s0",
|
|
),
|
|
}
|
|
|
|
cnConfig := ContainerConfig(
|
|
containerName,
|
|
testImage,
|
|
containerOpts...,
|
|
)
|
|
cn, err := runtimeService.CreateContainer(sb, cnConfig, sbConfig)
|
|
require.NoError(t, err)
|
|
|
|
t.Log("Start the container")
|
|
require.NoError(t, runtimeService.StartContainer(cn))
|
|
|
|
t.Log("Wait for container to start")
|
|
require.NoError(t, Eventually(func() (bool, error) {
|
|
content, err := os.ReadFile(filepath.Join(testPodLogDir, containerName))
|
|
if err != nil {
|
|
return false, err
|
|
}
|
|
|
|
s, err := runtimeService.ContainerStatus(cn)
|
|
if err != nil {
|
|
return false, err
|
|
}
|
|
|
|
if state := s.GetState(); state != runtime.ContainerState_CONTAINER_RUNNING {
|
|
return false, fmt.Errorf("%s is not running\nstate: %s\nlog: %s",
|
|
containerName, state, string(content))
|
|
}
|
|
|
|
started := strings.Contains(string(content), "start worker processes")
|
|
if started {
|
|
t.Log(string(content))
|
|
}
|
|
return started, nil
|
|
}, time.Second, 30*time.Second))
|
|
}
|
|
|
|
func supportsRuncIDMap() error {
|
|
var r runc.Runc
|
|
features, err := r.Features(context.Background())
|
|
if err != nil {
|
|
// If the features command is not implemented, then runc is too old.
|
|
return fmt.Errorf("features command failed: %w", err)
|
|
}
|
|
|
|
if features.Linux.MountExtensions == nil || features.Linux.MountExtensions.IDMap == nil {
|
|
return errors.New("missing `mountExtensions.idmap` entry in `features` command")
|
|
|
|
}
|
|
if enabled := features.Linux.MountExtensions.IDMap.Enabled; enabled == nil || !*enabled {
|
|
return errors.New("idmap mounts not supported")
|
|
}
|
|
|
|
return nil
|
|
}
|