diff --git a/integration/images/image_list.go b/integration/images/image_list.go index 38d1026f6..3c55e0931 100644 --- a/integration/images/image_list.go +++ b/integration/images/image_list.go @@ -38,6 +38,7 @@ type ImageList struct { VolumeOwnership string ArgsEscaped string DockerSchema1 string + Nginx string } var ( @@ -57,6 +58,7 @@ func initImages(imageListFile string) { VolumeOwnership: "ghcr.io/containerd/volume-ownership:2.1", ArgsEscaped: "cplatpublic.azurecr.io/args-escaped-test-image-ns:1.0", DockerSchema1: "registry.k8s.io/busybox@sha256:4bdd623e848417d96127e16037743f0cd8b528c026e9175e22a84f639eca58ff", + Nginx: "ghcr.io/containerd/nginx:1.27.0", } if imageListFile != "" { @@ -96,6 +98,8 @@ const ( ArgsEscaped // DockerSchema1 image with docker schema 1 DockerSchema1 + // Nginx image + Nginx ) func initImageMap(imageList ImageList) map[int]string { @@ -108,6 +112,7 @@ func initImageMap(imageList ImageList) map[int]string { images[VolumeOwnership] = imageList.VolumeOwnership images[ArgsEscaped] = imageList.ArgsEscaped images[DockerSchema1] = imageList.DockerSchema1 + images[Nginx] = imageList.Nginx return images } diff --git a/integration/main_test.go b/integration/main_test.go index 1a3acb0f4..77138b40c 100644 --- a/integration/main_test.go +++ b/integration/main_test.go @@ -471,6 +471,24 @@ func WithDevice(containerPath, hostPath, permissions string) ContainerOpts { } } +// WithSELinuxOptions allows to set SELinux option for container. +func WithSELinuxOptions(user, role, typ, level string) ContainerOpts { + return func(c *runtime.ContainerConfig) { + if c.Linux == nil { + c.Linux = &runtime.LinuxContainerConfig{} + } + if c.Linux.SecurityContext == nil { + c.Linux.SecurityContext = &runtime.LinuxContainerSecurityContext{} + } + c.Linux.SecurityContext.SelinuxOptions = &runtime.SELinuxOption{ + User: user, + Role: role, + Type: typ, + Level: level, + } + } +} + // ContainerConfig creates a container config given a name and image name // and additional container config options func ContainerConfig(name, image string, opts ...ContainerOpts) *runtime.ContainerConfig { diff --git a/integration/pod_userns_linux_test.go b/integration/pod_userns_linux_test.go index 16bb5a650..3615051ff 100644 --- a/integration/pod_userns_linux_test.go +++ b/integration/pod_userns_linux_test.go @@ -304,6 +304,125 @@ func TestPodUserNS(t *testing.T) { } } +// TestIssue10598 tests a case[1] that init processes in container should be able +// to open /dev/stdout or /dev/stderr if init processes are running in their +// user namespace instead of root user. +// +// The shim server creates pipe for init processes' standard output. By default, +// the owner of pipe is the same to shim server (root user). Let's say, the init +// process is running with uid=1000/gid=1000 user. Init processes inherits the +// pipe created by shim server so that it can just write data into that pipe. +// However, if that init process tries to open /dev/stderr, the kernel will +// return no permission error. +// +// The following output is from retsnoop[2]. +// +// → do_open +// → inode_permission +// → generic_permission +// ↔ make_vfsuid [0] 0.500us +// ↔ make_vfsuid [0] 6.501us +// ↔ from_kuid [0xffffffff] 0.700us +// ← generic_permission [-EACCES] 13.501us +// +// Since uid_map/gid_map doesn't cover uid=0/gid=0, the kernel can't convert +// uid=0 into valid uid in that uid_map. So, `from_kuid` returns invalid uid +// value and then `do_open` returns EACCES error. +// +// [1]: https://github.com/containerd/containerd/issues/10598 +// [2]: https://github.com/anakryiko/retsnoop +func TestIssue10598(t *testing.T) { + if !supportsUserNS() { + t.Skip("User namespaces are not supported") + } + if !supportsIDMap(defaultRoot) { + t.Skipf("ID mappings are not supported on: %v", defaultRoot) + } + if err := supportsRuncIDMap(); err != nil { + t.Skipf("OCI runtime doesn't support idmap mounts: %v", err) + } + + testPodLogDir := t.TempDir() + + containerID := uint32(0) + hostID := uint32(65536) + size := uint32(65536) + + t.Log("Create a sandbox with userns") + sandboxOpts := []PodSandboxOpts{ + WithPodUserNs(containerID, hostID, size), + WithPodLogDirectory(testPodLogDir), + } + sbConfig := PodSandboxConfig("issue10598", "userns", sandboxOpts...) + sb, err := runtimeService.RunPodSandbox(sbConfig, *runtimeHandler) + require.NoError(t, err) + + // Make sure the sandbox is cleaned up. + defer func() { + assert.NoError(t, runtimeService.StopPodSandbox(sb)) + assert.NoError(t, runtimeService.RemovePodSandbox(sb)) + }() + + t.Log("Create a container for userns") + + containerName := "nginx-userns" + testImage := images.Get(images.Nginx) + + EnsureImageExists(t, testImage) + + containerOpts := []ContainerOpts{ + WithUserNamespace(containerID, hostID, size), + WithLogPath(containerName), + // The SELinux policy enforced by container-selinux prevents + // NGINX from opening the /proc/self/fd/2 pipe. This scenario + // is not intended to verify SELinux behavior in the user namespace + // but rather to confirm the ownership of the standard output + // file descriptor. The following option demonstrates how to + // disable the restrictive SELinux rule for the NGINX process. + WithSELinuxOptions( + "unconfined_u", + "unconfined_r", + "container_runtime_t", + "s0", + ), + } + + cnConfig := ContainerConfig( + containerName, + testImage, + containerOpts..., + ) + cn, err := runtimeService.CreateContainer(sb, cnConfig, sbConfig) + require.NoError(t, err) + + t.Log("Start the container") + require.NoError(t, runtimeService.StartContainer(cn)) + + t.Log("Wait for container to start") + require.NoError(t, Eventually(func() (bool, error) { + content, err := os.ReadFile(filepath.Join(testPodLogDir, containerName)) + if err != nil { + return false, err + } + + s, err := runtimeService.ContainerStatus(cn) + if err != nil { + return false, err + } + + if state := s.GetState(); state != runtime.ContainerState_CONTAINER_RUNNING { + return false, fmt.Errorf("%s is not running\nstate: %s\nlog: %s", + containerName, state, string(content)) + } + + started := strings.Contains(string(content), "start worker processes") + if started { + t.Log(string(content)) + } + return started, nil + }, time.Second, 30*time.Second)) +} + func supportsRuncIDMap() error { var r runc.Runc features, err := r.Features(context.Background()) diff --git a/internal/cri/server/container_start.go b/internal/cri/server/container_start.go index 1e0681392..36e4d7246 100644 --- a/internal/cri/server/container_start.go +++ b/internal/cri/server/container_start.go @@ -129,6 +129,12 @@ func (c *criService) StartContainer(ctx context.Context, r *runtime.StartContain containerd.WithTaskAPIEndpoint(endpoint.Address, endpoint.Version)) } + ioOwnerTaskOpts, err := updateContainerIOOwner(ctx, container, config) + if err != nil { + return nil, fmt.Errorf("failed to update container IO owner: %w", err) + } + taskOpts = append(taskOpts, ioOwnerTaskOpts...) + task, err := container.NewTask(ctx, ioCreation, taskOpts...) if err != nil { return nil, fmt.Errorf("failed to create containerd task: %w", err) diff --git a/internal/cri/server/container_start_linux.go b/internal/cri/server/container_start_linux.go new file mode 100644 index 000000000..0f0a49cf0 --- /dev/null +++ b/internal/cri/server/container_start_linux.go @@ -0,0 +1,66 @@ +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package server + +import ( + "context" + "fmt" + + containerd "github.com/containerd/containerd/v2/client" + "github.com/containerd/containerd/v2/internal/userns" + runtime "k8s.io/cri-api/pkg/apis/runtime/v1" +) + +// updateContainerIOOwner updates I/O files' owner to align with initial processe's UID/GID. +func updateContainerIOOwner(ctx context.Context, cntr containerd.Container, config *runtime.ContainerConfig) ([]containerd.NewTaskOpts, error) { + if config.GetLinux() == nil { + return nil, nil + } + + // FIXME(fuweid): Ideally, the pipe owner should be aligned with process owner. + // No matter what user namespace container uses, it should work well. However, + // it breaks the sig-node conformance case - [when querying /stats/summary should report resource usage through the stats api]. + // In order to keep compatible, the change should apply to user namespace only. + if config.GetLinux().GetSecurityContext().GetNamespaceOptions().GetUsernsOptions() == nil { + return nil, nil + } + + spec, err := cntr.Spec(ctx) + if err != nil { + return nil, fmt.Errorf("failed to get spec: %w", err) + } + + if spec.Linux == nil || spec.Process == nil { + return nil, fmt.Errorf("invalid linux platform oci runtime spec") + } + + hostID, err := userns.IDMap{ + UidMap: spec.Linux.UIDMappings, + GidMap: spec.Linux.GIDMappings, + }.ToHost(userns.User{ + Uid: spec.Process.User.UID, + Gid: spec.Process.User.GID, + }) + if err != nil { + return nil, fmt.Errorf("failed to do idmap to get host ID: %w", err) + } + + return []containerd.NewTaskOpts{ + containerd.WithUIDOwner(hostID.Uid), + containerd.WithGIDOwner(hostID.Gid), + }, nil +} diff --git a/internal/cri/server/container_start_other.go b/internal/cri/server/container_start_other.go new file mode 100644 index 000000000..50f79f408 --- /dev/null +++ b/internal/cri/server/container_start_other.go @@ -0,0 +1,31 @@ +//go:build !linux + +/* + Copyright The containerd Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package server + +import ( + "context" + + containerd "github.com/containerd/containerd/v2/client" + runtime "k8s.io/cri-api/pkg/apis/runtime/v1" +) + +// updateContainerIOOwner updates I/O files' owner to align with initial processe's UID/GID. +func updateContainerIOOwner(ctx context.Context, cntr containerd.Container, config *runtime.ContainerConfig) ([]containerd.NewTaskOpts, error) { + return nil, nil +}