containerd/internal/cri/server/podsandbox/helpers_linux.go
Wei Fu ee0ed75d64 internal/cri: simplify netns setup with pinned userns
Motivation:

For pod-level user namespaces, it's impossible to force the container runtime
to join an existing network namespace after creating a new user namespace.

According to the capabilities section in [user_namespaces(7)][1], a network
namespace created by containerd is owned by the root user namespace. When the
container runtime (like runc or crun) creates a new user namespace, it becomes
a child of the root user namespace. Processes within this child user namespace
are not permitted to access resources owned by the parent user namespace.

If the network namespace is not owned by the new user namespace, the container
runtime will fail to mount /sys due to the [sysfs: Restrict mounting sysfs][2]
patch.

Referencing the [cap_capable][3] function in Linux, a process can access a
resource if:

* The resource is owned by the process's user namespace, and the process has
the required capability.

* The resource is owned by a child of the process's user namespace, and that
child user namespace was created by the process's effective UID.

In the context of pod-level user namespaces, the CRI plugin delegates the
creation of the network namespace to the container runtime when running the
pause container. After the pause container is initialized, the CRI plugin pins
the pause container's network namespace into `/run/netns` and then executes
the `CNI_ADD` command over it.

However, if the pause container is terminated while the pinning is in progress,
its PID may be recycled, and the CRI plugin can end up running `CNI_ADD`
against the wrong network namespace.

Moreover, rolling back the `RunPodSandbox` API is complex due to the delegation
of network namespace creation. As highlighted in issue #10363, the CRI plugin
can lose IP information after a containerd restart, making it challenging to
maintain robustness in the `RunPodSandbox` API.

Solution:

Allow containerd to create a new user namespace and then create the network
namespace within that user namespace. This way, the CRI plugin can force the
container runtime to join both the user namespace and the network namespace.
Since the network namespace is owned by the newly created user namespace,
the container runtime will have the necessary permissions to mount `/sys` on
the container's root filesystem. As a result, delegation of network namespace
creation is no longer needed.
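
A rough sketch of that flow is shown below. It is illustrative only, not the
containerd implementation: a short-lived helper process (plain `sleep` here) is
started inside fresh user and network namespaces, and the helper's netns is then
bind-mounted so it outlives the helper; the ID-mapping values and the pin path
are arbitrary assumptions.

```go
// Illustrative sketch, not containerd code: create a network namespace that is
// owned by a newly created user namespace, and pin it to pinPath so a runtime
// can later join both namespaces. Requires privileges to write the ID mappings.
package main

import (
	"fmt"
	"os"
	"os/exec"
	"syscall"

	"golang.org/x/sys/unix"
)

func pinNetNSInNewUserns(pinPath string) error {
	// The helper is cloned into a new user namespace and a new network
	// namespace, so the netns is owned by that user namespace.
	cmd := exec.Command("sleep", "infinity")
	cmd.SysProcAttr = &syscall.SysProcAttr{
		Cloneflags:  syscall.CLONE_NEWUSER | syscall.CLONE_NEWNET,
		UidMappings: []syscall.SysProcIDMap{{ContainerID: 0, HostID: 65536, Size: 65536}}, // assumed mapping
		GidMappings: []syscall.SysProcIDMap{{ContainerID: 0, HostID: 65536, Size: 65536}}, // assumed mapping
	}
	if err := cmd.Start(); err != nil {
		return err
	}
	defer func() {
		_ = cmd.Process.Kill()
		_ = cmd.Wait()
	}()

	// Bind-mount the helper's netns so the namespace stays alive after the
	// helper exits; CNI_ADD and the runtime can then operate on pinPath.
	f, err := os.OpenFile(pinPath, os.O_RDONLY|os.O_CREATE|os.O_EXCL, 0666)
	if err != nil {
		return err
	}
	f.Close()
	src := fmt.Sprintf("/proc/%d/ns/net", cmd.Process.Pid)
	return unix.Mount(src, pinPath, "none", unix.MS_BIND, "")
}
```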

NOTE:

* The CRI plugin does not need to pin the newly created user namespace as it
does with the network namespace, because the kernel allows retrieving a user
namespace reference via [ioctl_ns(2)][4]. As a result, the podsandbox
implementation can obtain the user namespace using the `netnsPath` parameter.
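
As a sketch of what that looks like in practice (the helper name
`getOwningUserns` and the error wrapping are illustrative, not containerd API):

```go
// Illustrative sketch: recover the owning user namespace from a pinned netns
// path via the NS_GET_USERNS ioctl documented in ioctl_ns(2).
package main

import (
	"fmt"
	"os"

	"golang.org/x/sys/unix"
)

func getOwningUserns(netnsPath string) (*os.File, error) {
	netns, err := os.Open(netnsPath)
	if err != nil {
		return nil, err
	}
	defer netns.Close()

	// The kernel hands back a new fd referring to the user namespace that
	// owns the namespace behind netns.Fd().
	usernsFd, err := unix.IoctlRetInt(int(netns.Fd()), unix.NS_GET_USERNS)
	if err != nil {
		return nil, fmt.Errorf("NS_GET_USERNS on %s: %w", netnsPath, err)
	}
	return os.NewFile(uintptr(usernsFd), "userns"), nil
}
```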

[1]: <https://man7.org/linux/man-pages/man7/user_namespaces.7.html>
[2]: <7dc5dbc879>
[3]: <2c85ebc57b/security/commoncap.c (L65)>
[4]: <https://man7.org/linux/man-pages/man2/ioctl_ns.2.html>

Signed-off-by: Wei Fu <fuweid89@gmail.com>
2024-09-11 07:21:43 +08:00


/*
Copyright The containerd Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package podsandbox

import (
"context"
"fmt"
"os"
"path"
"path/filepath"
"regexp"
"sort"
"strings"
"syscall"
"time"
"github.com/containerd/log"
"github.com/moby/sys/mountinfo"
runtimespec "github.com/opencontainers/runtime-spec/specs-go"
"github.com/opencontainers/selinux/go-selinux/label"
"golang.org/x/sys/unix"
runtime "k8s.io/cri-api/pkg/apis/runtime/v1"
containerd "github.com/containerd/containerd/v2/client"
"github.com/containerd/containerd/v2/core/mount"
"github.com/containerd/containerd/v2/core/snapshots"
"github.com/containerd/containerd/v2/internal/cri/seutil"
"github.com/containerd/containerd/v2/pkg/seccomp"
"github.com/containerd/containerd/v2/pkg/sys"
)
const (
// defaultSandboxOOMAdj is the default OOM adjustment for the sandbox container. (kubernetes#47938).
defaultSandboxOOMAdj = -998
// defaultShmSize is the default size of the sandbox shm.
defaultShmSize = int64(1024 * 1024 * 64)
// relativeRootfsPath is the rootfs path relative to bundle path.
relativeRootfsPath = "rootfs"
// devShm is the default path of /dev/shm.
devShm = "/dev/shm"
// etcHosts is the default path of /etc/hosts file.
etcHosts = "/etc/hosts"
// resolvConfPath is the abs path of resolv.conf on host or container.
resolvConfPath = "/etc/resolv.conf"
)
// getCgroupsPath generates container cgroups path.
func getCgroupsPath(cgroupsParent, id string) string {
base := path.Base(cgroupsParent)
if strings.HasSuffix(base, ".slice") {
// For a.slice/b.slice/c.slice, base is c.slice.
// runc systemd cgroup path format is "slice:prefix:name".
return strings.Join([]string{base, "cri-containerd", id}, ":")
}
return filepath.Join(cgroupsParent, id)
}
// getSandboxHostname returns the hostname file path inside the sandbox root directory.
func (c *Controller) getSandboxHostname(id string) string {
return filepath.Join(c.getSandboxRootDir(id), "hostname")
}
// getSandboxHosts returns the hosts file path inside the sandbox root directory.
func (c *Controller) getSandboxHosts(id string) string {
return filepath.Join(c.getSandboxRootDir(id), "hosts")
}
// getResolvPath returns resolv.conf filepath for specified sandbox.
func (c *Controller) getResolvPath(id string) string {
return filepath.Join(c.getSandboxRootDir(id), "resolv.conf")
}
// getSandboxDevShm returns the shm file path inside the sandbox root directory.
func (c *Controller) getSandboxDevShm(id string) string {
return filepath.Join(c.getVolatileSandboxRootDir(id), "shm")
}
// getSandboxPinnedNamespaces returns the pinned namespaces directory inside the
// sandbox state directory.
func (c *Controller) getSandboxPinnedNamespaces(id string) string {
return filepath.Join(c.getVolatileSandboxRootDir(id), "pinned-namespaces")
}
// getSandboxPinnedUserNamespace returns the pinned user namespace file.
func (c *Controller) getSandboxPinnedUserNamespace(id string) string {
return filepath.Join(c.getSandboxPinnedNamespaces(id), "user")
}
// pinUserNamespace persists user namespace in namespace filesystem.
func (c *Controller) pinUserNamespace(sandboxID string, netnsPath string) error {
nsPath := c.getSandboxPinnedUserNamespace(sandboxID)
baseDir := filepath.Dir(nsPath)
if err := os.MkdirAll(baseDir, 0755); err != nil {
return fmt.Errorf("failed to init pinned-namespaces directory %s: %w", baseDir, err)
}
emptyFd, err := os.OpenFile(nsPath, os.O_RDWR|os.O_CREATE|os.O_EXCL, 0666)
if err != nil {
return fmt.Errorf("failed to create empty file %s: %w", nsPath, err)
}
emptyFd.Close()
netnsFd, err := os.Open(netnsPath)
if err != nil {
return fmt.Errorf("failed to open netns(%s): %w", netnsPath, err)
}
defer netnsFd.Close()
usernsFd, err := sys.GetUsernsForNamespace(netnsFd.Fd())
if err != nil {
return fmt.Errorf("failed to get user namespace for netns(%s): %w", netnsPath, err)
}
defer usernsFd.Close()
if err = unix.Mount(usernsFd.Name(), nsPath, "none", unix.MS_BIND, ""); err != nil {
return fmt.Errorf("failed to bind mount ns src: %v at %s: %w", usernsFd.Name(), nsPath, err)
}
return nil
}
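
// toLabel converts CRI SELinux options into the label option strings
// ("user:", "role:", "type:", "level:") understood by the selinux label package.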
func toLabel(selinuxOptions *runtime.SELinuxOption) ([]string, error) {
var labels []string
if selinuxOptions == nil {
return nil, nil
}
if err := checkSelinuxLevel(selinuxOptions.Level); err != nil {
return nil, err
}
if selinuxOptions.User != "" {
labels = append(labels, "user:"+selinuxOptions.User)
}
if selinuxOptions.Role != "" {
labels = append(labels, "role:"+selinuxOptions.Role)
}
if selinuxOptions.Type != "" {
labels = append(labels, "type:"+selinuxOptions.Type)
}
if selinuxOptions.Level != "" {
labels = append(labels, "level:"+selinuxOptions.Level)
}
return labels, nil
}
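
// initLabelsFromOpt returns the process and mount labels initialized from the
// given SELinux options.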
func initLabelsFromOpt(selinuxOpts *runtime.SELinuxOption) (string, string, error) {
labels, err := toLabel(selinuxOpts)
if err != nil {
return "", "", err
}
return label.InitLabels(labels)
}
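
// checkSelinuxLevel validates that the SELinux level, if set, matches the
// expected sensitivity/category format (e.g. "s0:c1,c2").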
func checkSelinuxLevel(level string) error {
if len(level) == 0 {
return nil
}
matched, err := regexp.MatchString(`^s\d(-s\d)??(:c\d{1,4}(\.c\d{1,4})?(,c\d{1,4}(\.c\d{1,4})?)*)?$`, level)
if err != nil {
return fmt.Errorf("the format of 'level' %q is not correct: %w", level, err)
}
if !matched {
return fmt.Errorf("the format of 'level' %q is not correct", level)
}
return nil
}
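
// seccompEnabled returns whether seccomp support is available on this host.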
func (c *Controller) seccompEnabled() bool {
return seccomp.IsEnabled()
}
// unmountRecursive unmounts the target and all mounts underneath, starting with
// the deepest mount first.
func unmountRecursive(ctx context.Context, target string) error {
target, err := mount.CanonicalizePath(target)
if err != nil {
return err
}
toUnmount, err := mountinfo.GetMounts(mountinfo.PrefixFilter(target))
if err != nil {
return err
}
// Make the deepest mount be first
sort.Slice(toUnmount, func(i, j int) bool {
return len(toUnmount[i].Mountpoint) > len(toUnmount[j].Mountpoint)
})
for i, m := range toUnmount {
if err := mount.UnmountAll(m.Mountpoint, unix.MNT_DETACH); err != nil {
if i == len(toUnmount)-1 { // last mount
return err
}
// This is some submount; we can ignore this error for now. The final unmount will fail if this is a real problem.
log.G(ctx).WithError(err).Debugf("failed to unmount submount %s", m.Mountpoint)
}
}
return nil
}
// ensureRemoveAll wraps `os.RemoveAll` to check for specific errors that can
// often be remedied.
// Only use `ensureRemoveAll` if you really want to make every effort to remove
// a directory.
//
// Because of the way `os.Remove` (and by extension `os.RemoveAll`) works, there
// can be a race between reading directory entries and then actually attempting
// to remove everything in the directory.
// These types of errors do not need to be returned; since it's ok for the dir to
// be gone, we can just retry the remove operation.
//
// This should not return an `os.ErrNotExist` kind of error under any circumstances.
func ensureRemoveAll(ctx context.Context, dir string) error {
notExistErr := make(map[string]bool)
// track retries
exitOnErr := make(map[string]int)
maxRetry := 50
// Attempt to unmount anything beneath this dir first.
if err := unmountRecursive(ctx, dir); err != nil {
log.G(ctx).WithError(err).Debugf("failed to do initial unmount of %s", dir)
}
for {
err := os.RemoveAll(dir)
if err == nil {
return nil
}
pe, ok := err.(*os.PathError)
if !ok {
return err
}
if os.IsNotExist(err) {
if notExistErr[pe.Path] {
return err
}
notExistErr[pe.Path] = true
// There is a race where some subdir can be removed but after the
// parent dir entries have been read.
// So the path could be from `os.Remove(subdir)`
// If the reported non-existent path is not the passed in `dir` we
// should just retry, but otherwise return with no error.
if pe.Path == dir {
return nil
}
continue
}
if pe.Err != syscall.EBUSY {
return err
}
if e := mount.Unmount(pe.Path, unix.MNT_DETACH); e != nil {
return fmt.Errorf("error while removing %s: %w", dir, e)
}
if exitOnErr[pe.Path] == maxRetry {
return err
}
exitOnErr[pe.Path]++
time.Sleep(100 * time.Millisecond)
}
}
var vmbasedRuntimes = []string{
"io.containerd.kata",
}
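
// isVMBasedRuntime reports whether the runtime type refers to a VM-based
// runtime such as Kata Containers.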
func isVMBasedRuntime(runtimeType string) bool {
for _, rt := range vmbasedRuntimes {
if strings.Contains(runtimeType, rt) {
return true
}
}
return false
}
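
// modifyProcessLabel switches the process SELinux label to its KVM variant for
// VM-based runtimes, and is a no-op otherwise.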
func modifyProcessLabel(runtimeType string, spec *runtimespec.Spec) error {
if !isVMBasedRuntime(runtimeType) {
return nil
}
l, err := seutil.ChangeToKVM(spec.Process.SelinuxLabel)
if err != nil {
return fmt.Errorf("failed to get selinux kvm label: %w", err)
}
spec.Process.SelinuxLabel = l
return nil
}
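
// parseUsernsIDMap converts CRI ID mappings into runtime-spec ID mappings.
// Only a single mapping entry is accepted.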
func parseUsernsIDMap(runtimeIDMap []*runtime.IDMapping) ([]runtimespec.LinuxIDMapping, error) {
var m []runtimespec.LinuxIDMapping
if len(runtimeIDMap) == 0 {
return m, nil
}
if len(runtimeIDMap) > 1 {
// We only accept 1 line, because containerd.WithRemappedSnapshot() only supports that.
return m, fmt.Errorf("only one mapping line supported, got %v mapping lines", len(runtimeIDMap))
}
// We know len is 1 now.
if runtimeIDMap[0] == nil {
return m, nil
}
uidMap := *runtimeIDMap[0]
if uidMap.Length < 1 {
return m, fmt.Errorf("invalid mapping length: %v", uidMap.Length)
}
m = []runtimespec.LinuxIDMapping{
{
ContainerID: uidMap.ContainerId,
HostID: uidMap.HostId,
Size: uidMap.Length,
},
}
return m, nil
}
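
// parseUsernsIDs validates the CRI user namespace option and returns its UID
// and GID mappings.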
func parseUsernsIDs(userns *runtime.UserNamespace) (uids, gids []runtimespec.LinuxIDMapping, retErr error) {
if userns == nil {
// If userns is not set, the kubelet doesn't support this option
// and we should just fallback to no userns. This is completely
// valid.
return nil, nil, nil
}
uids, err := parseUsernsIDMap(userns.GetUids())
if err != nil {
return nil, nil, fmt.Errorf("UID mapping: %w", err)
}
gids, err = parseUsernsIDMap(userns.GetGids())
if err != nil {
return nil, nil, fmt.Errorf("GID mapping: %w", err)
}
switch mode := userns.GetMode(); mode {
case runtime.NamespaceMode_NODE:
if len(uids) != 0 || len(gids) != 0 {
return nil, nil, fmt.Errorf("can't use user namespace mode %q with mappings. Got %v UID mappings and %v GID mappings", mode, len(uids), len(gids))
}
case runtime.NamespaceMode_POD:
// This is valid, we will handle it in WithPodNamespaces().
if len(uids) == 0 || len(gids) == 0 {
return nil, nil, fmt.Errorf("can't use user namespace mode %q without UID and GID mappings", mode)
}
default:
return nil, nil, fmt.Errorf("unsupported user namespace mode: %q", mode)
}
return uids, gids, nil
}
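
// snapshotterRemapOpts returns snapshotter options that remap the snapshot's
// ownership when a pod-level user namespace is in use.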
func snapshotterRemapOpts(nsOpts *runtime.NamespaceOption) ([]snapshots.Opt, error) {
snapshotOpt := []snapshots.Opt{}
usernsOpts := nsOpts.GetUsernsOptions()
if usernsOpts == nil {
return snapshotOpt, nil
}
uids, gids, err := parseUsernsIDs(usernsOpts)
if err != nil {
return nil, fmt.Errorf("user namespace configuration: %w", err)
}
if usernsOpts.GetMode() == runtime.NamespaceMode_POD {
snapshotOpt = append(snapshotOpt, containerd.WithRemapperLabels(0, uids[0].HostID, 0, gids[0].HostID, uids[0].Size))
}
return snapshotOpt, nil
}