diff --git a/RELEASES.md b/RELEASES.md index 086ba6f1f..8c8964a9c 100644 --- a/RELEASES.md +++ b/RELEASES.md @@ -461,4 +461,4 @@ more quickly. | [NRI in CRI Support](https://github.com/containerd/containerd/pull/6019) | containerd v1.7 | containerd v2.0 | | [gRPC Shim](https://github.com/containerd/containerd/pull/8052) | containerd v1.7 | containerd v2.0 | | [CRI Runtime Specific Snapshotter](https://github.com/containerd/containerd/pull/6899) | containerd v1.7 | containerd v2.0 | -| [CRI Support for User Namespaces](https://github.com/containerd/containerd/pull/7679) | containerd v1.7 | containerd v2.0 | +| [CRI Support for User Namespaces](./docs/user-namespaces/README.md) | containerd v1.7 | containerd v2.0 | diff --git a/docs/user-namespaces/README.md b/docs/user-namespaces/README.md new file mode 100644 index 000000000..92e83c454 --- /dev/null +++ b/docs/user-namespaces/README.md @@ -0,0 +1,146 @@ +# Support for user namespaces + +Kubernetes supports running pods with user namespaces since v1.25. This document explains the +containerd support for this feature. + +## What are user namespaces? + +A user namespace isolates the user running inside the container from the one in the host. + +A process running as root in a container can run as a different (non-root) user in the host; in +other words, the process has full privileges for operations inside the user namespace, but is +unprivileged for operations outside the namespace. + +You can use this feature to reduce the damage a compromised container can do to the host or other +pods in the same node. There are several security vulnerabilities rated either HIGH or CRITICAL that +were not exploitable when user namespaces are active. It is expected that user namespaces will mitigate +some future vulnerabilities too. + +See [the Kubernetes documentation][kube-intro] for a high-level introduction to +user namespaces. 
+ +[kube-intro]: https://kubernetes.io/docs/concepts/workloads/pods/user-namespaces/#introduction + +## Stack requirements + +The Kubernetes implementation was redesigned in 1.27, so the requirements are different for versions +pre and post Kubernetes 1.27. + +Please note that if you try to use user namespaces with containerd 1.6 or older, the `hostUsers: +false` setting in your pod.spec will be **silently ignored**. + +### Kubernetes 1.25 and 1.26 + + * Containerd 1.7 or greater + * runc 1.1 or greater + +### Kubernetes 1.27 and greater + + * Linux 6.3 or greater + * Containerd 2.0 or greater + * You can use runc or crun as the OCI runtime: + * runc 1.2 or greater + * crun 1.9 or greater + +Furthermore, all the file-systems used by the volumes in the pod need kernel support for idmap +mounts. Some popular file-systems that support idmap mounts in Linux 6.3 are: `btrfs`, `ext4`, `xfs`, +`fat`, `tmpfs`, `overlayfs`. + +The kubelet is in charge of populating some files to the containers (like configmaps, secrets, etc.). +The file-system used in that path needs to support idmap mounts too. See [the Kubernetes +documentation][kube-req] for more info on that. + + +[kube-req]: https://kubernetes.io/docs/concepts/workloads/pods/user-namespaces/#before-you-begin + +## Creating a Kubernetes pod with user namespaces + +First check your containerd, Linux and Kubernetes versions. If those are okay, then there is no +special configuration needed on containerd. You can just follow the steps in the [Kubernetes +website][kube-example]. + +[kube-example]: https://kubernetes.io/docs/tasks/configure-pod-container/user-namespaces/ + +## Limitations + +You can check the limitations Kubernetes has [here][kube-limitations]. Note that different +Kubernetes versions have different limitations, so be sure to check the site for the Kubernetes version +you are using. + +Different containerd versions have different limitations too; those are highlighted in this section. 
+ +[kube-limitations]: https://kubernetes.io/docs/concepts/workloads/pods/user-namespaces/#limitations + +### containerd 1.7 + +One limitation present in containerd 1.7 is that it needs to change the ownership of every file and +directory inside the container image during Pod startup. This means it has a storage overhead (the +size of the container image is duplicated each time a pod is created) and can significantly impact +the container startup latency. + +You can mitigate this limitation by switching `/sys/module/overlay/parameters/metacopy` to `Y`. This +will significantly reduce the storage and performance overhead, as only the inode for each file of +the container image will be duplicated, but not the content of the file. This means it will use less +storage and it will be faster. However, it is not a panacea. + +If you change the metacopy param, make sure to do it in a way that is persistent across reboots. You +should also be aware that this setting will be used for all containers, not just containers with +user namespaces enabled. This will affect all the snapshots that you take manually (if you happen to +do that). In that case, make sure to use the same value of `/sys/module/overlay/parameters/metacopy` +when creating and restoring the snapshot. + +### containerd 2.0 + +The storage and latency limitations from containerd 1.7 are not present in containerd 2.0 and above, +if you use the overlay snapshotter (this is used by default). It will not use more storage at all, +and there is no additional startup latency. + +This is achieved by using the kernel feature idmap mounts with the container rootfs (the container +image). This allows an overlay file-system to expose the image with different UID/GID without copying +the files or the inodes, just using a bind-mount. 
+ +You can check if you are using idmap mounts for the container image if you create a pod with user +namespaces, exec into it and run: + +``` +mount | grep overlay +``` + +You should see a reference to the idmap mount in the `lowerdir` parameter; in this case we can see +`idmapped` used there: + +``` +overlay on / type overlay (rw,relatime,lowerdir=/tmp/ovl-idmapped823885363/0,upperdir=/var/lib/containerd/io.containerd.snapshotter.v1.overlayfs/snapshots/1018/fs,workdir=/var/lib/containerd/io.containerd.snapshotter.v1.overlayfs/snapshots/1018/work) +``` + +## Creating a container with user namespaces with `ctr` + +You can also create a container with user namespaces using `ctr`. This is more low-level, be warned. + +Create an OCI bundle as explained [here][runc-bundle]. Then, change the UID/GID to 65536: + +``` +sudo chown -R 65536:65536 rootfs/ +``` + +Copy [this config.json](./config.json) and replace `XXX-path-to-rootfs` with the +absolute path to the rootfs you just chowned. + +Then create and start the container with: + +``` +sudo ctr create --config /path/to/config.json userns-test +sudo ctr t start userns-test +``` + +This will open a shell inside the container. You can run the following to verify you are inside a user +namespace: + +``` +root@runc:/# cat /proc/self/uid_map + 0 65536 65536 +``` + +The output should be exactly the same. 
+ +[runc-bundle]: https://github.com/opencontainers/runc#creating-an-oci-bundle diff --git a/docs/user-namespaces/config.json b/docs/user-namespaces/config.json new file mode 100644 index 000000000..246876388 --- /dev/null +++ b/docs/user-namespaces/config.json @@ -0,0 +1,199 @@ +{ + "ociVersion": "1.0.2-dev", + "process": { + "terminal": true, + "user": { + "uid": 0, + "gid": 0 + }, + "args": [ + "bash" + ], + "env": [ + "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", + "TERM=xterm" + ], + "cwd": "/", + "capabilities": { + "bounding": [ + "CAP_AUDIT_WRITE", + "CAP_KILL", + "CAP_NET_BIND_SERVICE" + ], + "effective": [ + "CAP_AUDIT_WRITE", + "CAP_KILL", + "CAP_NET_BIND_SERVICE" + ], + "inheritable": [ + "CAP_AUDIT_WRITE", + "CAP_KILL", + "CAP_NET_BIND_SERVICE" + ], + "permitted": [ + "CAP_AUDIT_WRITE", + "CAP_KILL", + "CAP_NET_BIND_SERVICE" + ], + "ambient": [ + "CAP_AUDIT_WRITE", + "CAP_KILL", + "CAP_NET_BIND_SERVICE" + ] + }, + "rlimits": [ + { + "type": "RLIMIT_NOFILE", + "hard": 1024, + "soft": 1024 + } + ], + "noNewPrivileges": true + }, + "root": { + "path": "XXX-path-to-rootfs" + }, + "hostname": "runc", + "mounts": [ + { + "destination": "/proc", + "type": "proc", + "source": "proc" + }, + { + "destination": "/dev", + "type": "tmpfs", + "source": "tmpfs", + "options": [ + "nosuid", + "strictatime", + "mode=755", + "size=65536k" + ] + }, + { + "destination": "/dev/pts", + "type": "devpts", + "source": "devpts", + "options": [ + "nosuid", + "noexec", + "newinstance", + "ptmxmode=0666", + "mode=0620", + "gid=5" + ] + }, + { + "destination": "/dev/shm", + "type": "tmpfs", + "source": "shm", + "options": [ + "nosuid", + "noexec", + "nodev", + "mode=1777", + "size=65536k" + ] + }, + { + "destination": "/dev/mqueue", + "type": "mqueue", + "source": "mqueue", + "options": [ + "nosuid", + "noexec", + "nodev" + ] + }, + { + "destination": "/sys", + "type": "sysfs", + "source": "sysfs", + "options": [ + "nosuid", + "noexec", + "nodev", + "ro" + ] + 
}, + { + "destination": "/sys/fs/cgroup", + "type": "cgroup", + "source": "cgroup", + "options": [ + "nosuid", + "noexec", + "nodev", + "relatime", + "ro" + ] + } + ], + "linux": { + "uidMappings": [ + { + "containerID": 0, + "hostID": 65536, + "size": 65536 + } + ], + "gidMappings": [ + { + "containerID": 0, + "hostID": 65536, + "size": 65536 + } + ], + "resources": { + "devices": [ + { + "allow": false, + "access": "rwm" + } + ] + }, + "namespaces": [ + { + "type": "pid" + }, + { + "type": "network" + }, + { + "type": "ipc" + }, + { + "type": "uts" + }, + { + "type": "mount" + }, + { + "type": "cgroup" + }, + { + "type": "user" + } + ], + "maskedPaths": [ + "/proc/acpi", + "/proc/asound", + "/proc/kcore", + "/proc/keys", + "/proc/latency_stats", + "/proc/timer_list", + "/proc/timer_stats", + "/proc/sched_debug", + "/sys/firmware", + "/proc/scsi" + ], + "readonlyPaths": [ + "/proc/bus", + "/proc/fs", + "/proc/irq", + "/proc/sys", + "/proc/sysrq-trigger" + ] + } +}