From ec9e0dca915a82c00d71b17500eda937e1790b66 Mon Sep 17 00:00:00 2001 From: Rodrigo Campos Date: Tue, 19 Sep 2023 13:08:05 +0200 Subject: [PATCH 1/6] overlay: Require opt-in if idmap mounts are not supported. If we don't use idmap mounts, doing a chown per pod is very expensive: it implies duplicating the container storage for the image for every pod and the latency to start a new pod is affected too. Let's make sure users are aware of this by requiring them to opt in, for snapshotters where we have a better solution (like overlayfs, which supports idmap mounts). Signed-off-by: Rodrigo Campos --- snapshots/overlay/overlay.go | 8 ++++++++ snapshots/overlay/plugin/plugin.go | 16 +++++++++++++++- snapshotter_opts_unix.go | 14 +++++++++++++- 3 files changed, 36 insertions(+), 2 deletions(-) diff --git a/snapshots/overlay/overlay.go b/snapshots/overlay/overlay.go index e566aa5c6..8e4428a26 100644 --- a/snapshots/overlay/overlay.go +++ b/snapshots/overlay/overlay.go @@ -46,6 +46,7 @@ type SnapshotterConfig struct { ms MetaStore mountOptions []string remapIds bool + slowChown bool } // Opt is an option to configure the overlay snapshotter @@ -98,6 +99,11 @@ func WithRemapIds(config *SnapshotterConfig) error { return nil } +func WithSlowChown(config *SnapshotterConfig) error { + config.slowChown = true + return nil +} + type snapshotter struct { root string ms MetaStore @@ -105,6 +111,7 @@ type snapshotter struct { upperdirLabel bool options []string remapIds bool + slowChown bool } // NewSnapshotter returns a Snapshotter which uses overlayfs. 
The overlayfs @@ -161,6 +168,7 @@ func NewSnapshotter(root string, opts ...Opt) (snapshots.Snapshotter, error) { upperdirLabel: config.upperdirLabel, options: config.mountOptions, remapIds: config.remapIds, + slowChown: config.slowChown, }, nil } diff --git a/snapshots/overlay/plugin/plugin.go b/snapshots/overlay/plugin/plugin.go index 645bc8c95..89adeebb2 100644 --- a/snapshots/overlay/plugin/plugin.go +++ b/snapshots/overlay/plugin/plugin.go @@ -28,7 +28,8 @@ import ( ) const ( - capaRemapIds = "remap-ids" + capaRemapIds = "remap-ids" + capaOnlyRemapIds = "only-remap-ids" ) // Config represents configuration for the overlay plugin. @@ -38,6 +39,11 @@ type Config struct { UpperdirLabel bool `toml:"upperdir_label"` SyncRemove bool `toml:"sync_remove"` + // SlowChown allows the plugin to fall back to a recursive chown if fast options (like + // idmap mounts) are not available. See more info about the overhead this can have in + // github.com/containerd/containerd/docs/user-namespaces/. + SlowChown bool `toml:"slow_chown"` + // MountOptions are options used for the overlay mount (not used on bind mounts) MountOptions []string `toml:"mount_options"` } @@ -76,6 +82,14 @@ func init() { ic.Meta.Capabilities = append(ic.Meta.Capabilities, capaRemapIds) } + if config.SlowChown { + oOpts = append(oOpts, overlay.WithSlowChown) + } else { + // If slowChown is false, we use capaOnlyRemapIds to signal we only + // allow idmap mounts. + ic.Meta.Capabilities = append(ic.Meta.Capabilities, capaOnlyRemapIds) + } + ic.Meta.Exports["root"] = root return overlay.NewSnapshotter(root, oOpts...) 
}, diff --git a/snapshotter_opts_unix.go b/snapshotter_opts_unix.go index 2dff9b424..d0f208d90 100644 --- a/snapshotter_opts_unix.go +++ b/snapshotter_opts_unix.go @@ -26,7 +26,8 @@ import ( ) const ( - capabRemapIDs = "remap-ids" + capabRemapIDs = "remap-ids" + capaOnlyRemapIds = "only-remap-ids" ) // WithRemapperLabels creates the labels used by any supporting snapshotter @@ -72,6 +73,17 @@ func resolveSnapshotOptions(ctx context.Context, client *Client, snapshotterName return parent, nil } + capaOnlyRemap := false + for _, capa := range capabs { + if capa == capaOnlyRemapIds { + capaOnlyRemap = true + } + } + + if capaOnlyRemap { + return "", fmt.Errorf("snapshotter %q doesn't support idmap mounts on this host, configure `slow_chown` to allow a slower and expensive fallback", snapshotterName) + } + var ctrUID, hostUID, length uint32 _, err = fmt.Sscanf(uidMap, "%d:%d:%d", &ctrUID, &hostUID, &length) if err != nil { From 8bf8e2b975e6e280b5159a121379609e66c346f0 Mon Sep 17 00:00:00 2001 From: Rodrigo Campos Date: Tue, 19 Sep 2023 14:27:04 +0200 Subject: [PATCH 2/6] snapshotter: Use capa prefix consistently for capabilities The overlay snapshotter is using capa, not capab, let's use that in all the places. Signed-off-by: Rodrigo Campos --- snapshotter_opts_unix.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/snapshotter_opts_unix.go b/snapshotter_opts_unix.go index d0f208d90..ac970219e 100644 --- a/snapshotter_opts_unix.go +++ b/snapshotter_opts_unix.go @@ -26,7 +26,7 @@ import ( ) const ( - capabRemapIDs = "remap-ids" + capaRemapIDs = "remap-ids" capaOnlyRemapIds = "only-remap-ids" ) @@ -46,7 +46,7 @@ func resolveSnapshotOptions(ctx context.Context, client *Client, snapshotterName } for _, capab := range capabs { - if capab == capabRemapIDs { + if capab == capaRemapIDs { // Snapshotter supports ID remapping, we don't need to do anything. 
return parent, nil } From e3790820006aaff3ba7acd099fd8e8b6e75ec0de Mon Sep 17 00:00:00 2001 From: Rodrigo Campos Date: Tue, 19 Sep 2023 14:42:20 +0200 Subject: [PATCH 3/6] docs/userns: Document the need to opt-in for a slow chown Signed-off-by: Rodrigo Campos --- docs/user-namespaces/README.md | 34 +++++++++++++++++++++++++++++++--- 1 file changed, 31 insertions(+), 3 deletions(-) diff --git a/docs/user-namespaces/README.md b/docs/user-namespaces/README.md index 92e83c454..55e205755 100644 --- a/docs/user-namespaces/README.md +++ b/docs/user-namespaces/README.md @@ -89,7 +89,7 @@ user namespaces enabled. This will affect all the snapshots that you take manual do that). In that case, make sure to use the same value of `/sys/module/overlay/parameters/metacopy` when creating and restoring the snapshot. -### containerd 2.0 +### containerd 2.0 and above The storage and latency limitation from containerd 1.7 are not present in container 2.0 and above, if you use the overlay snapshotter (this is used by default). It will not use more storage at all, @@ -99,8 +99,36 @@ This is achieved by using the kernel feature idmap mounts with the container roo image). This allows an overlay file-system to expose the image with different UID/GID without copying the files nor the inodes, just using a bind-mount. -You can check if you are using idmap mounts for the container image if you create a pod with user -namespaces, exec into it and run: +Containerd by default will refuse to create a container with user namespaces, if overlayfs is the +snapshotter and the kernel running doesn't support idmap mounts for overlayfs. This is to make sure +before falling back to the expensive chown (in terms of storage and pod startup latency), you +understand the implications and decide to opt-in. Please read the containerd 1.7 limitations for an +explanation of those. 
+ +If your kernel doesn't support idmap mounts for the overlayfs snapshotter, you will see an error +like: + +``` +failed to create containerd container: snapshotter "overlayfs" doesn't support idmap mounts on this host, configure `slow_chown` to allow a slower and expensive fallback +``` + +Linux supports idmap mounts on an overlayfs since version 5.19. + +You can opt-in for the slow chown by adding the `slow_chown` field to your config in the overlayfs +snapshotter section, like this: + +``` + [plugins."io.containerd.snapshotter.v1.overlayfs"] + slow_chown = true +``` + +Note that only overlayfs users need to opt-in for the slow chown, as it is the only one for which +containerd provides a better option (only the overlayfs snapshotter supports idmap mounts in +containerd). If you use another snapshotter, you will fall back to the expensive chown without the +need to opt-in. + +That being said, you can double check if your container is using idmap mounts for the container +image if you create a pod with user namespaces, exec into it and run: ``` mount | grep overlay From d008d64a8f431697f380eebf24890218fab6cd08 Mon Sep 17 00:00:00 2001 From: Rodrigo Campos Date: Tue, 19 Sep 2023 15:06:19 +0200 Subject: [PATCH 4/6] docs/userns: Clarify containerd 1.7 limitations Signed-off-by: Rodrigo Campos --- docs/user-namespaces/README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/user-namespaces/README.md b/docs/user-namespaces/README.md index 55e205755..70d45363d 100644 --- a/docs/user-namespaces/README.md +++ b/docs/user-namespaces/README.md @@ -74,9 +74,9 @@ Different containerd versions have different limitations too, those are highligh ### containerd 1.7 One limitation present in containerd 1.7 is that it needs to change the ownership of every file and -directory inside the container image, during Pod startup. 
This means it has a storage overhead (the -size of the container image is duplicated each time a pod is created) and can significantly impact -the container startup latency. +directory inside the container image, during Pod startup. This means it has a storage overhead, as +**the size of the container image is duplicated each time a pod is created**, and can significantly +impact the container startup latency, as doing such a copy takes time too. You can mitigate this limitation by switching `/sys/module/overlay/parameters/metacopy` to `Y`. This will significantly reduce the storage and performance overhead, as only the inode for each file of From 46d3094aa34bb5f1c455be3bc77d7522bc3c27ef Mon Sep 17 00:00:00 2001 From: Rodrigo Campos Date: Tue, 19 Sep 2023 15:08:51 +0200 Subject: [PATCH 5/6] docs/userns: Fix small typo Signed-off-by: Rodrigo Campos --- docs/user-namespaces/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/user-namespaces/README.md b/docs/user-namespaces/README.md index 70d45363d..6405de3c5 100644 --- a/docs/user-namespaces/README.md +++ b/docs/user-namespaces/README.md @@ -83,7 +83,7 @@ will significantly reduce the storage and performance overhead, as only the inod the container image will be duplicated, but not the content of the file. This means it will use less storage and it will be faster. However, it is not a panacea. -If you change the metacopy param, make sure to do it in a way that is persistant across reboots. You +If you change the metacopy param, make sure to do it in a way that is persistent across reboots. You should also be aware that this setting will be used for all containers, not just containers with user namespaces enabled. This will affect all the snapshots that you take manually (if you happen to do that). 
In that case, make sure to use the same value of `/sys/module/overlay/parameters/metacopy` From 8e3722c7d100f3c2a75fdb7ff303a4eaef37f6da Mon Sep 17 00:00:00 2001 From: Rodrigo Campos Date: Tue, 19 Sep 2023 15:24:49 +0200 Subject: [PATCH 6/6] CI: Set slow_chown for overlayfs snapshotter Userns requires either idmap mounts or opting in to a slow and expensive chown. As idmap mounts support for overlayfs was merged in 5.19, let's add the slow_chown config for our CI. The config is harmless to keep on newer kernels: if idmap mounts are supported, they will simply be used. Once all our CI runs on kernels >= 5.19, we can remove this setting. Signed-off-by: Rodrigo Campos --- script/test/utils.sh | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/script/test/utils.sh b/script/test/utils.sh index b49d3ac0e..a516e7031 100755 --- a/script/test/utils.sh +++ b/script/test/utils.sh @@ -52,6 +52,13 @@ version=2 [plugins."io.containerd.grpc.v1.cri"] drain_exec_sync_io_timeout = "10s" + +# Userns requires idmap mount support for overlayfs (added in 5.19) +# Let's opt-in for a recursive chown, so we can always test this even in old distros. +# Note that if idmap mounts support is present, we will use that, so it is harmless to keep this +# here. +[plugins."io.containerd.snapshotter.v1.overlayfs"] + slow_chown = true EOF if command -v sestatus >/dev/null 2>&1; then