pkg/cri/server: Fix net.ipv4.ping_group_range with userns

userns.RunningInUserNS() checks if the code calling that function is
running inside a user namespace. But we need to check if the container
we will create will use a user namespace, in that case we need to
disable the sysctl too (or we would need to take the userns mapping into
account to set the IDs).

This was added in PR:
        https://github.com/containerd/containerd/pull/6170/

And the param documentation says it is not enabled when user namespaces
are in use:
        https://github.com/containerd/containerd/pull/6170/files#diff-91d0a4c61f6d3523b5a19717d1b40b5fffd7e392d8fe22aed7c905fe195b8902R118

I'm not sure if the intention was to disable this if containerd is
running inside a userns (rootless, if that is even supported) or just
when the pod has user namespaces.

Out of an abundance of caution, I'm keeping the userns.RunningInUserNS()
so it is still not used if containerd runs inside a user namespace.

With this patch and "enable_unprivileged_icmp = true" in the config,
running containerd as root on the host, pods with user namespaces start
just fine. Without this patch they fail with:
        ... failed to create containerd task: failed to create shim task: OCI runtime create failed: runc create failed: unable to start container process: error during container init: w
 /proc/sys/net/ipv4/ping_group_range: invalid argument: unknown

Thanks a lot to Andy on the k8s slack for reporting the issue. He also
mentions he hits this with k3s on a default installation (the param
is off by default on containerd, but k3s turns that on by default it
seems). He also debugged which part of the stack was setting that
sysctl, found the PR that added this code in containerd and a workaround
(to turn the bool off).

Signed-off-by: Rodrigo Campos <rodrigoca@microsoft.com>
This commit is contained in:
Rodrigo Campos 2023-07-06 12:47:22 +02:00
parent d5ec7286ae
commit 9bf5aeca77
2 changed files with 6 additions and 1 deletions

View File

@ -146,6 +146,9 @@ func (c *Controller) sandboxContainerSpec(id string, config *runtime.PodSandboxC
if c.config.EnableUnprivilegedPorts && !ipUnprivilegedPortStart {
sysctls["net.ipv4.ip_unprivileged_port_start"] = "0"
}
// TODO (rata): We need to set this only if the pod will
// **not** use user namespaces either.
// This will be done when user namespaces is ported to sbserver.
if c.config.EnableUnprivilegedICMP && !pingGroupRange && !userns.RunningInUserNS() {
sysctls["net.ipv4.ping_group_range"] = "0 2147483647"
}

View File

@ -95,6 +95,7 @@ func (c *criService) sandboxContainerSpec(id string, config *runtime.PodSandboxC
usernsOpts := nsOptions.GetUsernsOptions()
uids, gids, err := parseUsernsIDs(usernsOpts)
var usernsEnabled bool
if err != nil {
return nil, fmt.Errorf("user namespace configuration: %w", err)
}
@ -105,6 +106,7 @@ func (c *criService) sandboxContainerSpec(id string, config *runtime.PodSandboxC
specOpts = append(specOpts, customopts.WithoutNamespace(runtimespec.UserNamespace))
case runtime.NamespaceMode_POD:
specOpts = append(specOpts, oci.WithUserNamespace(uids, gids))
usernsEnabled = true
default:
return nil, fmt.Errorf("unsupported user namespace mode: %q", mode)
}
@ -164,7 +166,7 @@ func (c *criService) sandboxContainerSpec(id string, config *runtime.PodSandboxC
if c.config.EnableUnprivilegedPorts && !ipUnprivilegedPortStart {
sysctls["net.ipv4.ip_unprivileged_port_start"] = "0"
}
if c.config.EnableUnprivilegedICMP && !pingGroupRange && !userns.RunningInUserNS() {
if c.config.EnableUnprivilegedICMP && !pingGroupRange && !userns.RunningInUserNS() && !usernsEnabled {
sysctls["net.ipv4.ping_group_range"] = "0 2147483647"
}
}