diff --git a/pkg/cri/sbserver/sandbox_run.go b/pkg/cri/sbserver/sandbox_run.go index 713a785be..4b3e77159 100644 --- a/pkg/cri/sbserver/sandbox_run.go +++ b/pkg/cri/sbserver/sandbox_run.go @@ -23,6 +23,7 @@ import ( "fmt" "math" "path/filepath" + goruntime "runtime" "strings" "time" @@ -143,8 +144,27 @@ func (c *criService) RunPodSandbox(ctx context.Context, r *runtime.RunPodSandbox } }() + userNsEnabled := false + if goruntime.GOOS != "windows" { + usernsOpts := config.GetLinux().GetSecurityContext().GetNamespaceOptions().GetUsernsOptions() + if usernsOpts != nil && usernsOpts.GetMode() == runtime.NamespaceMode_POD { + userNsEnabled = true + } + } + // Setup the network namespace if host networking wasn't requested. - if !hostNetwork(config) { + if !hostNetwork(config) && !userNsEnabled { + // XXX: We do c&p of this code later for the podNetwork && userNsEnabled case too. + // We can't move this to a function, as the defer calls need to be executed if other + // errors are returned in this function. So, we would need more refactors to move + // this code to a function and the idea was to not change the current code for + // !userNsEnabled case, therefore doing it would defeat the purpose. + // + // The difference between the cases is the use of netns.NewNetNS() vs + // netns.NewNetNSFromPID(). + // + // To simplify this, in the future, we should just remove this case (podNetwork && + // !userNsEnabled) and just keep the other case (podNetwork && userNsEnabled). netStart := time.Now() // If it is not in host network namespace then create a namespace and set the sandbox // handle. NetNSPath in sandbox metadata and NetNS is non empty only for non host network @@ -244,6 +264,88 @@ func (c *criService) RunPodSandbox(ctx context.Context, r *runtime.RunPodSandbox return nil, fmt.Errorf("failed to start sandbox %q: %w", id, err) } + if !hostNetwork(config) && userNsEnabled { + // If userns is enabled, then the netns was created by the OCI runtime + // on controller.Start(). The OCI runtime needs to create the netns + // because, if userns is in use, the netns needs to be owned by the + // userns. So, let the OCI runtime just handle this for us. + // If the netns is not owned by the userns several problems will happen. + // For instance, the container will lack permission (even if + // capabilities are present) to modify the netns or, even worse, the OCI + // runtime will fail to mount sysfs: + // https://github.com/torvalds/linux/commit/7dc5dbc879bd0779924b5132a48b731a0bc04a1e#diff-4839664cd0c8eab716e064323c7cd71fR1164 + // + // Note we do this after controller.Start(), as before that we + // can't get the PID for the sandbox that we need for the netns. + // Doing a controller.Status() call before that fails (can't + // find the sandbox) so we can't get the PID. + netStart := time.Now() + + // If it is not in host network namespace then create a namespace and set the sandbox + // handle. NetNSPath in sandbox metadata and NetNS is non empty only for non host network + // namespaces. If the pod is in host network namespace then both are empty and should not + // be used. + var netnsMountDir = "/var/run/netns" + if c.config.NetNSMountsUnderStateDir { + netnsMountDir = filepath.Join(c.config.StateDir, "netns") + } + + sandbox.NetNS, err = netns.NewNetNSFromPID(netnsMountDir, ctrl.Pid) + if err != nil { + return nil, fmt.Errorf("failed to create network namespace for sandbox %q: %w", id, err) + } + + // Update network namespace in the store, which is used to generate the container's spec + sandbox.NetNSPath = sandbox.NetNS.GetPath() + defer func() { + // Remove the network namespace only if all the resource cleanup is done + if retErr != nil && cleanupErr == nil { + if cleanupErr = sandbox.NetNS.Remove(); cleanupErr != nil { + log.G(ctx).WithError(cleanupErr).Errorf("Failed to remove network namespace %s for sandbox %q", sandbox.NetNSPath, id) + return + } + sandbox.NetNSPath = "" + } + }() + + if err := sandboxInfo.AddExtension(podsandbox.MetadataKey, &sandbox.Metadata); err != nil { + return nil, fmt.Errorf("unable to save sandbox %q to store: %w", id, err) + } + // Save sandbox metadata to store + if sandboxInfo, err = c.client.SandboxStore().Update(ctx, sandboxInfo, "extensions"); err != nil { + return nil, fmt.Errorf("unable to update extensions for sandbox %q: %w", id, err) + } + + // Define this defer to teardownPodNetwork prior to the setupPodNetwork function call. + // This is because in setupPodNetwork the resource is allocated even if it returns error, unlike other resource + // creation functions. + defer func() { + // Remove the network namespace only if all the resource cleanup is done. + if retErr != nil && cleanupErr == nil { + deferCtx, deferCancel := util.DeferContext() + defer deferCancel() + // Teardown network if an error is returned. + if cleanupErr = c.teardownPodNetwork(deferCtx, sandbox); cleanupErr != nil { + log.G(ctx).WithError(cleanupErr).Errorf("Failed to destroy network for sandbox %q", id) + } + + } + }() + + // Setup network for sandbox. + // Certain VM based solutions like clear containers (Issue containerd/cri-containerd#524) + // rely on the assumption that CRI shim will not be querying the network namespace to check the + // network states such as IP. + // In future runtime implementation should avoid relying on CRI shim implementation details. + // In this case however caching the IP will add a subtle performance enhancement by avoiding + // calls to network namespace of the pod to query the IP of the veth interface on every + // SandboxStatus request. + if err := c.setupPodNetwork(ctx, &sandbox); err != nil { + return nil, fmt.Errorf("failed to setup network for sandbox %q: %w", id, err) + } + sandboxCreateNetworkTimer.UpdateSince(netStart) + } + // TODO: get rid of this. sandbox object should no longer have Container field. if ociRuntime.SandboxMode == string(criconfig.ModePodSandbox) { container, err := c.client.LoadContainer(ctx, id)