cri/sbserver: Let OCI runtime create netns when userns is used
This commit just ports 36f520dc04
("Let OCI runtime create netns when
userns is used") to sbserver.
The CNI network setup is done after OCI start, as it didn't seem simple
to get the sandbox PID we need for the netns otherwise.
Signed-off-by: Rodrigo Campos <rodrigoca@microsoft.com>
This commit is contained in:
parent
73c75e2c73
commit
c99cb95f07
@ -23,6 +23,7 @@ import (
|
||||
"fmt"
|
||||
"math"
|
||||
"path/filepath"
|
||||
goruntime "runtime"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
@ -143,8 +144,27 @@ func (c *criService) RunPodSandbox(ctx context.Context, r *runtime.RunPodSandbox
|
||||
}
|
||||
}()
|
||||
|
||||
userNsEnabled := false
|
||||
if goruntime.GOOS != "windows" {
|
||||
usernsOpts := config.GetLinux().GetSecurityContext().GetNamespaceOptions().GetUsernsOptions()
|
||||
if usernsOpts != nil && usernsOpts.GetMode() == runtime.NamespaceMode_POD {
|
||||
userNsEnabled = true
|
||||
}
|
||||
}
|
||||
|
||||
// Setup the network namespace if host networking wasn't requested.
|
||||
if !hostNetwork(config) {
|
||||
if !hostNetwork(config) && !userNsEnabled {
|
||||
// XXX: We do c&p of this code later for the podNetwork && userNsEnabled case too.
|
||||
// We can't move this to a function, as the defer calls need to be executed if other
|
||||
// errors are returned in this function. So, we would need more refactors to move
|
||||
// this code to a function and the idea was to not change the current code for
|
||||
// !userNsEnabled case, therefore doing it would defeat the purpose.
|
||||
//
|
||||
// The difference between the cases is the use of netns.NewNetNS() vs
|
||||
// netns.NewNetNSFromPID().
|
||||
//
|
||||
// To simplify this, in the future, we should just remove this case (podNetwork &&
|
||||
// !userNsEnabled) and just keep the other case (podNetwork && userNsEnabled).
|
||||
netStart := time.Now()
|
||||
// If it is not in host network namespace then create a namespace and set the sandbox
|
||||
// handle. NetNSPath in sandbox metadata and NetNS is non empty only for non host network
|
||||
@ -244,6 +264,88 @@ func (c *criService) RunPodSandbox(ctx context.Context, r *runtime.RunPodSandbox
|
||||
return nil, fmt.Errorf("failed to start sandbox %q: %w", id, err)
|
||||
}
|
||||
|
||||
if !hostNetwork(config) && userNsEnabled {
|
||||
// If userns is enabled, then the netns was created by the OCI runtime
|
||||
// on controller.Start(). The OCI runtime needs to create the netns
|
||||
// because, if userns is in use, the netns needs to be owned by the
|
||||
// userns. So, let the OCI runtime just handle this for us.
|
||||
// If the netns is not owned by the userns several problems will happen.
|
||||
// For instance, the container will lack permission (even if
|
||||
// capabilities are present) to modify the netns or, even worse, the OCI
|
||||
// runtime will fail to mount sysfs:
|
||||
// https://github.com/torvalds/linux/commit/7dc5dbc879bd0779924b5132a48b731a0bc04a1e#diff-4839664cd0c8eab716e064323c7cd71fR1164
|
||||
//
|
||||
// Note we do this after controller.Start(), as before that we
|
||||
// can't get the PID for the sandbox that we need for the netns.
|
||||
// Doing a controller.Status() call before that fails (can't
|
||||
// find the sandbox) so we can't get the PID.
|
||||
netStart := time.Now()
|
||||
|
||||
// If it is not in host network namespace then create a namespace and set the sandbox
|
||||
// handle. NetNSPath in sandbox metadata and NetNS is non empty only for non host network
|
||||
// namespaces. If the pod is in host network namespace then both are empty and should not
|
||||
// be used.
|
||||
var netnsMountDir = "/var/run/netns"
|
||||
if c.config.NetNSMountsUnderStateDir {
|
||||
netnsMountDir = filepath.Join(c.config.StateDir, "netns")
|
||||
}
|
||||
|
||||
sandbox.NetNS, err = netns.NewNetNSFromPID(netnsMountDir, ctrl.Pid)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to create network namespace for sandbox %q: %w", id, err)
|
||||
}
|
||||
|
||||
// Update network namespace in the store, which is used to generate the container's spec
|
||||
sandbox.NetNSPath = sandbox.NetNS.GetPath()
|
||||
defer func() {
|
||||
// Remove the network namespace only if all the resource cleanup is done
|
||||
if retErr != nil && cleanupErr == nil {
|
||||
if cleanupErr = sandbox.NetNS.Remove(); cleanupErr != nil {
|
||||
log.G(ctx).WithError(cleanupErr).Errorf("Failed to remove network namespace %s for sandbox %q", sandbox.NetNSPath, id)
|
||||
return
|
||||
}
|
||||
sandbox.NetNSPath = ""
|
||||
}
|
||||
}()
|
||||
|
||||
if err := sandboxInfo.AddExtension(podsandbox.MetadataKey, &sandbox.Metadata); err != nil {
|
||||
return nil, fmt.Errorf("unable to save sandbox %q to store: %w", id, err)
|
||||
}
|
||||
// Save sandbox metadata to store
|
||||
if sandboxInfo, err = c.client.SandboxStore().Update(ctx, sandboxInfo, "extensions"); err != nil {
|
||||
return nil, fmt.Errorf("unable to update extensions for sandbox %q: %w", id, err)
|
||||
}
|
||||
|
||||
// Define this defer to teardownPodNetwork prior to the setupPodNetwork function call.
|
||||
// This is because in setupPodNetwork the resource is allocated even if it returns error, unlike other resource
|
||||
// creation functions.
|
||||
defer func() {
|
||||
// Remove the network namespace only if all the resource cleanup is done.
|
||||
if retErr != nil && cleanupErr == nil {
|
||||
deferCtx, deferCancel := util.DeferContext()
|
||||
defer deferCancel()
|
||||
// Teardown network if an error is returned.
|
||||
if cleanupErr = c.teardownPodNetwork(deferCtx, sandbox); cleanupErr != nil {
|
||||
log.G(ctx).WithError(cleanupErr).Errorf("Failed to destroy network for sandbox %q", id)
|
||||
}
|
||||
|
||||
}
|
||||
}()
|
||||
|
||||
// Setup network for sandbox.
|
||||
// Certain VM based solutions like clear containers (Issue containerd/cri-containerd#524)
|
||||
// rely on the assumption that CRI shim will not be querying the network namespace to check the
|
||||
// network states such as IP.
|
||||
// In future runtime implementation should avoid relying on CRI shim implementation details.
|
||||
// In this case however caching the IP will add a subtle performance enhancement by avoiding
|
||||
// calls to network namespace of the pod to query the IP of the veth interface on every
|
||||
// SandboxStatus request.
|
||||
if err := c.setupPodNetwork(ctx, &sandbox); err != nil {
|
||||
return nil, fmt.Errorf("failed to setup network for sandbox %q: %w", id, err)
|
||||
}
|
||||
sandboxCreateNetworkTimer.UpdateSince(netStart)
|
||||
}
|
||||
|
||||
// TODO: get rid of this. sandbox object should no longer have Container field.
|
||||
if ociRuntime.SandboxMode == string(criconfig.ModePodSandbox) {
|
||||
container, err := c.client.LoadContainer(ctx, id)
|
||||
|
Loading…
Reference in New Issue
Block a user