diff --git a/pkg/cri/server/sandbox_run.go b/pkg/cri/server/sandbox_run.go index 4c1365f47..b7ed6c150 100644 --- a/pkg/cri/server/sandbox_run.go +++ b/pkg/cri/server/sandbox_run.go @@ -23,6 +23,7 @@ import ( "fmt" "math" "path/filepath" + goruntime "runtime" "strings" "time" @@ -244,8 +245,27 @@ func (c *criService) RunPodSandbox(ctx context.Context, r *runtime.RunPodSandbox return nil, fmt.Errorf("failed to get sandbox container info: %w", err) } + userNsEnabled := false + if goruntime.GOOS != "windows" { + usernsOpts := config.GetLinux().GetSecurityContext().GetNamespaceOptions().GetUsernsOptions() + if usernsOpts != nil && usernsOpts.GetMode() == runtime.NamespaceMode_POD { + userNsEnabled = true + } + } + // Setup the network namespace if host networking wasn't requested. - if !hostNetwork(config) { + if !hostNetwork(config) && !userNsEnabled { + // XXX: We do c&p of this code later for the podNetwork && userNsEnabled case too. + // We can't move this to a function, as the defer calls need to be executed if other + // errors are returned in this function. So, we would need more refactors to move + // this code to a function and the idea was to not change the current code for + // !userNsEnabled case, therefore doing it would defeat the purpose. + // + // The difference between the cases is the use of netns.NewNetNS() vs + // netns.NewNetNSFromPID() and we verify the task is still running in the other case. + // + // To simplify this, in the future, we should just remove this case (podNetwork && + // !userNsEnabled) and just keep the other case (podNetwork && userNsEnabled). netStart := time.Now() // If it is not in host network namespace then create a namespace and set the sandbox @@ -353,6 +373,88 @@ func (c *criService) RunPodSandbox(ctx context.Context, r *runtime.RunPodSandbox return nil, fmt.Errorf("failed to wait for sandbox container task: %w", err) } + if !hostNetwork(config) && userNsEnabled { + // If userns is enabled, then the netns was created by the OCI runtime + // when creating "task". The OCI runtime needs to create the netns + // because, if userns is in use, the netns needs to be owned by the + // userns. So, let the OCI runtime just handle this for us. + // If the netns is not owned by the userns several problems will happen. + // For instance, the container will lack permission (even if + // capabilities are present) to modify the netns or, even worse, the OCI + // runtime will fail to mount sysfs: + // https://github.com/torvalds/linux/commit/7dc5dbc879bd0779924b5132a48b731a0bc04a1e#diff-4839664cd0c8eab716e064323c7cd71fR1164 + netStart := time.Now() + + // If it is not in host network namespace then create a namespace and set the sandbox + // handle. NetNSPath in sandbox metadata and NetNS is non empty only for non host network + // namespaces. If the pod is in host network namespace then both are empty and should not + // be used. + var netnsMountDir = "/var/run/netns" + if c.config.NetNSMountsUnderStateDir { + netnsMountDir = filepath.Join(c.config.StateDir, "netns") + } + sandbox.NetNS, err = netns.NewNetNSFromPID(netnsMountDir, task.Pid()) + if err != nil { + return nil, fmt.Errorf("failed to create network namespace for sandbox %q: %w", id, err) + } + + // Verify task is still in created state. + if st, err := task.Status(ctx); err != nil || st.Status != containerd.Created { + return nil, fmt.Errorf("failed to create pod sandbox %q: err is %v - status is %q and is expected %q", id, err, st.Status, containerd.Created) + } + sandbox.NetNSPath = sandbox.NetNS.GetPath() + + defer func() { + // Remove the network namespace only if all the resource cleanup is done. + if retErr != nil && cleanupErr == nil { + if cleanupErr = sandbox.NetNS.Remove(); cleanupErr != nil { + log.G(ctx).WithError(cleanupErr).Errorf("Failed to remove network namespace %s for sandbox %q", sandbox.NetNSPath, id) + return + } + sandbox.NetNSPath = "" + } + }() + + // Update network namespace in the container's spec + c.updateNetNamespacePath(spec, sandbox.NetNSPath) + + if err := container.Update(ctx, + // Update spec of the container + containerd.UpdateContainerOpts(containerd.WithSpec(spec)), + // Update sandbox metadata to include NetNS info + containerd.UpdateContainerOpts(containerd.WithContainerExtension(sandboxMetadataExtension, &sandbox.Metadata))); err != nil { + return nil, fmt.Errorf("failed to update the network namespace for the sandbox container %q: %w", id, err) + } + + // Define this defer to teardownPodNetwork prior to the setupPodNetwork function call. + // This is because in setupPodNetwork the resource is allocated even if it returns error, unlike other resource creation functions. + defer func() { + // Teardown the network only if all the resource cleanup is done. + if retErr != nil && cleanupErr == nil { + deferCtx, deferCancel := ctrdutil.DeferContext() + defer deferCancel() + // Teardown network if an error is returned. + if cleanupErr = c.teardownPodNetwork(deferCtx, sandbox); cleanupErr != nil { + log.G(ctx).WithError(cleanupErr).Errorf("Failed to destroy network for sandbox %q", id) + } + } + }() + + // Setup network for sandbox. + // Certain VM based solutions like clear containers (Issue containerd/cri-containerd#524) + // rely on the assumption that CRI shim will not be querying the network namespace to check the + // network states such as IP. + // In future runtime implementation should avoid relying on CRI shim implementation details. + // In this case however caching the IP will add a subtle performance enhancement by avoiding + // calls to network namespace of the pod to query the IP of the veth interface on every + // SandboxStatus request. + if err := c.setupPodNetwork(ctx, &sandbox); err != nil { + return nil, fmt.Errorf("failed to setup network for sandbox %q: %w", id, err) + } + + sandboxCreateNetworkTimer.UpdateSince(netStart) + } + if c.nri.isEnabled() { err = c.nri.runPodSandbox(ctx, &sandbox) if err != nil { diff --git a/pkg/netns/netns_linux.go b/pkg/netns/netns_linux.go index 03f68a568..6e1a61f72 100644 --- a/pkg/netns/netns_linux.go +++ b/pkg/netns/netns_linux.go @@ -50,7 +50,9 @@ import ( // newNS creates a new persistent (bind-mounted) network namespace and returns the // path to the network namespace. -func newNS(baseDir string) (nsPath string, err error) { +// If pid is not 0, returns the netns from that pid persistently mounted. Otherwise, +// a new netns is created. +func newNS(baseDir string, pid uint32) (nsPath string, err error) { b := make([]byte, 16) _, err = rand.Read(b) @@ -81,6 +83,16 @@ func newNS(baseDir string) (nsPath string, err error) { } }() + if pid != 0 { + procNsPath := getNetNSPathFromPID(pid) + // bind mount the netns onto the mount point. This causes the namespace + // to persist, even when there are no threads in the ns. + if err = unix.Mount(procNsPath, nsPath, "none", unix.MS_BIND, ""); err != nil { + return "", fmt.Errorf("failed to bind mount ns src: %v at %s: %w", procNsPath, nsPath, err) + } + return nsPath, nil + } + var wg sync.WaitGroup wg.Add(1) @@ -155,6 +167,10 @@ func getCurrentThreadNetNSPath() string { return fmt.Sprintf("/proc/%d/task/%d/ns/net", os.Getpid(), unix.Gettid()) } +func getNetNSPathFromPID(pid uint32) string { + return fmt.Sprintf("/proc/%d/ns/net", pid) +} + // NetNS holds network namespace. type NetNS struct { path string @@ -162,7 +178,12 @@ type NetNS struct { // NewNetNS creates a network namespace. func NewNetNS(baseDir string) (*NetNS, error) { - path, err := newNS(baseDir) + return NewNetNSFromPID(baseDir, 0) +} + +// NewNetNS returns the netns from pid or a new netns if pid is 0. +func NewNetNSFromPID(baseDir string, pid uint32) (*NetNS, error) { + path, err := newNS(baseDir, pid) if err != nil { return nil, fmt.Errorf("failed to setup netns: %w", err) } diff --git a/pkg/netns/netns_other.go b/pkg/netns/netns_other.go index ec8124ceb..3cd60ef3f 100644 --- a/pkg/netns/netns_other.go +++ b/pkg/netns/netns_other.go @@ -35,6 +35,11 @@ func NewNetNS(baseDir string) (*NetNS, error) { return nil, errNotImplementedOnUnix } +// NewNetNS returns the netns from pid or a new netns if pid is 0. +func NewNetNSFromPID(baseDir string, pid uint32) (*NetNS, error) { + return nil, errNotImplementedOnUnix +} + // LoadNetNS loads existing network namespace. func LoadNetNS(path string) *NetNS { return &NetNS{path: path} diff --git a/pkg/netns/netns_windows.go b/pkg/netns/netns_windows.go index de02094b8..2d26d6f71 100644 --- a/pkg/netns/netns_windows.go +++ b/pkg/netns/netns_windows.go @@ -16,14 +16,20 @@ package netns -import "github.com/Microsoft/hcsshim/hcn" +import ( + "errors" + + "github.com/Microsoft/hcsshim/hcn" +) + +var errNotImplementedOnWindows = errors.New("not implemented on windows") // NetNS holds network namespace for sandbox type NetNS struct { path string } -// NewNetNS creates a network namespace for the sandbox +// NewNetNS creates a network namespace for the sandbox. func NewNetNS(baseDir string) (*NetNS, error) { temp := hcn.HostComputeNamespace{} hcnNamespace, err := temp.Create() @@ -34,6 +40,11 @@ func NewNetNS(baseDir string) (*NetNS, error) { return &NetNS{path: hcnNamespace.Id}, nil } +// NewNetNS returns the netns from pid or a new netns if pid is 0. +func NewNetNSFromPID(baseDir string, pid uint32) (*NetNS, error) { + return nil, errNotImplementedOnWindows +} + // LoadNetNS loads existing network namespace. func LoadNetNS(path string) *NetNS { return &NetNS{path: path}